<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<!--
| Generated by Apache Maven Doxia at 2021-06-15
| Rendered using Apache Maven Stylus Skin 1.5
-->
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Apache Hadoop Amazon Web Services support &#x2013; S3 Select</title>
<style type="text/css" media="all">
@import url("../../css/maven-base.css");
@import url("../../css/maven-theme.css");
@import url("../../css/site.css");
</style>
<link rel="stylesheet" href="../../css/print.css" type="text/css" media="print" />
<meta name="Date-Revision-yyyymmdd" content="20210615" />
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
</head>
<body class="composite">
<div id="banner">
<a href="http://hadoop.apache.org/" id="bannerLeft">
<img src="http://hadoop.apache.org/images/hadoop-logo.jpg" alt="" />
</a>
<a href="http://www.apache.org/" id="bannerRight">
<img src="http://www.apache.org/images/asf_logo_wide.png" alt="" />
</a>
<div class="clear">
<hr/>
</div>
</div>
<div id="breadcrumbs">
<div class="xleft">
<a href="http://www.apache.org/" class="externalLink">Apache</a>
&gt;
<a href="http://hadoop.apache.org/" class="externalLink">Hadoop</a>
&gt;
<a href="../../index.html">Apache Hadoop Amazon Web Services support</a>
&gt;
S3 Select
</div>
<div class="xright"> <a href="http://wiki.apache.org/hadoop" class="externalLink">Wiki</a>
|
<a href="https://gitbox.apache.org/repos/asf/hadoop.git" class="externalLink">git</a>
&nbsp;| Last Published: 2021-06-15
&nbsp;| Version: 3.3.1
</div>
<div class="clear">
<hr/>
</div>
</div>
<div id="leftColumn">
<div id="navcolumn">
<h5>General</h5>
<ul>
<li class="none">
<a href="../../../index.html">Overview</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/SingleCluster.html">Single Node Setup</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/ClusterSetup.html">Cluster Setup</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/CommandsManual.html">Commands Reference</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/FileSystemShell.html">FileSystem Shell</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/Compatibility.html">Compatibility Specification</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/DownstreamDev.html">Downstream Developer's Guide</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/AdminCompatibilityGuide.html">Admin Compatibility Guide</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/InterfaceClassification.html">Interface Classification</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/filesystem/index.html">FileSystem Specification</a>
</li>
</ul>
<h5>Common</h5>
<ul>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/CLIMiniCluster.html">CLI Mini Cluster</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/FairCallQueue.html">Fair Call Queue</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/NativeLibraries.html">Native Libraries</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/Superusers.html">Proxy User</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/RackAwareness.html">Rack Awareness</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/SecureMode.html">Secure Mode</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/ServiceLevelAuth.html">Service Level Authorization</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/HttpAuthentication.html">HTTP Authentication</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/CredentialProviderAPI.html">Credential Provider API</a>
</li>
<li class="none">
<a href="../../../hadoop-kms/index.html">Hadoop KMS</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/Tracing.html">Tracing</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/UnixShellGuide.html">Unix Shell Guide</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/registry/index.html">Registry</a>
</li>
</ul>
<h5>HDFS</h5>
<ul>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HdfsDesign.html">Architecture</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HdfsUserGuide.html">User Guide</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HDFSCommands.html">Commands Reference</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HDFSHighAvailabilityWithQJM.html">NameNode HA With QJM</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HDFSHighAvailabilityWithNFS.html">NameNode HA With NFS</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/ObserverNameNode.html">Observer NameNode</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/Federation.html">Federation</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/ViewFs.html">ViewFs</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/ViewFsOverloadScheme.html">ViewFsOverloadScheme</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HdfsSnapshots.html">Snapshots</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HdfsEditsViewer.html">Edits Viewer</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HdfsImageViewer.html">Image Viewer</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HdfsPermissionsGuide.html">Permissions and HDFS</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HdfsQuotaAdminGuide.html">Quotas and HDFS</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/LibHdfs.html">libhdfs (C API)</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/WebHDFS.html">WebHDFS (REST API)</a>
</li>
<li class="none">
<a href="../../../hadoop-hdfs-httpfs/index.html">HttpFS</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/ShortCircuitLocalReads.html">Short Circuit Local Reads</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/CentralizedCacheManagement.html">Centralized Cache Management</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HdfsNfsGateway.html">NFS Gateway</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HdfsRollingUpgrade.html">Rolling Upgrade</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/ExtendedAttributes.html">Extended Attributes</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/TransparentEncryption.html">Transparent Encryption</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HdfsMultihoming.html">Multihoming</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/ArchivalStorage.html">Storage Policies</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/MemoryStorage.html">Memory Storage Support</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/SLGUserGuide.html">Synthetic Load Generator</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HDFSErasureCoding.html">Erasure Coding</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HDFSDiskbalancer.html">Disk Balancer</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HdfsUpgradeDomain.html">Upgrade Domain</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HdfsDataNodeAdminGuide.html">DataNode Admin</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs-rbf/HDFSRouterFederation.html">Router Federation</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HdfsProvidedStorage.html">Provided Storage</a>
</li>
</ul>
<h5>MapReduce</h5>
<ul>
<li class="none">
<a href="../../../hadoop-mapreduce-client/hadoop-mapreduce-client-core/MapReduceTutorial.html">Tutorial</a>
</li>
<li class="none">
<a href="../../../hadoop-mapreduce-client/hadoop-mapreduce-client-core/MapredCommands.html">Commands Reference</a>
</li>
<li class="none">
<a href="../../../hadoop-mapreduce-client/hadoop-mapreduce-client-core/MapReduce_Compatibility_Hadoop1_Hadoop2.html">Compatibility with 1.x</a>
</li>
<li class="none">
<a href="../../../hadoop-mapreduce-client/hadoop-mapreduce-client-core/EncryptedShuffle.html">Encrypted Shuffle</a>
</li>
<li class="none">
<a href="../../../hadoop-mapreduce-client/hadoop-mapreduce-client-core/PluggableShuffleAndPluggableSort.html">Pluggable Shuffle/Sort</a>
</li>
<li class="none">
<a href="../../../hadoop-mapreduce-client/hadoop-mapreduce-client-core/DistributedCacheDeploy.html">Distributed Cache Deploy</a>
</li>
<li class="none">
<a href="../../../hadoop-mapreduce-client/hadoop-mapreduce-client-core/SharedCacheSupport.html">Support for YARN Shared Cache</a>
</li>
</ul>
<h5>MapReduce REST APIs</h5>
<ul>
<li class="none">
<a href="../../../hadoop-mapreduce-client/hadoop-mapreduce-client-core/MapredAppMasterRest.html">MR Application Master</a>
</li>
<li class="none">
<a href="../../../hadoop-mapreduce-client/hadoop-mapreduce-client-hs/HistoryServerRest.html">MR History Server</a>
</li>
</ul>
<h5>YARN</h5>
<ul>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/YARN.html">Architecture</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/YarnCommands.html">Commands Reference</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/CapacityScheduler.html">Capacity Scheduler</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/FairScheduler.html">Fair Scheduler</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/ResourceManagerRestart.html">ResourceManager Restart</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/ResourceManagerHA.html">ResourceManager HA</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/ResourceModel.html">Resource Model</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/NodeLabel.html">Node Labels</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/NodeAttributes.html">Node Attributes</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/WebApplicationProxy.html">Web Application Proxy</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/TimelineServer.html">Timeline Server</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/TimelineServiceV2.html">Timeline Service V.2</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/WritingYarnApplications.html">Writing YARN Applications</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/YarnApplicationSecurity.html">YARN Application Security</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/NodeManager.html">NodeManager</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/DockerContainers.html">Running Applications in Docker Containers</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/RuncContainers.html">Running Applications in runC Containers</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/NodeManagerCgroups.html">Using CGroups</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/SecureContainer.html">Secure Containers</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/ReservationSystem.html">Reservation System</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/GracefulDecommission.html">Graceful Decommission</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/OpportunisticContainers.html">Opportunistic Containers</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/Federation.html">YARN Federation</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/SharedCache.html">Shared Cache</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/UsingGpus.html">Using GPU</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/UsingFPGA.html">Using FPGA</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/PlacementConstraints.html">Placement Constraints</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/YarnUI2.html">YARN UI2</a>
</li>
</ul>
<h5>YARN REST APIs</h5>
<ul>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/WebServicesIntro.html">Introduction</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/ResourceManagerRest.html">Resource Manager</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/NodeManagerRest.html">Node Manager</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/TimelineServer.html#Timeline_Server_REST_API_v1">Timeline Server</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/TimelineServiceV2.html#Timeline_Service_v.2_REST_API">Timeline Service V.2</a>
</li>
</ul>
<h5>YARN Service</h5>
<ul>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/yarn-service/Overview.html">Overview</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/yarn-service/QuickStart.html">QuickStart</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/yarn-service/Concepts.html">Concepts</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/yarn-service/YarnServiceAPI.html">Yarn Service API</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/yarn-service/ServiceDiscovery.html">Service Discovery</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/yarn-service/SystemServices.html">System Services</a>
</li>
</ul>
<h5>Hadoop Compatible File Systems</h5>
<ul>
<li class="none">
<a href="../../../hadoop-aliyun/tools/hadoop-aliyun/index.html">Aliyun OSS</a>
</li>
<li class="none">
<a href="../../../hadoop-aws/tools/hadoop-aws/index.html">Amazon S3</a>
</li>
<li class="none">
<a href="../../../hadoop-azure/index.html">Azure Blob Storage</a>
</li>
<li class="none">
<a href="../../../hadoop-azure-datalake/index.html">Azure Data Lake Storage</a>
</li>
<li class="none">
<a href="../../../hadoop-openstack/index.html">OpenStack Swift</a>
</li>
<li class="none">
<a href="../../../hadoop-cos/cloud-storage/index.html">Tencent COS</a>
</li>
</ul>
<h5>Auth</h5>
<ul>
<li class="none">
<a href="../../../hadoop-auth/index.html">Overview</a>
</li>
<li class="none">
<a href="../../../hadoop-auth/Examples.html">Examples</a>
</li>
<li class="none">
<a href="../../../hadoop-auth/Configuration.html">Configuration</a>
</li>
<li class="none">
<a href="../../../hadoop-auth/BuildingIt.html">Building</a>
</li>
</ul>
<h5>Tools</h5>
<ul>
<li class="none">
<a href="../../../hadoop-streaming/HadoopStreaming.html">Hadoop Streaming</a>
</li>
<li class="none">
<a href="../../../hadoop-archives/HadoopArchives.html">Hadoop Archives</a>
</li>
<li class="none">
<a href="../../../hadoop-archive-logs/HadoopArchiveLogs.html">Hadoop Archive Logs</a>
</li>
<li class="none">
<a href="../../../hadoop-distcp/DistCp.html">DistCp</a>
</li>
<li class="none">
<a href="../../../hadoop-gridmix/GridMix.html">GridMix</a>
</li>
<li class="none">
<a href="../../../hadoop-rumen/Rumen.html">Rumen</a>
</li>
<li class="none">
<a href="../../../hadoop-resourceestimator/ResourceEstimator.html">Resource Estimator Service</a>
</li>
<li class="none">
<a href="../../../hadoop-sls/SchedulerLoadSimulator.html">Scheduler Load Simulator</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/Benchmarking.html">Hadoop Benchmarking</a>
</li>
<li class="none">
<a href="../../../hadoop-dynamometer/Dynamometer.html">Dynamometer</a>
</li>
</ul>
<h5>Reference</h5>
<ul>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/release/">Changelog and Release Notes</a>
</li>
<li class="none">
<a href="../../../api/index.html">Java API docs</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/UnixShellAPI.html">Unix Shell API</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/Metrics.html">Metrics</a>
</li>
</ul>
<h5>Configuration</h5>
<ul>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/core-default.xml">core-default.xml</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/hdfs-default.xml">hdfs-default.xml</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs-rbf/hdfs-rbf-default.xml">hdfs-rbf-default.xml</a>
</li>
<li class="none">
<a href="../../../hadoop-mapreduce-client/hadoop-mapreduce-client-core/mapred-default.xml">mapred-default.xml</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-common/yarn-default.xml">yarn-default.xml</a>
</li>
<li class="none">
<a href="../../../hadoop-kms/kms-default.html">kms-default.xml</a>
</li>
<li class="none">
<a href="../../../hadoop-hdfs-httpfs/httpfs-default.html">httpfs-default.xml</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/DeprecatedProperties.html">Deprecated Properties</a>
</li>
</ul>
<a href="http://maven.apache.org/" title="Built by Maven" class="poweredBy">
<img alt="Built by Maven" src="../../images/logos/maven-feather.png"/>
</a>
</div>
</div>
<div id="bodyColumn">
<div id="contentBox">
<!---
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<h1>S3 Select</h1>
<p><b>Experimental Feature</b></p>
<ul>
<li><a href="#Currently_Implemented_Features">Currently Implemented Features</a></li>
<li><a href="#Currently_Unsupported">Currently Unsupported</a></li>
<li><a href="#Enabling.2FDisabling_S3_Select">Enabling/Disabling S3 Select</a></li>
<li><a href="#Making_S3_Select_calls_through_the_Hadoop_APIs">Making S3 Select calls through the Hadoop APIs</a>
<ul>
<li><a href="#seek.28.29_behavior_on_SelectInputStream">seek() behavior on SelectInputStream</a></li></ul></li>
<li><a href="#Use_with_third-party_S3-compatible_object_stores.">Use with third-party S3-compatible object stores.</a></li>
<li><a href="#Selecting_data_from_the_command_line:_hadoop_s3guard_select">Selecting data from the command line: hadoop s3guard select</a>
<ul>
<li><a href="#Example_1">Example 1</a></li>
<li><a href="#Example_2">Example 2</a></li></ul></li>
<li><a href="#Use_in_MR.2FAnalytics_queries:_Work_in_Progress">Use in MR/Analytics queries: Work in Progress</a>
<ul>
<li><a href="#Querying_Compressed_objects">Querying Compressed objects</a></li></ul></li>
<li><a href="#S3_Select_configuration_options.">S3 Select configuration options.</a></li>
<li><a href="#Security_and_Privacy">Security and Privacy</a>
<ul>
<li><a href="#Links">Links</a></li>
<li><a href="#SQL_Syntax">SQL Syntax</a></li>
<li><a href="#CSV_formats">CSV formats</a></li>
<li><a href="#Consistency.2C_Concurrency_and_Error_handling">Consistency, Concurrency and Error handling</a></li></ul></li>
<li><a href="#Performance">Performance</a></li>
<li><a href="#Troubleshooting">Troubleshooting</a>
<ul>
<li><a href="#a.E2.80.9Cmid-query.E2.80.9D_failures_on_large_datasets">&#x201c;mid-query&#x201d; failures on large datasets</a></li>
<li><a href="#External_Resources_on_for_troubleshooting">External Resources on for troubleshooting</a></li>
<li><a href="#IOException:_.E2.80.9Cnot_a_gzip_file.E2.80.9D">IOException: &#x201c;not a gzip file&#x201d;</a></li>
<li><a href="#AWSBadRequestException_InvalidColumnIndex">AWSBadRequestException InvalidColumnIndex</a></li>
<li><a href="#AWSBadRequestException_ParseInvalidPathComponent">AWSBadRequestException ParseInvalidPathComponent</a></li>
<li><a href="#AWSBadRequestException__ParseExpectedTypeName">AWSBadRequestException ParseExpectedTypeName</a></li>
<li><a href="#ParseUnexpectedToken">ParseUnexpectedToken</a></li>
<li><a href="#ParseUnexpectedOperator">ParseUnexpectedOperator</a></li>
<li><a href="#MissingHeaders">MissingHeaders</a></li>
<li><a href="#Method_not_allowed">Method not allowed</a></li>
<li><a href="#AWSBadRequestException_InvalidTextEncoding">AWSBadRequestException InvalidTextEncoding</a></li>
<li><a href="#AWSBadRequestException__InvalidCompressionFormat_.E2.80.9CGZIP_is_not_applicable_to_the_queried_object.E2.80.9D">AWSBadRequestException InvalidCompressionFormat &#x201c;GZIP is not applicable to the queried object&#x201d;</a></li>
<li><a href="#PathIOException:_.E2.80.9Cseek.28.29_not_supported.E2.80.9D">PathIOException: &#x201c;seek() not supported&#x201d;</a></li>
<li><a href="#IllegalArgumentException:_.22Unknown_mandatory_key_.E2.80.9Cfs.s3a.select.sql.E2.80.9D">IllegalArgumentException: &quot;Unknown mandatory key &#x201c;fs.s3a.select.sql&#x201d;</a></li>
<li><a href="#IllegalArgumentException:_.E2.80.9CUnknown_mandatory_key_in_non-select_file_I.2FO.E2.80.9D">IllegalArgumentException: &#x201c;Unknown mandatory key in non-select file I/O&#x201d;</a></li>
<li><a href="#PathIOException_:_.E2.80.9Cseek.28.29_backwards_from__not_supported.E2.80.9D">PathIOException : &#x201c;seek() backwards from not supported&#x201d;</a></li>
<li><a href="#InvalidTableAlias">InvalidTableAlias</a></li>
<li><a href="#AWSBadRequestException_.E2.80.9CAttempt_to_convert_from_one_data_type_to_another_failed:_cast_from_STRING_to_TIMESTAMP..E2.80.9D">AWSBadRequestException &#x201c;Attempt to convert from one data type to another failed: cast from STRING to TIMESTAMP.&#x201d;</a></li></ul></li></ul>
<p>S3 Select is a feature for Amazon S3 introduced in April 2018. It allows for SQL-like SELECT expressions to be applied to files in some structured formats, including CSV and JSON.</p>
<p>By performing the SELECT operation in the S3 storage infrastructure, the bandwidth requirements between S3 and the hosts making the request can be reduced. Along with latency, this bandwidth is often the limiting factor in processing data from S3, especially with larger CSV and JSON datasets.</p>
<p>Apache Hadoop&#x2019;s S3A Client has experimental support for this feature, with the following warnings:</p>
<ul>
<li>The filtering is being done in S3 itself. If the source files cannot be parsed, that&#x2019;s not something which can be fixed in Hadoop or layers above.</li>
<li>It is not currently supported by third-party S3 implementations, and is unlikely to be supported in future (the bandwidth constraints are lower, so the value is less compelling).</li>
<li>Performance <i>appears</i> best when the selection restricts both the rows selected and the columns projected: the less data returned, the faster the response.</li>
<li>High-level support in tools such as Apache Hive and Spark will also be evolving. Nobody has ever written CSV connectors with predicate pushdown before.</li>
<li>The standard <tt>FileInputFormat</tt> readers of text (<tt>LineRecordReader</tt> etc) fail when the amount of data returned is less than they expect. For this reason, S3 Select <i>MUST NOT BE USED IN PRODUCTION MAPREDUCE JOBS</i>.</li>
</ul>
<div class="section">
<h2><a name="Currently_Implemented_Features"></a>Currently Implemented Features</h2>
<ul>
<li>Ability to issue select queries on the command line.</li>
<li>Proof of concept support in MapReduce queries.</li>
<li>CSV input with/without compression.</li>
<li>CSV output.</li>
</ul></div>
<div class="section">
<h2><a name="Currently_Unsupported"></a>Currently Unsupported</h2>
<ul>
<li>Production-ready integration with the standard FileInputFormat and Record Readers.</li>
<li>Non-CSV output.</li>
<li>JSON source files.</li>
<li>Structured source file formats like Apache Parquet. It&#x2019;s better here to directly use the Apache Spark, Hive, Impala, Flink or similar, which all use the latest ASF-supported libraries.</li>
</ul></div>
<div class="section">
<h2><a name="Enabling.2FDisabling_S3_Select"></a>Enabling/Disabling S3 Select</h2>
<p>S3 Select is enabled by default:</p>
<div>
<div>
<pre class="source">&lt;property&gt;
&lt;name&gt;fs.s3a.select.enabled&lt;/name&gt;
&lt;value&gt;true&lt;/value&gt;
&lt;description&gt;Is S3 Select enabled?&lt;/description&gt;
&lt;/property&gt;
</pre></div></div>
<p>To disable it, set the option <tt>fs.s3a.select.enabled</tt> to <tt>false</tt>.</p>
<p>To probe whether a FileSystem instance supports it, call <tt>StreamCapabilities.hasCapability(&quot;s3a:fs.s3a.select.sql&quot;)</tt>: this returns true on an instance of the S3AFileSystem class if the version of Hadoop supports S3 Select <i>and</i> it is enabled for that specific instance.</p>
<p>If this call returns false, then S3 Select calls will fail.</p>
<p>Rather than cast a filesystem to the <tt>S3AFileSystem</tt> class, cast it to <tt>org.apache.hadoop.fs.StreamCapabilities</tt>, an interface added in Hadoop 2.9. This should result in less brittle code, and there is no need to have the <tt>hadoop-aws</tt> JAR on the classpath at compile time.</p>
<div>
<div>
<pre class="source">/**
* Probe for a filesystem instance supporting S3 Select.
* @param fs filesystem
* @return true if the filesystem supports S3 Select.
*/
public static boolean hasS3SelectCapability(final FileSystem fs) {
return (fs instanceof StreamCapabilities)
&amp;&amp; ((StreamCapabilities)fs).hasCapability(&quot;s3a:fs.s3a.select.sql&quot;);
}
</pre></div></div>
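<p>A small sketch of using this probe defensively, anticipating the <tt>openFile()</tt> API described in the next section; the fallback to a plain read, and the variables <tt>fs</tt>, <tt>path</tt> and <tt>sql</tt>, are assumptions, not part of the connector:</p>
<div>
<div>
<pre class="source">// Sketch: only issue the select if the store advertises the capability.
FSDataInputStream in;
if (hasS3SelectCapability(fs)) {
  in = fs.openFile(path)
      .must(&quot;fs.s3a.select.sql&quot;, sql)
      .build()
      .get();
} else {
  // S3 Select disabled or unsupported: read and filter client-side
  in = fs.open(path);
}
</pre></div></div>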
</div>
<div class="section">
<h2><a name="Making_S3_Select_calls_through_the_Hadoop_APIs"></a>Making S3 Select calls through the Hadoop APIs</h2>
<p>Applications can issue select queries through the Hadoop FileSystem/FileContext APIs via the asynchronous <tt>openFile()</tt> call added in Hadoop 3.3.</p>
<p>Use the <tt>FileSystem.openFile(path)</tt> or <tt>FileContext.openFile(path)</tt> methods to get a builder for the open operation, then set the mandatory S3 Select options through multiple <tt>must()</tt> parameters.</p>
<div>
<div>
<pre class="source">FileSystem.FSDataInputStreamBuilder builder =
filesystem.openFile(&quot;s3a://bucket/path-to-file.csv&quot;)
.must(&quot;fs.s3a.select.sql&quot;,
&quot;SELECT * FROM S3OBJECT s WHERE s.\&quot;odd\&quot; = `TRUE`&quot;)
.must(&quot;fs.s3a.select.input.format&quot;, &quot;CSV&quot;)
.must(&quot;fs.s3a.select.input.compression&quot;, &quot;NONE&quot;)
.must(&quot;fs.s3a.select.input.csv.header&quot;, &quot;use&quot;)
.must(&quot;fs.s3a.select.output.format&quot;, &quot;CSV&quot;)
.must(&quot;fs.s3a.select.output.csv.field.delimiter&quot;, &quot;\t&quot;)
.must(&quot;fs.s3a.select.output.csv.quote.character&quot;, &quot;\&quot;&quot;)
.must(&quot;fs.s3a.select.output.csv.quote.fields&quot;, &quot;asneeded&quot;) ;
CompletableFuture&lt;FSDataInputStream&gt; future = builder.build();
try (FSDataInputStream select = future.get()) {
// process the output
byte[] bytes = new byte[8192];
int actual = select.read(bytes);
}
</pre></div></div>
<p>When the Builder&#x2019;s <tt>build()</tt> call is made, if the FileSystem/FileContext instance does not recognize any of the mandatory options it will fail. The S3A connector does recognize them, and, if S3 Select has not been disabled, will issue the Select query against the object store.</p>
<p>If the S3A connector has S3 Select disabled, it will fail with an <tt>UnsupportedOperationException</tt>.</p>
<p>The <tt>build()</tt> call returns a <tt>CompletableFuture&lt;FSDataInputStream&gt;</tt>. This future retrieves the result of the select call, which is executed asynchronously in the S3A FileSystem instance&#x2019;s executor pool.</p>
<p>Errors in the SQL, missing files, permission failures and the like will surface when the future is evaluated, <i>not in the build call</i>.</p>
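<p>When evaluating the future, the underlying failure arrives wrapped in an <tt>ExecutionException</tt>. A minimal sketch of unwrapping it (the SQL, path and variable names are illustrative assumptions):</p>
<div>
<div>
<pre class="source">// Sketch: evaluate the future and surface the underlying IOException.
CompletableFuture&lt;FSDataInputStream&gt; future =
    filesystem.openFile(new Path(&quot;s3a://bucket/path-to-file.csv&quot;))
        .must(&quot;fs.s3a.select.sql&quot;, &quot;SELECT * FROM S3OBJECT s&quot;)
        .build();
try (FSDataInputStream select = future.get()) {
  // process the selected data here
} catch (ExecutionException e) {
  // SQL errors, missing files and permission failures surface here
  Throwable cause = e.getCause();
  throw (cause instanceof IOException)
      ? (IOException) cause
      : new IOException(cause);
} catch (InterruptedException e) {
  Thread.currentThread().interrupt();
  throw new InterruptedIOException(&quot;interrupted awaiting select result&quot;);
}
</pre></div></div>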
<p>In the returned stream, seeking and positioned reads do not work as usual, because there are no absolute positions in the file to seek to.</p>
<ol style="list-style-type: decimal">
<li>backwards <tt>seek()</tt> calls will raise a <tt>PathIOException</tt>.</li>
<li>Forwards <tt>seek()</tt> calls will succeed, but only by reading and discarding bytes. This will be slow.</li>
<li>All positioned read operations when the offset of the read is not the current position will raise a <tt>PathIOException</tt>.</li>
<li>Positioned read operations when the offset of the read <i>is</i> the current position will succeed, but the position of the stream (as returned by <tt>getPos()</tt>) will be updated. This is not compliant with the filesystem specification.</li>
</ol>
<p>This is all done as a best-effort attempt to support existing code which often uses <tt>seek()</tt> to move forward in a split file after opening, or does a series of positioned read calls.</p>
<div class="section">
<h3><a name="seek.28.29_behavior_on_SelectInputStream"></a>seek() behavior on <tt>SelectInputStream</tt></h3>
<p>The returned stream, of type <tt>org.apache.hadoop.fs.s3a.select.SelectInputStream</tt>, only supports forward <tt>seek()</tt> operations.</p>
<p>A zero-byte seek operation is always valid:</p>
<div>
<div>
<pre class="source">stream.seek(stream.getPos());
</pre></div></div>
<p>A negative seek operation will always fail:</p>
<div>
<div>
<pre class="source">stream.seek(stream.getPos() - offset);
</pre></div></div>
<p>A forward seek operation will work, provided the final position is less than the total length of the stream:</p>
<div>
<div>
<pre class="source">stream.seek(stream.getPos() + offset);
</pre></div></div>
<p>If it is past the end of the file, an <tt>EOFException</tt> is raised.</p>
<p><i>Important</i>: forward seeks are implemented by reading and discarding the contents of the stream. The bigger the forward seek, the more data is thrown away, the longer the operation takes, and, if the data is being paid for over a long-haul S3 connection, the more the seek costs.</p>
<p><tt>seek()</tt> on a <tt>SelectInputStream</tt> should therefore be used with care.</p>
<p>The feature has been implemented to support splittable queries across selected data, where the initial read starts with a <tt>seek()</tt> to the offset. However, for the reasons covered below, a codec should be used to declare the input unsplittable.</p></div></div>
<div class="section">
<h2><a name="Use_with_third-party_S3-compatible_object_stores."></a>Use with third-party S3-compatible object stores.</h2>
<p>Third party object stores do not, at the time of writing, support S3 Select. S3 Select operations against such stores will fail, presumably with a store-specific error code.</p>
<p>To avoid problems, disable S3 Select entirely:</p>
<div>
<div>
<pre class="source">&lt;property&gt;
&lt;name&gt;fs.s3a.select.enabled&lt;/name&gt;
&lt;value&gt;false&lt;/value&gt;
&lt;/property&gt;
</pre></div></div>
<p>This guarantees that the <tt>hasCapability()</tt> check will fail immediately, rather than delaying the failure until an SQL query is attempted.</p></div>
<div class="section">
<h2><a name="Selecting_data_from_the_command_line:_hadoop_s3guard_select"></a>Selecting data from the command line: <tt>hadoop s3guard select</tt></h2>
<p>The <tt>s3guard select</tt> command allows select statements to be issued directly against a path.</p>
<p>Usage:</p>
<div>
<div>
<pre class="source">hadoop s3guard select [OPTIONS] \
[-limit rows] \
[-header (use|none|ignore)] \
[-out file] \
[-compression (gzip|none)] \
[-expected rows]
[-inputformat csv]
[-outputformat csv]
&lt;PATH&gt; &lt;SELECT QUERY&gt;
</pre></div></div>
<p>The output is printed, followed by some summary statistics, unless the <tt>-out</tt> option is used to declare a destination file. In this mode status will be logged to the console, but the output of the query will be saved directly to the output file.</p>
<div class="section">
<h3><a name="Example_1"></a>Example 1</h3>
<p>Read the first 100 rows of the landsat dataset where cloud cover is zero:</p>
<div>
<div>
<pre class="source">hadoop s3guard select -header use -compression gzip -limit 100 \
s3a://landsat-pds/scene_list.gz \
&quot;SELECT * FROM S3OBJECT s WHERE s.cloudCover = '0.0'&quot;
</pre></div></div>
</div>
<div class="section">
<h3><a name="Example_2"></a>Example 2</h3>
<p>Return the <tt>entityId</tt> column for all rows in the dataset where the cloud cover was &#x201c;0.0&#x201d;, and save it to the file <tt>output.csv</tt>:</p>
<div>
<div>
<pre class="source">hadoop s3guard select -header use -out s3a://mybucket/output.csv \
-compression gzip \
s3a://landsat-pds/scene_list.gz \
&quot;SELECT s.entityId from S3OBJECT s WHERE s.cloudCover = '0.0'&quot;
</pre></div></div>
<p>This file will:</p>
<ol style="list-style-type: decimal">
<li>Be UTF-8 encoded.</li>
<li>Have quotes on all columns returned.</li>
<li>Use commas as a separator.</li>
<li>Not have any header.</li>
</ol>
<p>The output can be saved to a file with the <tt>-out</tt> option. Note also that <tt>-D key=value</tt> settings can be used to control the operation, if placed after the <tt>s3guard</tt> command and before <tt>select</tt>:</p>
<div>
<div>
<pre class="source">hadoop s3guard \
-D fs.s3a.select.output.csv.quote.fields=asneeded \
select \
-header use \
-compression gzip \
-limit 500 \
-inputformat csv \
-outputformat csv \
-out s3a://hwdev-steve-new/output.csv \
s3a://landsat-pds/scene_list.gz \
&quot;SELECT s.entityId from S3OBJECT s WHERE s.cloudCover = '0.0'&quot;
</pre></div></div>
</div></div>
<div class="section">
<h2><a name="Use_in_MR.2FAnalytics_queries:_Work_in_Progress"></a>Use in MR/Analytics queries: Work in Progress</h2>
<p>S3 Select support in analytics queries is a work in progress. It does not work reliably with large source files where the work is split up.</p>
<p>As a proof of concept <i>only</i>, S3 Select queries can be made through MapReduce jobs which use any Hadoop <tt>RecordReader</tt> class which uses the new <tt>openFile()</tt> API.</p>
<p>Currently this consists of the following MRv2 readers.</p>
<div>
<div>
<pre class="source">org.apache.hadoop.mapreduce.lib.input.LineRecordReader
org.apache.hadoop.mapreduce.lib.input.FixedLengthRecordReader
</pre></div></div>
<p>And a limited number of the MRv1 record readers:</p>
<div>
<div>
<pre class="source">org.apache.hadoop.mapred.LineRecordReader
</pre></div></div>
<p>All of these readers use the new API and can have their optional/mandatory options set via the <tt>JobConf</tt> used when creating/configuring the reader.</p>
<p>These readers are instantiated within input formats; the following formats therefore support S3 Select.</p>
<div>
<div>
<pre class="source">org.apache.hadoop.mapreduce.lib.input.FixedLengthInputFormat
org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat
org.apache.hadoop.mapreduce.lib.input.NLineInputFormat
org.apache.hadoop.mapreduce.lib.input.TextInputFormat
org.apache.hadoop.mapred.KeyValueTextInputFormat
org.apache.hadoop.mapred.TextInputFormat
org.apache.hadoop.mapred.lib.NLineInputFormat
</pre></div></div>
<p>All <tt>JobConf</tt> options which begin with the prefix <tt>mapreduce.job.input.file.option.</tt> will have that prefix stripped and the remainder used as the name for an option when opening the file.</p>
<p>All <tt>JobConf</tt> options which begin with the prefix <tt>mapreduce.job.input.file.must.</tt> will be converted into mandatory options.</p>
<p>To use an S3 Select call, set the following options:</p>
<div>
<div>
<pre class="source">mapreduce.job.input.file.must.fs.s3a.select.sql = &lt;SQL STATEMENT&gt;
mapreduce.job.input.file.must.fs.s3a.select.input.format = CSV
mapreduce.job.input.file.must.fs.s3a.select.output.format = CSV
</pre></div></div>
<p>Further options may be set to tune the behaviour, for example:</p>
<div>
<div>
<pre class="source">jobConf.set(&quot;mapreduce.job.input.file.must.fs.s3a.select.input.csv.header&quot;, &quot;use&quot;);
</pre></div></div>
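<p>Putting these options together in a job driver might look like the following sketch; the input path and query are placeholders, and the classes are the standard <tt>org.apache.hadoop.mapreduce</tt> ones:</p>
<div>
<div>
<pre class="source">// Sketch: wire the S3 Select options into an MR job using TextInputFormat.
Configuration conf = new Configuration();
// mandatory select options, passed through to openFile() by the reader
conf.set(&quot;mapreduce.job.input.file.must.fs.s3a.select.sql&quot;,
    &quot;SELECT * FROM S3OBJECT s&quot;);
conf.set(&quot;mapreduce.job.input.file.must.fs.s3a.select.input.format&quot;, &quot;CSV&quot;);
conf.set(&quot;mapreduce.job.input.file.must.fs.s3a.select.output.format&quot;, &quot;CSV&quot;);
conf.set(&quot;mapreduce.job.input.file.must.fs.s3a.select.input.csv.header&quot;, &quot;use&quot;);

Job job = Job.getInstance(conf, &quot;s3-select-example&quot;);
job.setInputFormatClass(TextInputFormat.class);
FileInputFormat.addInputPath(job, new Path(&quot;s3a://example-bucket/data.csv&quot;));
</pre></div></div>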
<p><i>Note</i>: how to tell if a reader has migrated to the new <tt>openFile()</tt> builder API:</p>
<p>Set a mandatory option which is not known; if the job does not fail, then an old reader is being used.</p>
<div>
<div>
<pre class="source">jobConf.set(&quot;mapreduce.job.input.file.must.unknown.option&quot;, &quot;anything&quot;);
</pre></div></div>
<div class="section">
<h3><a name="Querying_Compressed_objects"></a>Querying Compressed objects</h3>
<p>S3 Select queries can be made against gzipped source files; the S3A input stream receives the output in text format, rather than as a (re)compressed stream.</p>
<p>To read a gzip file, set <tt>fs.s3a.select.input.compression</tt> to <tt>gzip</tt>.</p>
<div>
<div>
<pre class="source">jobConf.set(&quot;mapreduce.job.input.file.must.fs.s3a.select.input.compression&quot;,
&quot;gzip&quot;);
</pre></div></div>
<p>Most of the Hadoop RecordReader classes automatically choose a decompressor based on the extension of the source file. This causes problems when reading <tt>.gz</tt> files, because S3 Select has already decompressed the data and returns CSV-formatted text.</p>
<p>By default, a query across gzipped files will fail with the error &#x201c;IOException: not a gzip file&#x201d;.</p>
<p>To avoid this problem, declare that the job should switch to the &#x201c;Passthrough Codec&#x201d; for all files with a &#x201c;.gz&#x201d; extension:</p>
<div>
<div>
<pre class="source">jobConf.set(&quot;io.compression.codecs&quot;,
&quot;org.apache.hadoop.io.compress.PassthroughCodec&quot;);
jobConf.set(&quot;io.compress.passthrough.extension&quot;, &quot;.gz&quot;);
</pre></div></div>
<p>Obviously, this breaks normal <tt>.gz</tt> decompression: only set it on S3 Select jobs.</p></div></div>
<div class="section">
<h2><a name="S3_Select_configuration_options."></a>S3 Select configuration options.</h2>
<p>Consult the javadocs for <tt>org.apache.hadoop.fs.s3a.select.SelectConstants</tt>.</p>
<p>The listed options can be set in <tt>core-site.xml</tt>, are supported by S3A per-bucket configuration, and can be set programmatically on the <tt>Configuration</tt> object used to configure a new filesystem instance.</p>
<p>Any of these options can be set in the builder returned by the <tt>openFile()</tt> call: simply set them through a chain of <tt>builder.must()</tt> operations.</p>
<div>
<div>
<pre class="source">&lt;property&gt;
&lt;name&gt;fs.s3a.select.input.format&lt;/name&gt;
&lt;value&gt;csv&lt;/value&gt;
&lt;description&gt;Input format&lt;/description&gt;
&lt;/property&gt;
&lt;property&gt;
&lt;name&gt;fs.s3a.select.output.format&lt;/name&gt;
&lt;value&gt;csv&lt;/value&gt;
&lt;description&gt;Output format&lt;/description&gt;
&lt;/property&gt;
&lt;property&gt;
&lt;name&gt;fs.s3a.select.input.csv.comment.marker&lt;/name&gt;
&lt;value&gt;#&lt;/value&gt;
&lt;description&gt;In S3 Select queries: the marker for comment lines in CSV files&lt;/description&gt;
&lt;/property&gt;
&lt;property&gt;
&lt;name&gt;fs.s3a.select.input.csv.record.delimiter&lt;/name&gt;
&lt;value&gt;\n&lt;/value&gt;
&lt;description&gt;In S3 Select queries over CSV files: the record delimiter.
\t is remapped to the TAB character, \r to CR \n to newline. \\ to \
and \&quot; to &quot;
&lt;/description&gt;
&lt;/property&gt;
&lt;property&gt;
&lt;name&gt;fs.s3a.select.input.csv.field.delimiter&lt;/name&gt;
&lt;value&gt;,&lt;/value&gt;
&lt;description&gt;In S3 Select queries over CSV files: the field delimiter.
\t is remapped to the TAB character, \r to CR \n to newline. \\ to \
and \&quot; to &quot;
&lt;/description&gt;
&lt;/property&gt;
&lt;property&gt;
&lt;name&gt;fs.s3a.select.input.csv.quote.character&lt;/name&gt;
&lt;value&gt;&quot;&lt;/value&gt;
&lt;description&gt;In S3 Select queries over CSV files: quote character.
\t is remapped to the TAB character, \r to CR \n to newline. \\ to \
and \&quot; to &quot;
&lt;/description&gt;
&lt;/property&gt;
&lt;property&gt;
&lt;name&gt;fs.s3a.select.input.csv.quote.escape.character&lt;/name&gt;
&lt;value&gt;\\&lt;/value&gt;
&lt;description&gt;In S3 Select queries over CSV files: quote escape character.
\t is remapped to the TAB character, \r to CR \n to newline. \\ to \
and \&quot; to &quot;
&lt;/description&gt;
&lt;/property&gt;
&lt;property&gt;
&lt;name&gt;fs.s3a.select.input.csv.header&lt;/name&gt;
&lt;value&gt;none&lt;/value&gt;
&lt;description&gt;In S3 Select queries over CSV files: what is the role of the header? One of &quot;none&quot;, &quot;ignore&quot; and &quot;use&quot;&lt;/description&gt;
&lt;/property&gt;
&lt;property&gt;
&lt;name&gt;fs.s3a.select.input.compression&lt;/name&gt;
&lt;value&gt;none&lt;/value&gt;
&lt;description&gt;In S3 Select queries, the source compression
algorithm. One of: &quot;none&quot; and &quot;gzip&quot;&lt;/description&gt;
&lt;/property&gt;
&lt;property&gt;
&lt;name&gt;fs.s3a.select.output.csv.quote.fields&lt;/name&gt;
&lt;value&gt;always&lt;/value&gt;
&lt;description&gt;
In S3 Select queries: should fields in generated CSV Files be quoted?
One of: &quot;always&quot;, &quot;asneeded&quot;.
&lt;/description&gt;
&lt;/property&gt;
&lt;property&gt;
&lt;name&gt;fs.s3a.select.output.csv.quote.character&lt;/name&gt;
&lt;value&gt;&quot;&lt;/value&gt;
&lt;description&gt;
In S3 Select queries: the quote character for generated CSV Files.
&lt;/description&gt;
&lt;/property&gt;
&lt;property&gt;
&lt;name&gt;fs.s3a.select.output.csv.quote.escape.character&lt;/name&gt;
&lt;value&gt;\\&lt;/value&gt;
&lt;description&gt;
In S3 Select queries: the quote escape character for generated CSV Files.
&lt;/description&gt;
&lt;/property&gt;
&lt;property&gt;
&lt;name&gt;fs.s3a.select.output.csv.record.delimiter&lt;/name&gt;
&lt;value&gt;\n&lt;/value&gt;
&lt;description&gt;
In S3 Select queries: the record delimiter for generated CSV Files.
&lt;/description&gt;
&lt;/property&gt;
&lt;property&gt;
&lt;name&gt;fs.s3a.select.output.csv.field.delimiter&lt;/name&gt;
&lt;value&gt;,&lt;/value&gt;
&lt;description&gt;
In S3 Select queries: the field delimiter for generated CSV Files.
&lt;/description&gt;
&lt;/property&gt;
&lt;property&gt;
&lt;name&gt;fs.s3a.select.errors.include.sql&lt;/name&gt;
&lt;value&gt;false&lt;/value&gt;
&lt;description&gt;
Include the SQL statement in errors: this is useful for development but
may leak security and Personally Identifying Information in production,
so must be disabled there.
&lt;/description&gt;
&lt;/property&gt;
</pre></div></div>
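<p>As these are ordinary S3A options, they can also be set programmatically, including through S3A per-bucket configuration. A sketch, with a hypothetical bucket name:</p>
<div>
<div>
<pre class="source">// Sketch: per-bucket select options set on the Configuration used to
// create the filesystem; &quot;example-bucket&quot; is a placeholder name.
Configuration conf = new Configuration();
conf.set(&quot;fs.s3a.bucket.example-bucket.select.input.compression&quot;, &quot;gzip&quot;);
conf.set(&quot;fs.s3a.bucket.example-bucket.select.input.csv.header&quot;, &quot;use&quot;);
FileSystem fs = FileSystem.get(new URI(&quot;s3a://example-bucket/&quot;), conf);
</pre></div></div>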
</div>
<div class="section">
<h2><a name="Security_and_Privacy"></a>Security and Privacy</h2>
<p>SQL Injection attacks are the classic attack on data. Because S3 Select is a read-only API, the classic <a class="externalLink" href="https://xkcd.com/327/">&#x201c;Bobby Tables&#x201d;</a> attack to gain write access isn&#x2019;t going to work. Even so: sanitize your inputs.</p>
<p>CSV does have security issues of its own, specifically:</p>
<p><i>Excel and other spreadsheets may interpret some fields beginning with special characters as formulas, and execute them</i></p>
<p>S3 Select does not appear vulnerable to this, but in workflows where untrusted data eventually ends up in a spreadsheet (including Google Document spreadsheets), the data should be sanitized/audited first. There is no support for such sanitization in S3 Select or in the S3A connector.</p>
<p>Logging Select statements may expose secrets if they are in the statement. Even if they are just logged, this may potentially leak Personally Identifying Information as covered in the EU GDPR legislation and equivalents.</p>
<p>For both privacy and security reasons, SQL statements are not included in exception strings by default, nor logged at INFO level.</p>
<p>To enable them, set <tt>fs.s3a.select.errors.include.sql</tt> to <tt>true</tt>, either in the site/application configuration, or as an option in the builder for a single request. When set, the request will also be logged at the INFO level of the log <tt>org.apache.hadoop.fs.s3a.select.SelectBinding</tt>.</p>
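<p>For example, a sketch of enabling it for a single request through the builder, using <tt>opt()</tt> so the setting stays optional; the variables <tt>fs</tt>, <tt>path</tt> and <tt>sql</tt> are placeholders, and this should only be done during development:</p>
<div>
<div>
<pre class="source">// Sketch: include the SQL statement in error text for this one request.
FSDataInputStream in = fs.openFile(path)
    .must(&quot;fs.s3a.select.sql&quot;, sql)
    .opt(&quot;fs.s3a.select.errors.include.sql&quot;, &quot;true&quot;)
    .build()
    .get();
</pre></div></div>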
<p>Personally Identifiable Information is not printed in the AWS S3 logs. Those logs contain only the SQL keywords from the query planner; all column names and literals are masked. The following is a sample log entry:</p>
<p><i>Query:</i></p>
<div>
<div>
<pre class="source">SELECT * FROM S3OBJECT s;
</pre></div></div>
<p><i>Log:</i></p>
<div>
<div>
<pre class="source">select (project (list (project_all))) (from (as str0 (id str1 case_insensitive)))
</pre></div></div>
<p>Note also that:</p>
<ol style="list-style-type: decimal">
<li>Debug-level Hadoop logs for the module <tt>org.apache.hadoop.fs.s3a</tt> and other components&#x2019; debug logs may also log the SQL statements (e.g. aws-sdk HTTP logs).</li>
</ol>
<p>The best practice here is: only enable SQL in exceptions while developing SQL queries, especially in an application/notebook where the exception text is a lot easier to see than the application logs.</p>
<p>In production: don&#x2019;t log or report. If you do, all logs and output must be considered sensitive from security and privacy perspectives.</p>
<p>The <tt>hadoop s3guard select</tt> command does enable the logging, so can be used as an initial place to experiment with the SQL syntax. Rationale: if you are constructing SQL queries on the command line, your shell history is already tainted with the query.</p>
<div class="section">
<h3><a name="Links"></a>Links</h3>
<ul>
<li><a class="externalLink" href="https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2014-3524">CVE-2014-3524</a>.</li>
<li><a class="externalLink" href="http://georgemauer.net/2017/10/07/csv-injection.html">The Absurdly Underestimated Dangers of CSV Injection</a>.</li>
<li><a class="externalLink" href="https://www.contextis.com/blog/comma-separated-vulnerabilities">Comma Separated Vulnerabilities</a>.</li>
</ul></div>
<div class="section">
<h3><a name="SQL_Syntax"></a>SQL Syntax</h3>
<p>The SQL Syntax directly supported by the AWS S3 Select API is <a class="externalLink" href="https://docs.aws.amazon.com/AmazonS3/latest/dev/s3-glacier-select-sql-reference.html">documented by Amazon</a>.</p>
<ul>
<li>Use single quotes for all constants, not double quotes.</li>
<li>All CSV column values are strings unless cast to a type.</li>
<li>Simple <tt>SELECT</tt> calls, no <tt>JOIN</tt>.</li>
</ul></div>
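<p>For example, a query combining these rules, written as a Java string in the style used later in this document (the column names follow the landsat examples above; the numeric threshold is illustrative):</p>
<div>
<div>
<pre class="source">// A CAST is needed for a numeric comparison, since every CSV value
// arrives as a string; string constants would use single quotes.
String sql = &quot;SELECT s.entityId FROM S3OBJECT s&quot;
    + &quot; WHERE CAST(s.cloudCover AS FLOAT) &lt; 10.0&quot;
    + &quot; LIMIT 20&quot;;
</pre></div></div>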
<div class="section">
<h3><a name="CSV_formats"></a>CSV formats</h3>
<p>&#x201c;CSV&#x201d; is less a format and more a term meaning &#x201c;the data is in some nonstandard line-by-line text format&#x201d;; there are even &#x201c;multiline CSV files&#x201d;.</p>
<p>S3 Select only supports a subset of the loose &#x201c;CSV&#x201d; concept, as covered in the AWS documentation. There are also limits on how many columns and how large a single line may be.</p>
<p>The specific quotation character, field and record delimiters, comments and escape characters can be configured in the Hadoop configuration.</p></div>
<div class="section">
<h3><a name="Consistency.2C_Concurrency_and_Error_handling"></a>Consistency, Concurrency and Error handling</h3>
<p><b>Consistency</b></p>
<p>Since November 2020, AWS S3 has been fully consistent. This also applies to S3 Select. We do not know what happens if an object is overwritten while a query is active.</p>
<p><b>Concurrency</b></p>
<p>The outcome when the source file is overwritten while the results of a select call are still being read is undefined.</p>
<p>The input stream returned by the operation is <i>NOT THREAD SAFE</i>.</p>
<p><b>Error Handling</b></p>
<p>If an attempt to issue an S3 Select call fails, the S3A connector will reissue the request if-and-only-if it believes a retry may succeed; that is, it considers the operation to be idempotent, and the failure to be a recoverable connectivity problem or a server-side rejection which can be retried (500, 503).</p>
<p>If an attempt to read data from an S3 Select stream (<tt>org.apache.hadoop.fs.s3a.select.SelectInputStream</tt>) fails partway through the read, <i>no attempt is made to retry the operation</i>.</p>
<p>In contrast, the normal S3A input stream tries to recover from (possibly transient) failures by attempting to reopen the file.</p></div></div>
<div class="section">
<h2><a name="Performance"></a>Performance</h2>
<p>The select operation is best when the least amount of data is returned by the query, as this reduces the amount of data downloaded.</p>
<ul>
<li>Limit the number of columns projected to only those needed.</li>
<li>Use <tt>LIMIT</tt> to set an upper limit on the rows read, rather than implementing a row counter in application code and closing the stream when reached. This avoids having to abort the HTTPS connection and negotiate a new one on the next S3 request.</li>
</ul>
<p>The select call itself can be slow, especially when the source is a multi-MB compressed file with aggressive filtering in the <tt>WHERE</tt> clause. Assumption: the select query starts at row 1 and scans through each row, and does not return data until it has matched one or more rows.</p>
<p>If the asynchronous nature of the <tt>openFile().build().get()</tt> sequence can be taken advantage of, by performing other work before or in parallel to the <tt>get()</tt> call: do it.</p></div>
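<p>A sketch of that pattern; <tt>prepareProcessing()</tt> is a placeholder for whatever other work the application can usefully do while the select is in flight:</p>
<div>
<div>
<pre class="source">// Sketch: start the select, overlap other work, then block on the result.
CompletableFuture&lt;FSDataInputStream&gt; future =
    fs.openFile(path)
        .must(&quot;fs.s3a.select.sql&quot;, sql)
        .build();          // returns immediately; the select runs asynchronously

prepareProcessing();       // placeholder for other setup work

try (FSDataInputStream in = future.get()) {   // blocks until the select completes
  // consume the selected records
}
</pre></div></div>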
<div class="section">
<h2><a name="Troubleshooting"></a>Troubleshooting</h2>
<p>Getting S3 Select code to work is hard, though those knowledgeable in SQL will find it easier.</p>
<p>Problems can be split into:</p>
<ol style="list-style-type: decimal">
<li>Basic configuration of the client to issue the query.</li>
<li>Bad SQL select syntax and grammar.</li>
<li>Datatype casting issues.</li>
<li>Bad records/data in source files.</li>
<li>Failure to configure MR jobs to work correctly.</li>
<li>Failure of MR jobs whose record readers cannot cope with the amount of data returned being less than expected.</li>
</ol>
<p>The exceptions here are all based on the experience during writing tests; more may surface with broader use.</p>
<p>All failures other than network errors on request initialization are considered unrecoverable and will not be reattempted.</p>
<p>As parse-time errors always state the line and column of an error, you can simplify debugging by breaking a SQL statement across lines, e.g.</p>
<div>
<div>
<pre class="source">String sql = &quot;SELECT\n&quot;
+ &quot;s.entityId \n&quot;
+ &quot;FROM &quot; + &quot;S3OBJECT s WHERE\n&quot;
+ &quot;s.\&quot;cloudCover\&quot; = '100.0'\n&quot;
+ &quot; LIMIT 100&quot;;
</pre></div></div>
<p>Now if the error is declared as &#x201c;line 4&#x201d;, it will be on the select conditions; the column offset will begin from the first character on that row.</p>
<p>The SQL statements issued are only included in exceptions if <tt>fs.s3a.select.errors.include.sql</tt> is explicitly set to true. This can be done in an application&#x2019;s configuration during development, or as an <tt>openFile()</tt> option parameter. It should only be done during development, to reduce the risk of logging security or privacy information.</p>
<div class="section">
<h3><a name="a.E2.80.9Cmid-query.E2.80.9D_failures_on_large_datasets"></a>&#x201c;mid-query&#x201d; failures on large datasets</h3>
<p>S3 Select returns paged results; the source file is <i>not</i> filtered in one go in the initial request.</p>
<p>This means that errors related to the content of the data (type casting, etc.) may only surface partway through the read. They may differ from the errors raised when the problem occurs in the first page of data, where it surfaces earlier in the read process.</p></div>
<div class="section">
<h3><a name="External_Resources_on_for_troubleshooting"></a>External Resources on for troubleshooting</h3>
<p>See:</p>
<ul>
<li><a class="externalLink" href="https://docs.aws.amazon.com/AmazonS3/latest/dev/s3-glacier-select-sql-reference-select.html">SELECT Command Reference</a></li>
<li><a class="externalLink" href="https://docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectSELECTContent.html">SELECT Object Content</a></li>
</ul></div>
<div class="section">
<h3><a name="IOException:_.E2.80.9Cnot_a_gzip_file.E2.80.9D"></a>IOException: &#x201c;not a gzip file&#x201d;</h3>
<p>This surfaces when trying to read in data from a <tt>.gz</tt> source file through an MR or other analytics query, and the gzip codec has tried to parse it.</p>
<div>
<div>
<pre class="source">java.io.IOException: not a gzip file
at org.apache.hadoop.io.compress.zlib.BuiltInGzipDecompressor.processBasicHeader(BuiltInGzipDecompressor.java:496)
at org.apache.hadoop.io.compress.zlib.BuiltInGzipDecompressor.executeHeaderState(BuiltInGzipDecompressor.java:257)
at org.apache.hadoop.io.compress.zlib.BuiltInGzipDecompressor.decompress(BuiltInGzipDecompressor.java:186)
at org.apache.hadoop.io.compress.DecompressorStream.decompress(DecompressorStream.java:111)
at org.apache.hadoop.io.compress.DecompressorStream.read(DecompressorStream.java:105)
at java.io.InputStream.read(InputStream.java:101)
at org.apache.hadoop.util.LineReader.fillBuffer(LineReader.java:182)
at org.apache.hadoop.util.LineReader.readCustomLine(LineReader.java:306)
at org.apache.hadoop.util.LineReader.readLine(LineReader.java:174)
at org.apache.hadoop.mapreduce.lib.input.LineRecordReader.skipUtfByteOrderMark(LineRecordReader.java:158)
at org.apache.hadoop.mapreduce.lib.input.LineRecordReader.nextKeyValue(LineRecordReader.java:198)
</pre></div></div>
<p>The underlying problem is that the gzip decompressor is automatically enabled when the source file ends with the &#x201c;.gz&#x201d; extension. Because S3 Select returns decompressed data, the codec fails.</p>
<p>The workaround here is to declare that the job should add the &#x201c;Passthrough Codec&#x201d; to its list of known decompressors, and that this codec should declare the file format it supports to be &#x201c;.gz&#x201d;.</p>
<div>
<div>
<pre class="source">io.compression.codecs = org.apache.hadoop.io.compress.PassthroughCodec
io.compress.passthrough.extension = .gz
</pre></div></div>
</div>
<div class="section">
<h3><a name="AWSBadRequestException_InvalidColumnIndex"></a>AWSBadRequestException <tt>InvalidColumnIndex</tt></h3>
<p>Your SQL is wrong and the element at fault is considered an unknown column name.</p>
<div>
<div>
<pre class="source">org.apache.hadoop.fs.s3a.AWSBadRequestException:
Select: SELECT * FROM S3OBJECT WHERE odd = true on test/testSelectOddLines.csv:
com.amazonaws.services.s3.model.AmazonS3Exception:
The column index at line 1, column 30 is invalid.
Please check the service documentation and try again.
(Service: Amazon S3; Status Code: 400; Error Code: InvalidColumnIndex;
</pre></div></div>
<p>Here it&#x2019;s the first line of the query, column 30. Paste the query into an editor and position yourself on the line and column at fault.</p>
<div>
<div>
<pre class="source">SELECT * FROM S3OBJECT WHERE odd = true
^ HERE
</pre></div></div>
<p>Another example:</p>
<div>
<div>
<pre class="source">org.apache.hadoop.fs.s3a.AWSBadRequestException: Select:
SELECT * FROM S3OBJECT s WHERE s._1 = &quot;true&quot; on test/testSelectOddLines.csv:
com.amazonaws.services.s3.model.AmazonS3Exception:
The column index at line 1, column 39 is invalid.
Please check the service documentation and try again.
(Service: Amazon S3; Status Code: 400;
Error Code: InvalidColumnIndex;
</pre></div></div>
<p>Here it is because strings must be single quoted, not double quoted.</p>
<div>
<div>
<pre class="source">SELECT * FROM S3OBJECT s WHERE s._1 = &quot;true&quot;
^ HERE
</pre></div></div>
<p>S3 select uses double quotes to wrap column names, interprets the string as column &#x201c;true&#x201d;, and fails with a non-intuitive message.</p>
<p><i>Tip</i>: look for the element at fault and treat the <tt>InvalidColumnIndex</tt> message as a parse-time message, rather than the definitive root cause of the problem.</p></div>
<div class="section">
<h3><a name="AWSBadRequestException_ParseInvalidPathComponent"></a>AWSBadRequestException <tt>ParseInvalidPathComponent</tt></h3>
<p>Your SQL is wrong.</p>
<div>
<div>
<pre class="source">org.apache.hadoop.fs.s3a.AWSBadRequestException:
Select: SELECT * FROM S3OBJECT s WHERE s.'odd' is &quot;true&quot; on test/testSelectOddLines.csv
: com.amazonaws.services.s3.model.AmazonS3Exception: Invalid Path component,
expecting either an IDENTIFIER or STAR, got: LITERAL,at line 1, column 34.
(Service: Amazon S3; Status Code: 400; Error Code: ParseInvalidPathComponent;
</pre></div></div>
<div>
<div>
<pre class="source">SELECT * FROM S3OBJECT s WHERE s.'odd' is &quot;true&quot; on test/testSelectOddLines.csv
^ HERE
</pre></div></div>
</div>
<div class="section">
<h3><a name="AWSBadRequestException__ParseExpectedTypeName"></a>AWSBadRequestException <tt>ParseExpectedTypeName</tt></h3>
<p>Your SQL is still wrong.</p>
<div>
<div>
<pre class="source">org.apache.hadoop.fs.s3a.AWSBadRequestException:
Select: SELECT * FROM S3OBJECT s WHERE s.odd = &quot;true&quot;
on test/testSelectOddLines.csv:
com.amazonaws.services.s3.model.AmazonS3Exception
: Expected type name, found QUOTED_IDENTIFIER:'true' at line 1, column 41.
(Service: Amazon S3; Status Code: 400; Error Code: ParseExpectedTypeName;
</pre></div></div>
</div>
<div class="section">
<h3><a name="ParseUnexpectedToken"></a><tt>ParseUnexpectedToken</tt></h3>
<p>Your SQL is broken.</p>
<div>
<div>
<pre class="source">org.apache.hadoop.fs.s3a.AWSBadRequestException:
Select: SELECT * FROM S3OBJECT s WHERE s.5 = `true` on test/testSelectOddLines.csv:
com.amazonaws.services.s3.model.AmazonS3Exception:
Unexpected token found LITERAL:5d-1 at line 1, column 33.
(Service: Amazon S3; Status Code: 400; Error Code: ParseUnexpectedToken;
</pre></div></div>
</div>
<div class="section">
<h3><a name="ParseUnexpectedOperator"></a><tt>ParseUnexpectedOperator</tt></h3>
<p>Your SQL is broken.</p>
<div>
<div>
<pre class="source">com.amazonaws.services.s3.model.AmazonS3Exception: Unexpected operator OPERATOR:'%' at line 1, column 45.
(Service: Amazon S3; Status Code: 400;
Error Code: ParseUnexpectedOperator; Request ID: E87F30C57436B459;
S3 Extended Request ID: UBFOIgkQxBBL+bcBFPaZaPBsjdnd8NRz3NFWAgcctqm3n6f7ib9FMOpR+Eu1Cy6cNMYHCpJbYEY
=:ParseUnexpectedOperator: Unexpected operator OPERATOR:'%' at line 1, column 45.
at java.util.concurrent.CompletableFuture.reportGet(CompletableFuture.java:357)
at java.util.concurrent.CompletableFuture.get(CompletableFuture.java:1895)
</pre></div></div>
</div>
<div class="section">
<h3><a name="MissingHeaders"></a><tt>MissingHeaders</tt></h3>
<div>
<div>
<pre class="source">org.apache.hadoop.fs.s3a.AWSBadRequestException:
Select: SELECT * FROM S3OBJECT s WHERE s.&quot;odd&quot; = `true` on test/testSelectOddLines.csv:
com.amazonaws.services.s3.model.AmazonS3Exception:
Some headers in the query are missing from the file.
Please check the file and try again.
(Service: Amazon S3; Status Code: 400; Error Code: MissingHeaders;
</pre></div></div>
<p>Possible causes:</p>
<ol style="list-style-type: decimal">
<li>There&#x2019;s a header used in the query which doesn&#x2019;t match any in the document itself.</li>
<li>The header option for the select query is set to &#x201c;none&#x201d; or &#x201c;ignore&#x201d;, and the query refers to a column by its header name.</li>
</ol>
<p>This can happen if you are trying to use double quotes for constants in the SQL expression.</p>
<div>
<div>
<pre class="source">SELECT * FROM S3OBJECT s WHERE s.&quot;odd&quot; = &quot;true&quot; on test/testSelectOddLines.csv:
^ HERE
</pre></div></div>
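<p>A corrected form of the query single-quotes the constant (assuming the file&#x2019;s header row does define an <tt>odd</tt> column):</p>
<div>
<div>
<pre class="source">SELECT * FROM S3OBJECT s WHERE s.&quot;odd&quot; = 'true'
</pre></div></div>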
<p>Double quotes (&quot;) may only be used when naming columns; for constants single quotes are required.</p></div>
<div class="section">
<h3><a name="Method_not_allowed"></a>Method not allowed</h3>
<div>
<div>
<pre class="source">org.apache.hadoop.fs.s3a.AWSS3IOException: Select on test/testSelectWholeFile:
com.amazonaws.services.s3.model.AmazonS3Exception: The specified method is not
allowed against this resource. (Service: Amazon S3; Status Code: 405;
Error Code: MethodNotAllowed;
</pre></div></div>
<p>You are trying to use S3 Select to read data which, for some reason, you are not allowed to read.</p></div>
<div class="section">
<h3><a name="AWSBadRequestException_InvalidTextEncoding"></a>AWSBadRequestException <tt>InvalidTextEncoding</tt></h3>
<p>The file couldn&#x2019;t be parsed. This can happen if you try to read a <tt>.gz</tt> file and forget to set the compression in the select request.</p>
<p>That can be done through the <tt>fs.s3a.select.compression</tt> option.</p>
<div>
<div>
<pre class="source">org.apache.hadoop.fs.s3a.AWSBadRequestException:
Select: '&quot; SELECT * FROM S3OBJECT s WHERE endstation_name = 'Bayswater Road: Hyde Park' &quot;
on s3a://example/dataset.csv.gz:
com.amazonaws.services.s3.model.AmazonS3Exception:
UTF-8 encoding is required. The text encoding error was found near byte 8,192.
(Service: Amazon S3; Status Code: 400; Error Code: InvalidTextEncoding
</pre></div></div>
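<p>A minimal sketch of declaring the compression when opening the file, assuming the SQL and compression are passed as mandatory options on the <tt>openFile()</tt> builder; the option name is taken from the text above, and the <tt>GZIP</tt> value and class name are assumptions to check against the select documentation for your release:</p>
<div>
<div>
<pre class="source">import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class SelectGzipExample {
  // Hypothetical helper: open a gzipped CSV through S3 Select, declaring the
  // compression so the service decompresses the data before running the SQL.
  public static FSDataInputStream openSelect(FileSystem fs) throws Exception {
    return fs.openFile(new Path(&quot;s3a://example/dataset.csv.gz&quot;))
        .must(&quot;fs.s3a.select.sql&quot;,
            &quot;SELECT * FROM S3OBJECT s WHERE endstation_name = 'Bayswater Road: Hyde Park'&quot;)
        .must(&quot;fs.s3a.select.compression&quot;, &quot;GZIP&quot;)
        .build()
        .get();
  }
}
</pre></div></div>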
</div>
<div class="section">
<h3><a name="AWSBadRequestException__InvalidCompressionFormat_.E2.80.9CGZIP_is_not_applicable_to_the_queried_object.E2.80.9D"></a>AWSBadRequestException <tt>InvalidCompressionFormat</tt> &#x201c;GZIP is not applicable to the queried object&#x201d;</h3>
<p>A SELECT call has been made declaring a compression which doesn&#x2019;t match that of the source object; here, GZIP was requested for what is a plain text file.</p>
<div>
<div>
<pre class="source">org.apache.hadoop.fs.s3a.AWSBadRequestException: Select:
'&quot; SELECT * FROM S3OBJECT s WHERE endstation_name = 'Bayswater Road: Hyde Park' &quot;
on s3a://example/dataset.csv:
com.amazonaws.services.s3.model.AmazonS3Exception:
GZIP is not applicable to the queried object. Please correct the request and try again.
(Service: Amazon S3; Status Code: 400; Error Code: InvalidCompressionFormat;
at org.apache.hadoop.fs.s3a.S3AUtils.translateException(S3AUtils.java:212)
at org.apache.hadoop.fs.s3a.Invoker.once(Invoker.java:111)
...
Caused by: com.amazonaws.services.s3.model.AmazonS3Exception: GZIP is not applicable to the queried object.
Please correct the request and try again.
Service: Amazon S3; Status Code: 400; Error Code: InvalidCompressionFormat;
at com.amazonaws.http.AmazonHttpClient$RequestExecutor.handleErrorResponse
...
</pre></div></div>
</div>
<div class="section">
<h3><a name="PathIOException:_.E2.80.9Cseek.28.29_not_supported.E2.80.9D"></a><tt>PathIOException</tt>: &#x201c;seek() not supported&#x201d;</h3>
<p>The input stream returned by the select call does not support seeking backwards in the stream.</p>
<p>Similarly, <tt>PositionedReadable</tt> operations will fail when used to read data at any offset other than that of <tt>getPos()</tt>.</p>
<div>
<div>
<pre class="source">org.apache.hadoop.fs.PathIOException: `s3a://landsat-pds/landsat.csv.gz': seek() not supported
at org.apache.hadoop.fs.s3a.select.SelectInputStream.unsupported(SelectInputStream.java:254)
at org.apache.hadoop.fs.s3a.select.SelectInputStream.seek(SelectInputStream.java:243)
at org.apache.hadoop.fs.FSDataInputStream.seek(FSDataInputStream.java:66)
</pre></div></div>
<p>There is no fix for this. You can move forward in a file using <tt>skip(offset)</tt>; bear in mind that the return value indicates how many bytes were actually skipped, which may be less than requested.</p>
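<p>A minimal sketch of moving forward through the stream, assuming only forward motion is needed; because <tt>skip()</tt> may skip fewer bytes than requested, it is called in a loop (the helper name is illustrative):</p>
<div>
<div>
<pre class="source">import java.io.IOException;
import java.io.InputStream;

public class ForwardSkip {
  // Hypothetical helper: skip forward by exactly toSkip bytes, looping
  // because a single skip() call may move forward by less than requested.
  public static void skipFully(InputStream in, long toSkip) throws IOException {
    long remaining = toSkip;
    while (remaining &gt; 0) {
      long skipped = in.skip(remaining);
      if (skipped &lt;= 0) {
        throw new IOException(&quot;Unable to skip &quot; + remaining + &quot; more bytes&quot;);
      }
      remaining -= skipped;
    }
  }
}
</pre></div></div>
</div>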
<div class="section">
<h3><a name="IllegalArgumentException:_.22Unknown_mandatory_key_.E2.80.9Cfs.s3a.select.sql.E2.80.9D"></a><tt>IllegalArgumentException</tt>: &quot;Unknown mandatory key &#x201c;fs.s3a.select.sql&#x201d;</h3>
<p>The filesystem is not an S3A filesystem, so the S3A select option is not recognized.</p>
<div>
<div>
<pre class="source">java.lang.IllegalArgumentException: Unknown mandatory key &quot;fs.s3a.select.sql&quot;
at com.google.common.base.Preconditions.checkArgument(Preconditions.java:88)
at org.apache.hadoop.fs.AbstractFSBuilder.lambda$rejectUnknownMandatoryKeys$0(AbstractFSBuilder.java:331)
at java.lang.Iterable.forEach(Iterable.java:75)
at java.util.Collections$UnmodifiableCollection.forEach(Collections.java:1080)
at org.apache.hadoop.fs.AbstractFSBuilder.rejectUnknownMandatoryKeys(AbstractFSBuilder.java:330)
at org.apache.hadoop.fs.FileSystem.openFileWithOptions(FileSystem.java:3541)
at org.apache.hadoop.fs.FileSystem$FSDataInputStreamBuilder.build(FileSystem.java:4442)
</pre></div></div>
<ul>
<li>Verify that the URL has an &#x201c;s3a:&#x201d; prefix.</li>
<li>If it does, there may be a non-standard S3A implementation, or a filtering/relaying class has been placed in front of the S3AFileSystem.</li>
</ul></div>
<div class="section">
<h3><a name="IllegalArgumentException:_.E2.80.9CUnknown_mandatory_key_in_non-select_file_I.2FO.E2.80.9D"></a><tt>IllegalArgumentException</tt>: &#x201c;Unknown mandatory key in non-select file I/O&#x201d;</h3>
<p>The file options to tune an S3 select call are only valid when a SQL expression is set in the <tt>fs.s3a.select.sql</tt> option. If not, any such option added as a <tt>must()</tt> value will fail.</p>
<div>
<div>
<pre class="source">java.lang.IllegalArgumentException: Unknown mandatory key for s3a://example/test/testSelectOptionsOnlyOnSelectCalls.csv in non-select file I/O &quot;fs.s3a.select.input.csv.header&quot;
at com.google.common.base.Preconditions.checkArgument(Preconditions.java:115)
at org.apache.hadoop.fs.impl.AbstractFSBuilderImpl.lambda$rejectUnknownMandatoryKeys$0(AbstractFSBuilderImpl.java:352)
at java.lang.Iterable.forEach(Iterable.java:75)
at java.util.Collections$UnmodifiableCollection.forEach(Collections.java:1080)
at org.apache.hadoop.fs.impl.AbstractFSBuilderImpl.rejectUnknownMandatoryKeys(AbstractFSBuilderImpl.java:351)
at org.apache.hadoop.fs.s3a.S3AFileSystem.openFileWithOptions(S3AFileSystem.java:3736)
at org.apache.hadoop.fs.FileSystem$FSDataInputStreamBuilder.build(FileSystem.java:4471)
</pre></div></div>
<p>Requiring these options without providing a SQL query is invariably an error. Fix: add the SQL statement, or use <tt>opt()</tt> calls to set the option.</p>
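<p>A minimal sketch of the distinction, assuming the <tt>openFile()</tt> builder API; the header option value <tt>use</tt> and the class name are assumptions to check against the select documentation for your release:</p>
<div>
<div>
<pre class="source">import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class SelectOptionExample {
  // With a SQL expression, the other select options are meaningful and can
  // be passed as must() values.
  public static FSDataInputStream selectRead(FileSystem fs, Path path) throws Exception {
    return fs.openFile(path)
        .must(&quot;fs.s3a.select.sql&quot;, &quot;SELECT * FROM S3OBJECT s&quot;)
        .must(&quot;fs.s3a.select.input.csv.header&quot;, &quot;use&quot;)
        .build()
        .get();
  }

  // Without a SQL expression, pass any select tuning with opt() so that a
  // non-select open does not reject it as an unknown mandatory key.
  public static FSDataInputStream plainRead(FileSystem fs, Path path) throws Exception {
    return fs.openFile(path)
        .opt(&quot;fs.s3a.select.input.csv.header&quot;, &quot;use&quot;)
        .build()
        .get();
  }
}
</pre></div></div>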
<p>If the <tt>fs.s3a.select.sql</tt> option is set and a key is still rejected, then either the spelling of the key is wrong, it has leading or trailing spaces, or it is an option not supported in that specific release of Hadoop.</p></div>
<div class="section">
<h3><a name="PathIOException_:_.E2.80.9Cseek.28.29_backwards_from__not_supported.E2.80.9D"></a>PathIOException : &#x201c;seek() backwards from not supported&#x201d;</h3>
<p>Backwards seeks in an S3 Select <tt>SelectInputStream</tt> are not supported.</p>
<div>
<div>
<pre class="source">org.apache.hadoop.fs.PathIOException: `s3a://landsat-pds/scene_list.gz':
seek() backwards from 16387 to 0 not supported
at org.apache.hadoop.fs.s3a.select.SelectInputStream.unsupported(SelectInputStream.java:288)
at org.apache.hadoop.fs.s3a.select.SelectInputStream.seek(SelectInputStream.java:253)
at org.apache.hadoop.fs.FSDataInputStream.seek(FSDataInputStream.java:66)
</pre></div></div>
</div>
<div class="section">
<h3><a name="InvalidTableAlias"></a>InvalidTableAlias</h3>
<p>The SELECT refers to the name of a column which is not recognized. Possible causes:</p>
<ul>
<li>the name of a column is wrong, here <tt>s.oddf</tt>.</li>
<li>headers are not enabled for the CSV source file. Fix: enable.</li>
<li>a generated alias is used, e.g. <tt>s._1</tt>, but headers have been enabled. Fix: disable headers, or use the header name.</li>
</ul>
<div>
<div>
<pre class="source">org.apache.hadoop.fs.s3a.AWSBadRequestException:
SELECT * FROM S3OBJECT WHERE s.&quot;oddf&quot; = 'true'
on s3a://example/test/testParseBrokenCSVFile:
com.amazonaws.services.s3.model.AmazonS3Exception:
Invalid table alias is specified at line 1, column 30.
Please check the file and try again. (Service: Amazon S3; Status Code: 400; Error Code: InvalidTableAlias;
Invalid table alias is specified at line 1, column 30. Please check the file and try again.
(Service: Amazon S3; Status Code: 400;
Error Code: InvalidTableAlias;
Request ID: 8693B86A52CFB91C;
at org.apache.hadoop.fs.s3a.S3AUtils.translateException(S3AUtils.java:225)
at org.apache.hadoop.fs.s3a.Invoker.once(Invoker.java:111)
at org.apache.hadoop.fs.s3a.Invoker.lambda$retry$3(Invoker.java:265)
...
Caused by: com.amazonaws.services.s3.model.AmazonS3Exception:
Invalid table alias is specified at line 1, column 30.
Please check the file and try again.
(Service: Amazon S3; Status Code: 400; Error Code: InvalidTableAlias; Request ID: 8693B86A52CFB91C;
at com.amazonaws.http.AmazonHttpClient$RequestExecutor.handleErrorResponse(AmazonHttpClient.java:1640)
at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeOneRequest(AmazonHttpClient.java:1304)
at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeHelper(AmazonHttpClient.java:1058)
at com.amazonaws.http.AmazonHttpClient$RequestExecutor.doExecute(AmazonHttpClient.java:743)
at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeWithTimer(AmazonHttpClient.java:717)
at com.amazonaws.http.AmazonHttpClient$RequestExecutor.execute(AmazonHttpClient.java:699)
at com.amazonaws.http.AmazonHttpClient$RequestExecutor.access$500(AmazonHttpClient.java:667)
at com.amazonaws.http.AmazonHttpClient$RequestExecutionBuilderImpl.execute(AmazonHttpClient.java:649)
</pre></div></div>
</div>
<div class="section">
<h3><a name="AWSBadRequestException_.E2.80.9CAttempt_to_convert_from_one_data_type_to_another_failed:_cast_from_STRING_to_TIMESTAMP..E2.80.9D"></a><tt>AWSBadRequestException</tt> &#x201c;Attempt to convert from one data type to another failed: cast from STRING to TIMESTAMP.&#x201d;</h3>
<p>A string field could not be converted to a timestamp because one or more of its entries could not be parsed as timestamps.</p>
<p>Example: from a spreadsheet where &#x201c;timestamp&#x201d; is normally a well-formatted timestamp field, but in one row the value is just &#x201c;Tuesday&#x201d;:</p>
<div>
<div>
<pre class="source">SELECT CAST(s.date AS TIMESTAMP) FROM S3OBJECT s
</pre></div></div>
<div>
<div>
<pre class="source">org.apache.hadoop.fs.s3a.AWSBadRequestException: Select on s3a://example/test/testParseBrokenCSVFile:
com.amazonaws.services.s3.model.AmazonS3Exception:
Attempt to convert from one data type to another failed: cast from STRING to TIMESTAMP.
(Service: Amazon S3; Status Code: 400; Error Code: CastFailed;
Request ID: E2158FE45AF2049A; S3 Extended Request ID: iM40fzGuaPt6mQo0QxDDX+AY1bAgSVD1sKErFq6Y4GDJYHIAnmc00i0EvGGnH+0MFCFhKIivIrQ=),
S3 Extended Request ID: iM40fzGuaPt6mQo0QxDDX+AY1bAgSVD1sKErFq6Y4GDJYHIAnmc00i0EvGGnH+0MFCFhKIivIrQ=:CastFailed:
Attempt to convert from one data type to another failed: cast from STRING to TIMESTAMP.
(Service: Amazon S3; Status Code: 400; Error Code: CastFailed; Request ID: E2158FE45AF2049A; S3 Extended Request ID: iM40fzGuaPt6mQo0QxDDX+AY1bAgSVD1sKErFq6Y4GDJYHIAnmc00i0EvGGnH+0MFCFhKIivIrQ=)
at org.apache.hadoop.fs.s3a.S3AUtils.translateException(S3AUtils.java:225)
at org.apache.hadoop.fs.s3a.Invoker.once(Invoker.java:111)
at org.apache.hadoop.fs.s3a.Invoker.lambda$retry$3(Invoker.java:265)
Caused by: com.amazonaws.services.s3.model.AmazonS3Exception:
Attempt to convert from one data type to another failed: cast from STRING to TIMESTAMP.
(Service: Amazon S3; Status Code: 400; Error Code: CastFailed;)
</pre></div></div>
<p>There&#x2019;s no way to recover from a bad record here; no option to skip invalid rows.</p>
<p><i>Note:</i> This is an example stack trace <i>without</i> the SQL being printed.</p></div></div>
</div>
</div>
<div class="clear">
<hr/>
</div>
<div id="footer">
<div class="xright">
&#169; 2008-2021
Apache Software Foundation
- <a href="http://maven.apache.org/privacy-policy.html">Privacy Policy</a>.
Apache Maven, Maven, Apache, the Apache feather logo, and the Apache Maven project logos are trademarks of The Apache Software Foundation.
</div>
<div class="clear">
<hr/>
</div>
</div>
</body>
</html>