<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<!--
| Generated by Apache Maven Doxia at 2021-06-15
| Rendered using Apache Maven Stylus Skin 1.5
-->
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Apache Hadoop Amazon Web Services support &#x2013; S3Guard: Consistency and Metadata Caching for S3A</title>
<style type="text/css" media="all">
@import url("../../css/maven-base.css");
@import url("../../css/maven-theme.css");
@import url("../../css/site.css");
</style>
<link rel="stylesheet" href="../../css/print.css" type="text/css" media="print" />
<meta name="Date-Revision-yyyymmdd" content="20210615" />
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
</head>
<body class="composite">
<div id="banner">
<a href="http://hadoop.apache.org/" id="bannerLeft">
<img src="http://hadoop.apache.org/images/hadoop-logo.jpg" alt="" />
</a>
<a href="http://www.apache.org/" id="bannerRight">
<img src="http://www.apache.org/images/asf_logo_wide.png" alt="" />
</a>
<div class="clear">
<hr/>
</div>
</div>
<div id="breadcrumbs">
<div class="xleft">
<a href="http://www.apache.org/" class="externalLink">Apache</a>
&gt;
<a href="http://hadoop.apache.org/" class="externalLink">Hadoop</a>
&gt;
<a href="../../index.html">Apache Hadoop Amazon Web Services support</a>
&gt;
S3Guard: Consistency and Metadata Caching for S3A
</div>
<div class="xright"> <a href="http://wiki.apache.org/hadoop" class="externalLink">Wiki</a>
|
<a href="https://gitbox.apache.org/repos/asf/hadoop.git" class="externalLink">git</a>
&nbsp;| Last Published: 2021-06-15
&nbsp;| Version: 3.3.1
</div>
<div class="clear">
<hr/>
</div>
</div>
<div id="leftColumn">
<div id="navcolumn">
<h5>General</h5>
<ul>
<li class="none">
<a href="../../../index.html">Overview</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/SingleCluster.html">Single Node Setup</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/ClusterSetup.html">Cluster Setup</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/CommandsManual.html">Commands Reference</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/FileSystemShell.html">FileSystem Shell</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/Compatibility.html">Compatibility Specification</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/DownstreamDev.html">Downstream Developer's Guide</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/AdminCompatibilityGuide.html">Admin Compatibility Guide</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/InterfaceClassification.html">Interface Classification</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/filesystem/index.html">FileSystem Specification</a>
</li>
</ul>
<h5>Common</h5>
<ul>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/CLIMiniCluster.html">CLI Mini Cluster</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/FairCallQueue.html">Fair Call Queue</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/NativeLibraries.html">Native Libraries</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/Superusers.html">Proxy User</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/RackAwareness.html">Rack Awareness</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/SecureMode.html">Secure Mode</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/ServiceLevelAuth.html">Service Level Authorization</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/HttpAuthentication.html">HTTP Authentication</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/CredentialProviderAPI.html">Credential Provider API</a>
</li>
<li class="none">
<a href="../../../hadoop-kms/index.html">Hadoop KMS</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/Tracing.html">Tracing</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/UnixShellGuide.html">Unix Shell Guide</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/registry/index.html">Registry</a>
</li>
</ul>
<h5>HDFS</h5>
<ul>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HdfsDesign.html">Architecture</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HdfsUserGuide.html">User Guide</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HDFSCommands.html">Commands Reference</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HDFSHighAvailabilityWithQJM.html">NameNode HA With QJM</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HDFSHighAvailabilityWithNFS.html">NameNode HA With NFS</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/ObserverNameNode.html">Observer NameNode</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/Federation.html">Federation</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/ViewFs.html">ViewFs</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/ViewFsOverloadScheme.html">ViewFsOverloadScheme</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HdfsSnapshots.html">Snapshots</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HdfsEditsViewer.html">Edits Viewer</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HdfsImageViewer.html">Image Viewer</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HdfsPermissionsGuide.html">Permissions and HDFS</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HdfsQuotaAdminGuide.html">Quotas and HDFS</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/LibHdfs.html">libhdfs (C API)</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/WebHDFS.html">WebHDFS (REST API)</a>
</li>
<li class="none">
<a href="../../../hadoop-hdfs-httpfs/index.html">HttpFS</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/ShortCircuitLocalReads.html">Short Circuit Local Reads</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/CentralizedCacheManagement.html">Centralized Cache Management</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HdfsNfsGateway.html">NFS Gateway</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HdfsRollingUpgrade.html">Rolling Upgrade</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/ExtendedAttributes.html">Extended Attributes</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/TransparentEncryption.html">Transparent Encryption</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HdfsMultihoming.html">Multihoming</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/ArchivalStorage.html">Storage Policies</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/MemoryStorage.html">Memory Storage Support</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/SLGUserGuide.html">Synthetic Load Generator</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HDFSErasureCoding.html">Erasure Coding</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HDFSDiskbalancer.html">Disk Balancer</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HdfsUpgradeDomain.html">Upgrade Domain</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HdfsDataNodeAdminGuide.html">DataNode Admin</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs-rbf/HDFSRouterFederation.html">Router Federation</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HdfsProvidedStorage.html">Provided Storage</a>
</li>
</ul>
<h5>MapReduce</h5>
<ul>
<li class="none">
<a href="../../../hadoop-mapreduce-client/hadoop-mapreduce-client-core/MapReduceTutorial.html">Tutorial</a>
</li>
<li class="none">
<a href="../../../hadoop-mapreduce-client/hadoop-mapreduce-client-core/MapredCommands.html">Commands Reference</a>
</li>
<li class="none">
<a href="../../../hadoop-mapreduce-client/hadoop-mapreduce-client-core/MapReduce_Compatibility_Hadoop1_Hadoop2.html">Compatibility with 1.x</a>
</li>
<li class="none">
<a href="../../../hadoop-mapreduce-client/hadoop-mapreduce-client-core/EncryptedShuffle.html">Encrypted Shuffle</a>
</li>
<li class="none">
<a href="../../../hadoop-mapreduce-client/hadoop-mapreduce-client-core/PluggableShuffleAndPluggableSort.html">Pluggable Shuffle/Sort</a>
</li>
<li class="none">
<a href="../../../hadoop-mapreduce-client/hadoop-mapreduce-client-core/DistributedCacheDeploy.html">Distributed Cache Deploy</a>
</li>
<li class="none">
<a href="../../../hadoop-mapreduce-client/hadoop-mapreduce-client-core/SharedCacheSupport.html">Support for YARN Shared Cache</a>
</li>
</ul>
<h5>MapReduce REST APIs</h5>
<ul>
<li class="none">
<a href="../../../hadoop-mapreduce-client/hadoop-mapreduce-client-core/MapredAppMasterRest.html">MR Application Master</a>
</li>
<li class="none">
<a href="../../../hadoop-mapreduce-client/hadoop-mapreduce-client-hs/HistoryServerRest.html">MR History Server</a>
</li>
</ul>
<h5>YARN</h5>
<ul>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/YARN.html">Architecture</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/YarnCommands.html">Commands Reference</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/CapacityScheduler.html">Capacity Scheduler</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/FairScheduler.html">Fair Scheduler</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/ResourceManagerRestart.html">ResourceManager Restart</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/ResourceManagerHA.html">ResourceManager HA</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/ResourceModel.html">Resource Model</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/NodeLabel.html">Node Labels</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/NodeAttributes.html">Node Attributes</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/WebApplicationProxy.html">Web Application Proxy</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/TimelineServer.html">Timeline Server</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/TimelineServiceV2.html">Timeline Service V.2</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/WritingYarnApplications.html">Writing YARN Applications</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/YarnApplicationSecurity.html">YARN Application Security</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/NodeManager.html">NodeManager</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/DockerContainers.html">Running Applications in Docker Containers</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/RuncContainers.html">Running Applications in runC Containers</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/NodeManagerCgroups.html">Using CGroups</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/SecureContainer.html">Secure Containers</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/ReservationSystem.html">Reservation System</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/GracefulDecommission.html">Graceful Decommission</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/OpportunisticContainers.html">Opportunistic Containers</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/Federation.html">YARN Federation</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/SharedCache.html">Shared Cache</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/UsingGpus.html">Using GPU</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/UsingFPGA.html">Using FPGA</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/PlacementConstraints.html">Placement Constraints</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/YarnUI2.html">YARN UI2</a>
</li>
</ul>
<h5>YARN REST APIs</h5>
<ul>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/WebServicesIntro.html">Introduction</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/ResourceManagerRest.html">Resource Manager</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/NodeManagerRest.html">Node Manager</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/TimelineServer.html#Timeline_Server_REST_API_v1">Timeline Server</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/TimelineServiceV2.html#Timeline_Service_v.2_REST_API">Timeline Service V.2</a>
</li>
</ul>
<h5>YARN Service</h5>
<ul>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/yarn-service/Overview.html">Overview</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/yarn-service/QuickStart.html">QuickStart</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/yarn-service/Concepts.html">Concepts</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/yarn-service/YarnServiceAPI.html">Yarn Service API</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/yarn-service/ServiceDiscovery.html">Service Discovery</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/yarn-service/SystemServices.html">System Services</a>
</li>
</ul>
<h5>Hadoop Compatible File Systems</h5>
<ul>
<li class="none">
<a href="../../../hadoop-aliyun/tools/hadoop-aliyun/index.html">Aliyun OSS</a>
</li>
<li class="none">
<a href="../../../hadoop-aws/tools/hadoop-aws/index.html">Amazon S3</a>
</li>
<li class="none">
<a href="../../../hadoop-azure/index.html">Azure Blob Storage</a>
</li>
<li class="none">
<a href="../../../hadoop-azure-datalake/index.html">Azure Data Lake Storage</a>
</li>
<li class="none">
<a href="../../../hadoop-openstack/index.html">OpenStack Swift</a>
</li>
<li class="none">
<a href="../../../hadoop-cos/cloud-storage/index.html">Tencent COS</a>
</li>
</ul>
<h5>Auth</h5>
<ul>
<li class="none">
<a href="../../../hadoop-auth/index.html">Overview</a>
</li>
<li class="none">
<a href="../../../hadoop-auth/Examples.html">Examples</a>
</li>
<li class="none">
<a href="../../../hadoop-auth/Configuration.html">Configuration</a>
</li>
<li class="none">
<a href="../../../hadoop-auth/BuildingIt.html">Building</a>
</li>
</ul>
<h5>Tools</h5>
<ul>
<li class="none">
<a href="../../../hadoop-streaming/HadoopStreaming.html">Hadoop Streaming</a>
</li>
<li class="none">
<a href="../../../hadoop-archives/HadoopArchives.html">Hadoop Archives</a>
</li>
<li class="none">
<a href="../../../hadoop-archive-logs/HadoopArchiveLogs.html">Hadoop Archive Logs</a>
</li>
<li class="none">
<a href="../../../hadoop-distcp/DistCp.html">DistCp</a>
</li>
<li class="none">
<a href="../../../hadoop-gridmix/GridMix.html">GridMix</a>
</li>
<li class="none">
<a href="../../../hadoop-rumen/Rumen.html">Rumen</a>
</li>
<li class="none">
<a href="../../../hadoop-resourceestimator/ResourceEstimator.html">Resource Estimator Service</a>
</li>
<li class="none">
<a href="../../../hadoop-sls/SchedulerLoadSimulator.html">Scheduler Load Simulator</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/Benchmarking.html">Hadoop Benchmarking</a>
</li>
<li class="none">
<a href="../../../hadoop-dynamometer/Dynamometer.html">Dynamometer</a>
</li>
</ul>
<h5>Reference</h5>
<ul>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/release/">Changelog and Release Notes</a>
</li>
<li class="none">
<a href="../../../api/index.html">Java API docs</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/UnixShellAPI.html">Unix Shell API</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/Metrics.html">Metrics</a>
</li>
</ul>
<h5>Configuration</h5>
<ul>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/core-default.xml">core-default.xml</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/hdfs-default.xml">hdfs-default.xml</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs-rbf/hdfs-rbf-default.xml">hdfs-rbf-default.xml</a>
</li>
<li class="none">
<a href="../../../hadoop-mapreduce-client/hadoop-mapreduce-client-core/mapred-default.xml">mapred-default.xml</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-common/yarn-default.xml">yarn-default.xml</a>
</li>
<li class="none">
<a href="../../../hadoop-kms/kms-default.html">kms-default.xml</a>
</li>
<li class="none">
<a href="../../../hadoop-hdfs-httpfs/httpfs-default.html">httpfs-default.xml</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/DeprecatedProperties.html">Deprecated Properties</a>
</li>
</ul>
<a href="http://maven.apache.org/" title="Built by Maven" class="poweredBy">
<img alt="Built by Maven" src="../../images/logos/maven-feather.png"/>
</a>
</div>
</div>
<div id="bodyColumn">
<div id="contentBox">
<!---
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<h1>S3Guard: Consistency and Metadata Caching for S3A</h1>
<ul>
<li><a href="#Overview">Overview</a></li>
<li><a href="#Moving_off_S3Guard">Moving off S3Guard</a></li>
<li><a href="#Setting_up_S3Guard">Setting up S3Guard</a>
<ul>
<li><a href="#S3A_to_warn_or_fail_if_S3Guard_is_disabled">S3A to warn or fail if S3Guard is disabled</a></li>
<li><a href="#a1._Choose_the_Database">1. Choose the Database</a></li>
<li><a href="#a2._Configure_S3Guard_Settings">2. Configure S3Guard Settings</a>
<ul>
<li><a href="#Authoritative_S3Guard"> Authoritative S3Guard</a></li>
<li><a href="#TTL_metadata_expiry">TTL metadata expiry</a></li>
<li><a href="#Fail_on_Error">Fail on Error</a></li></ul></li>
<li><a href="#a3._Configure_the_Metadata_Store.">3. Configure the Metadata Store.</a></li>
<li><a href="#a4._Name_Your_Table">4. Name Your Table</a></li>
<li><a href="#a5._Locate_your_Table">5. Locate your Table</a></li>
<li><a href="#a6._Optional:_Create_your_Table">6. Optional: Create your Table</a></li>
<li><a href="#a7._If_creating_a_table:_Choose_your_billing_mode_.28and_perhaps_I.2FO_Capacity.29">7. If creating a table: Choose your billing mode (and perhaps I/O Capacity)</a></li>
<li><a href="#a8.__If_creating_a_table:_Enable_server_side_encryption_.28SSE.29">8. If creating a table: Enable server side encryption (SSE)</a></li></ul></li>
<li><a href="#Authenticating_with_S3Guard">Authenticating with S3Guard</a></li>
<li><a href="#Per-bucket_S3Guard_configuration">Per-bucket S3Guard configuration</a>
<ul>
<li><a href="#Out-of-band_operations_with_S3Guard">Out-of-band operations with S3Guard</a></li></ul></li>
<li><a href="#S3Guard_Command_Line_Interface_.28CLI.29">S3Guard Command Line Interface (CLI)</a>
<ul>
<li><a href="#Create_a_table:_s3guard_init">Create a table: s3guard init</a></li>
<li><a href="#Import_a_bucket:_s3guard_import">Import a bucket: s3guard import</a></li>
<li><a href="#Compare_a_S3Guard_table_and_the_S3_Store:_s3guard_diff">Compare a S3Guard table and the S3 Store: s3guard diff</a></li>
<li><a href="#Display_information_about_a_bucket.2C_s3guard_bucket-info">Display information about a bucket, s3guard bucket-info</a></li>
<li><a href="#List_or_Delete_Leftover_Multipart_Uploads:_s3guard_uploads">List or Delete Leftover Multipart Uploads: s3guard uploads</a></li>
<li><a href="#Delete_a_table:_s3guard_destroy">Delete a table: s3guard destroy</a></li>
<li><a href="#Clean_up_a_table.2C_s3guard_prune">Clean up a table, s3guard prune</a></li>
<li><a href="#Audit_the_.22authoritative_state_of_a_DynamoDB_Table.2C_s3guard_authoritative">Audit the &quot;authoritative state of a DynamoDB Table, s3guard authoritative</a></li>
<li><a href="#Tune_the_I.2FO_capacity_of_the_DynamoDB_Table.2C_s3guard_set-capacity">Tune the I/O capacity of the DynamoDB Table, s3guard set-capacity</a></li>
<li><a href="#Check_the_consistency_of_the_metadata_store.2C_s3guard_fsck">Check the consistency of the metadata store, s3guard fsck</a></li></ul></li>
<li><a href="#Debugging_and_Error_Handling">Debugging and Error Handling</a>
<ul>
<li><a href="#Enabling_a_log_message_whenever_S3Guard_is_disabled">Enabling a log message whenever S3Guard is disabled</a></li>
<li><a href="#Failure_Semantics">Failure Semantics</a></li>
<li><a href="#Versioning">Versioning</a></li></ul></li>
<li><a href="#Security">Security</a></li>
<li><a href="#Managing_DynamoDB_I.2FO_Capacity">Managing DynamoDB I/O Capacity</a>
<ul>
<li><a href="#On-Demand_Dynamo_Capacity"> On-Demand Dynamo Capacity</a></li>
<li><a href="#Autoscaling_.28Provisioned_Capacity.29_S3Guard_tables."> Autoscaling (Provisioned Capacity) S3Guard tables.</a></li></ul></li>
<li><a href="#Read-After-Overwrite_Consistency">Read-After-Overwrite Consistency</a>
<ul>
<li><a href="#No_Versioning_Metadata_Available">No Versioning Metadata Available</a></li>
<li><a href="#Known_Limitations">Known Limitations</a>
<ul>
<li><a href="#S3_Select">S3 Select</a></li>
<li><a href="#Rename">Rename</a></li>
<li><a href="#Out_of_Sync_Metadata">Out of Sync Metadata</a></li></ul></li></ul></li>
<li><a href="#Troubleshooting">Troubleshooting</a>
<ul>
<li><a href="#Error:_S3Guard_table_lacks_version_marker.">Error: S3Guard table lacks version marker.</a></li>
<li><a href="#Error:_Database_table_is_from_an_incompatible_S3Guard_version">Error: Database table is from an incompatible S3Guard version</a></li>
<li><a href="#Error_.22DynamoDB_table_TABLE_does_not_exist_in_region_REGION.3B_auto-creation_is_turned_off.22">Error &quot;DynamoDB table TABLE does not exist in region REGION; auto-creation is turned off&quot;</a></li>
<li><a href="#Error_.22The_level_of_configured_provisioned_throughput_for_the_table_was_exceeded.22">Error &quot;The level of configured provisioned throughput for the table was exceeded&quot;</a></li>
<li><a href="#Error_Max_retries_exceeded">Error Max retries exceeded</a></li>
<li><a href="#Error_when_running_set-capacity:_org.apache.hadoop.fs.s3a.AWSServiceThrottledException:_ProvisionTable">Error when running set-capacity: org.apache.hadoop.fs.s3a.AWSServiceThrottledException: ProvisionTable</a></li>
<li><a href="#Error_Invalid_region_specified">Error Invalid region specified</a></li>
<li><a href="#a.E2.80.9CNeither_ReadCapacityUnits_nor_WriteCapacityUnits_can_be_specified_when_BillingMode_is_PAY_PER_REQUEST.E2.80.9D">&#x201c;Neither ReadCapacityUnits nor WriteCapacityUnits can be specified when BillingMode is PAY_PER_REQUEST&#x201d;</a></li>
<li><a href="#MetadataPersistenceException">MetadataPersistenceException</a></li>
<li><a href="#Error_RemoteFileChangedException">Error RemoteFileChangedException</a></li>
<li><a href="#Error_AWSClientIOException:_copyFile_caused_by_NullPointerException">Error AWSClientIOException: copyFile caused by NullPointerException</a></li>
<li><a href="#Error_Attempt_to_change_a_resource_which_is_still_in_use:_Table_is_being_deleted">Error Attempt to change a resource which is still in use: Table is being deleted</a></li></ul></li>
<li><a href="#Other_Topics">Other Topics</a></li></ul>
<div class="section">
<h2><a name="Overview"></a>Overview</h2>
<p><i>S3Guard</i> is a feature for the S3A client of the S3 object store, which can use a (consistent) database as the store of metadata about objects in an S3 bucket.</p>
<p>It was written been 2016 and 2020, <i>when Amazon S3 was eventually consistent.</i> It compensated for the following S3 inconsistencies: * Newly created objects excluded from directory listings. * Newly deleted objects retained in directory listings. * Deleted objects still visible in existence probes and opening for reading. * S3 Load balancer 404 caching when a probe is made for an object before its creation.</p>
<p>It did not compensate for update inconsistency, though by storing the etag values of objects in the database, it could detect and report problems.</p>
<p>Now that S3 is consistent, there is no need for S3Guard at all.</p>
<p>S3Guard</p>
<ol style="list-style-type: decimal">
<li>
<p>Permitted a consistent view of the object store.</p>
</li>
<li>
<p>Could improve performance on directory listing/scanning operations. including those which take place during the partitioning period of query execution, the process where files are listed and the work divided up amongst processes.</p>
</li>
</ol>
<p>The basic idea was that, for each operation in the Hadoop S3 client (s3a) that reads or modifies metadata, a shadow copy of that metadata is stored in a separate MetadataStore implementation. The store was 1. Updated after mutating operations on the store 1. Updated after list operations against S3 discovered changes 1. Looked up whenever a probe was made for a file/directory existing. 1. Queried for all objects under a path when a directory listing was made; the results were merged with the S3 listing in a non-authoritative path, used exclusively in authoritative mode.</p>
<p>For links to early design documents and related patches, see <a class="externalLink" href="https://issues.apache.org/jira/browse/HADOOP-13345">HADOOP-13345</a>.</p>
<p><i>Important</i></p>
<ul>
<li>While all underlying data is persisted in S3, if, for some reason, the S3Guard-cached metadata becomes inconsistent with that in S3, queries on the data may become incorrect. For example, new datasets may be omitted, objects may be overwritten, or clients may not be aware that some data has been deleted. It is essential for all clients writing to an S3Guard-enabled S3 Repository to use the feature. Clients reading the data may work directly with the S3A data, in which case the normal S3 consistency guarantees apply.</li>
</ul></div>
<div class="section">
<h2><a name="Moving_off_S3Guard"></a>Moving off S3Guard</h2>
<p>How to move off S3Guard, given it is no longer needed.</p>
<ol style="list-style-type: decimal">
<li>Unset the option <tt>fs.s3a.metadatastore.impl</tt> globally/for all buckets for which it was selected.</li>
<li>If the option <tt>org.apache.hadoop.fs.s3a.s3guard.disabled.warn.level</tt> has been changed from the default (<tt>SILENT</tt>), change it back. You no longer need to be warned that S3Guard is disabled.</li>
<li>Restart all applications.</li>
</ol>
<p>Once you are confident that all applications have been restarted, <i>Delete the DynamoDB table</i>. This is to avoid paying for a database you no longer need. This is best done from the AWS GUI.</p></div>
<div class="section">
<h2><a name="Setting_up_S3Guard"></a>Setting up S3Guard</h2>
<div class="section">
<h3><a name="S3A_to_warn_or_fail_if_S3Guard_is_disabled"></a>S3A to warn or fail if S3Guard is disabled</h3>
<p>A seemingly recurrent problem with S3Guard is that people think S3Guard is turned on but it isn&#x2019;t. You can set <tt>org.apache.hadoop.fs.s3a.s3guard.disabled.warn.level</tt> to avoid this. The property sets what to do when an S3A FS is instantiated without S3Guard. The following values are available:</p>
<ul>
<li><tt>SILENT</tt>: Do nothing.</li>
<li><tt>INFORM</tt>: Log at info level that FS is instantiated without S3Guard.</li>
<li><tt>WARN</tt>: Warn that data may be at risk in workflows.</li>
<li><tt>FAIL</tt>: S3AFileSystem instantiation will fail.</li>
</ul>
<p>The default setting is <tt>SILENT</tt>. The setting is case insensitive. The required level can be set in the <tt>core-site.xml</tt>.</p><hr />
<p>The latest configuration parameters are defined in <tt>core-default.xml</tt>. You should consult that file for full information, but a summary is provided here.</p></div>
<div class="section">
<h3><a name="a1._Choose_the_Database"></a>1. Choose the Database</h3>
<p>A core concept of S3Guard is that the directory listing data of the object store, <i>the metadata</i> is replicated in a higher-performance, consistent, database. In S3Guard, this database is called <i>The Metadata Store</i></p>
<p>By default, S3Guard is not enabled.</p>
<p>The Metadata Store to use in production is bonded to Amazon&#x2019;s DynamoDB database service. The following setting will enable this Metadata Store:</p>
<div>
<div>
<pre class="source">&lt;property&gt;
&lt;name&gt;fs.s3a.metadatastore.impl&lt;/name&gt;
&lt;value&gt;org.apache.hadoop.fs.s3a.s3guard.DynamoDBMetadataStore&lt;/value&gt;
&lt;/property&gt;
</pre></div></div>
<p>Note that the <tt>NullMetadataStore</tt> store can be explicitly requested if desired. This offers no metadata storage, and effectively disables S3Guard.</p>
<div>
<div>
<pre class="source">&lt;property&gt;
&lt;name&gt;fs.s3a.metadatastore.impl&lt;/name&gt;
&lt;value&gt;org.apache.hadoop.fs.s3a.s3guard.NullMetadataStore&lt;/value&gt;
&lt;/property&gt;
</pre></div></div>
</div>
<div class="section">
<h3><a name="a2._Configure_S3Guard_Settings"></a>2. Configure S3Guard Settings</h3>
<p>More settings will may be added in the future. Currently the only Metadata Store-independent setting, besides the implementation class above, are the <i>allow authoritative</i> and <i>fail-on-error</i> flags.</p>
<div class="section">
<h4><a name="Authoritative_S3Guard"></a><a name="authoritative"></a> Authoritative S3Guard</h4>
<p>Authoritative S3Guard is a complicated configuration which delivers performance at the expense of being unsafe for other applications to use the same directory tree/bucket unless configured consistently.</p>
<p>It can also be used to support <a href="directory_markers.html">directory marker retention</a> in higher-performance but non-backwards-compatible modes.</p>
<p>Most deployments do not use this setting -it is ony used in deployments where specific parts of a bucket (e.g. Apache Hive managed tables) are known to have exclusive access by a single application (Hive) and other tools/applications from exactly the same Hadoop release.</p>
<p>The <i>authoritative</i> expression in S3Guard is present in two different layers, for two different reasons:</p>
<ul>
<li>
<p>Authoritative S3Guard</p>
<ul>
<li>S3Guard can be set as authoritative, which means that an S3A client will avoid round-trips to S3 when <b>getting file metadata</b>, and <b>getting directory listings</b> if there is a fully cached version of the directory stored in metadata store.</li>
<li>This mode can be set as a configuration property <tt>fs.s3a.metadatastore.authoritative</tt></li>
<li>It can also be set only on specific directories by setting <tt>fs.s3a.authoritative.path</tt> to one or more prefixes, for example <tt>s3a://bucket/path</tt> or &#x201c;/auth1,/auth2&#x201d;.</li>
<li>All interactions with the S3 bucket(s) must be through S3A clients sharing the same metadata store.</li>
<li>This is independent from which metadata store implementation is used.</li>
<li>In authoritative mode the metadata TTL metadata expiry is not effective. This means that the metadata entries won&#x2019;t expire on authoritative paths.</li>
</ul>
</li>
<li>
<p>Authoritative directory listings (isAuthoritative bit)</p>
<ul>
<li>Tells if the stored directory listing metadata is complete.</li>
<li>This is set by the FileSystem client (e.g. s3a) via the <tt>DirListingMetadata</tt> class (<tt>org.apache.hadoop.fs.s3a.s3guard.DirListingMetadata</tt>). (The MetadataStore only knows what the FS client tells it.)</li>
<li>If set to <tt>TRUE</tt>, we know that the directory listing (<tt>DirListingMetadata</tt>) is full, and complete.</li>
<li>If set to <tt>FALSE</tt> the listing may not be complete.</li>
<li>Metadata store may persist the isAuthoritative bit on the metadata store.</li>
<li>Currently <tt>org.apache.hadoop.fs.s3a.s3guard.LocalMetadataStore</tt> and <tt>org.apache.hadoop.fs.s3a.s3guard.DynamoDBMetadataStore</tt> implementation supports authoritative bit.</li>
</ul>
</li>
</ul>
<p>More on Authoritative S3Guard:</p>
<ul>
<li>This setting is about treating the MetadataStore (e.g. dynamodb) as the source of truth in general, and also to short-circuit S3 list objects and serve listings from the MetadataStore in some circumstances.</li>
<li>For S3A to skip S3&#x2019;s get object metadata, and serve it directly from the MetadataStore, the following things must all be true:
<ol style="list-style-type: decimal">
<li>The S3A client is configured to allow MetadataStore to be authoritative source of a file metadata (<tt>fs.s3a.metadatastore.authoritative=true</tt>).</li>
<li>The MetadataStore has the file metadata for the path stored in it.</li>
</ol>
</li>
<li>For S3A to skip S3&#x2019;s list objects on some path, and serve it directly from the MetadataStore, the following things must all be true:
<ol style="list-style-type: decimal">
<li>The MetadataStore implementation persists the bit <tt>DirListingMetadata.isAuthorititative</tt> set when calling <tt>MetadataStore#put</tt> (<tt>DirListingMetadata</tt>)</li>
<li>The S3A client is configured to allow MetadataStore to be authoritative source of a directory listing (<tt>fs.s3a.metadatastore.authoritative=true</tt>).</li>
<li>The MetadataStore has a <b>full listing for path</b> stored in it. This only happens if the FS client (s3a) explicitly has stored a full directory listing with <tt>DirListingMetadata.isAuthorititative=true</tt> before the said listing request happens.</li>
</ol>
</li>
</ul>
<p>This configuration only enables authoritative mode in the client layer. It is recommended that you leave the default setting here:</p>
<div>
<div>
<pre class="source">&lt;property&gt;
&lt;name&gt;fs.s3a.metadatastore.authoritative&lt;/name&gt;
&lt;value&gt;false&lt;/value&gt;
&lt;/property&gt;
</pre></div></div>
<p>Note that a MetadataStore MAY persist this bit in the directory listings. (Not MUST).</p>
<p>Note that if this is set to true, it may exacerbate or persist existing race conditions around multiple concurrent modifications and listings of a given directory tree.</p>
<p>In particular: <b>If the Metadata Store is declared as authoritative, all interactions with the S3 bucket(s) must be through S3A clients sharing the same Metadata Store</b></p></div>
<div class="section">
<h4><a name="TTL_metadata_expiry"></a>TTL metadata expiry</h4>
<p>It can be configured how long an entry is valid in the MetadataStore <b>if the authoritative mode is turned off</b>, or the path is not configured to be authoritative. If <tt>((lastUpdated + ttl) &lt;= now)</tt> is false for an entry, the entry will be expired, so the S3 bucket will be queried for fresh metadata. The time for expiry of metadata can be set as the following:</p>
<div>
<div>
<pre class="source">&lt;property&gt;
&lt;name&gt;fs.s3a.metadatastore.metadata.ttl&lt;/name&gt;
&lt;value&gt;15m&lt;/value&gt;
&lt;/property&gt;
</pre></div></div>
</div>
<div class="section">
<h4><a name="Fail_on_Error"></a>Fail on Error</h4>
<p>By default, S3AFileSystem write operations will fail when updates to S3Guard metadata fail. S3AFileSystem first writes the file to S3 and then updates the metadata in S3Guard. If the metadata write fails, <tt>MetadataPersistenceException</tt> is thrown. The file in S3 <b>is not</b> rolled back.</p>
<p>If the write operation cannot be programmatically retried, the S3Guard metadata for the given file can be corrected with a command like the following:</p>
<div>
<div>
<pre class="source">hadoop s3guard import [-meta URI] s3a://my-bucket/file-with-bad-metadata
</pre></div></div>
<p>Programmatic retries of the original operation would require overwrite=true. Suppose the original operation was <tt>FileSystem.create(myFile, overwrite=false)</tt>. If this operation failed with <tt>MetadataPersistenceException</tt> a repeat of the same operation would result in <tt>FileAlreadyExistsException</tt> since the original operation successfully created the file in S3 and only failed in writing the metadata to S3Guard.</p>
<p>Metadata update failures can be downgraded to ERROR logging instead of exception by setting the following configuration:</p>
<div>
<div>
<pre class="source">&lt;property&gt;
&lt;name&gt;fs.s3a.metadatastore.fail.on.write.error&lt;/name&gt;
&lt;value&gt;false&lt;/value&gt;
&lt;/property&gt;
</pre></div></div>
<p>Setting this false is dangerous as it could result in the type of issue S3Guard is designed to avoid. For example, a reader may see an inconsistent listing after a recent write since S3Guard may not contain metadata about the recently written file due to a metadata write error.</p>
<p>As with the default setting, the new/updated file is still in S3 and <b>is not</b> rolled back. The S3Guard metadata is likely to be out of sync.</p></div></div>
<div class="section">
<h3><a name="a3._Configure_the_Metadata_Store."></a>3. Configure the Metadata Store.</h3>
<p>Here are the <tt>DynamoDBMetadataStore</tt> settings. Other Metadata Store implementations will have their own configuration parameters.</p></div>
<div class="section">
<h3><a name="a4._Name_Your_Table"></a>4. Name Your Table</h3>
<p>First, choose the name of the table you wish to use for the S3Guard metadata storage in your DynamoDB instance. If you leave it unset/empty, a separate table will be created for each S3 bucket you access, and that bucket&#x2019;s name will be used for the name of the DynamoDB table. For example, this sets the table name to <tt>my-ddb-table-name</tt></p>
<div>
<div>
<pre class="source">&lt;property&gt;
&lt;name&gt;fs.s3a.s3guard.ddb.table&lt;/name&gt;
&lt;value&gt;my-ddb-table-name&lt;/value&gt;
&lt;description&gt;
The DynamoDB table name to operate. Without this property, the respective
S3 bucket names will be used.
&lt;/description&gt;
&lt;/property&gt;
</pre></div></div>
<p>It is good to share a table across multiple buckets for multiple reasons, especially if you are <i>not</i> using on-demand DynamoDB tables, and instead prepaying for provisioned I/O capacity.</p>
<ol style="list-style-type: decimal">
<li>
<p>You are billed for the provisioned I/O capacity allocated to the table, <i>even when the table is not used</i>. Sharing capacity can reduce costs.</p>
</li>
<li>
<p>You can share the &#x201c;provision burden&#x201d; across the buckets. That is, rather than allocating for the peak load on a single bucket, you can allocate for the peak load <i>across all the buckets</i>, which is likely to be significantly lower.</p>
</li>
<li>
<p>It&#x2019;s easier to measure and tune the load requirements and cost of S3Guard, because there is only one table to review and configure in the AWS management console.</p>
</li>
<li>
<p>When you don&#x2019;t grant the permission to create DynamoDB tables to users. A single pre-created table for all buckets avoids the needs for an administrator to create one for every bucket.</p>
</li>
</ol>
<p>When wouldn&#x2019;t you want to share a table?</p>
<ol style="list-style-type: decimal">
<li>When you are using on-demand DynamoDB and want to keep each table isolated.</li>
<li>When you do explicitly want to provision I/O capacity to a specific bucket and table, isolated from others.</li>
<li>
<p>When you are using separate billing for specific buckets allocated to specific projects.</p>
</li>
<li>
<p>When different users/roles have different access rights to different buckets. As S3Guard requires all users to have R/W access to the table, all users will be able to list the metadata in all buckets, even those to which they lack read access.</p>
</li>
</ol></div>
<div class="section">
<h3><a name="a5._Locate_your_Table"></a>5. Locate your Table</h3>
<p>You may also wish to specify the region to use for DynamoDB. If a region is not configured, S3A will assume that it is in the same region as the S3 bucket. A list of regions for the DynamoDB service can be found in <a class="externalLink" href="http://docs.aws.amazon.com/general/latest/gr/rande.html#ddb_region">Amazon&#x2019;s documentation</a>. In this example, to use the US West 2 region:</p>
<div>
<div>
<pre class="source">&lt;property&gt;
&lt;name&gt;fs.s3a.s3guard.ddb.region&lt;/name&gt;
&lt;value&gt;us-west-2&lt;/value&gt;
&lt;/property&gt;
</pre></div></div>
<p>When working with S3Guard-managed buckets from EC2 VMs running in AWS infrastructure, using a local DynamoDB region ensures the lowest latency and highest reliability, as well as avoiding all long-haul network charges. The S3Guard tables, and indeed, the S3 buckets, should all be in the same region as the VMs.</p></div>
<div class="section">
<h3><a name="a6._Optional:_Create_your_Table"></a>6. Optional: Create your Table</h3>
<p>Next, you can choose whether or not the table will be automatically created (if it doesn&#x2019;t already exist). If you want this feature, set the <tt>fs.s3a.s3guard.ddb.table.create</tt> option to <tt>true</tt>.</p>
<div>
<div>
<pre class="source">&lt;property&gt;
&lt;name&gt;fs.s3a.s3guard.ddb.table.create&lt;/name&gt;
&lt;value&gt;true&lt;/value&gt;
&lt;description&gt;
If true, the S3A client will create the table if it does not already exist.
&lt;/description&gt;
&lt;/property&gt;
</pre></div></div>
</div>
<div class="section">
<h3><a name="a7._If_creating_a_table:_Choose_your_billing_mode_.28and_perhaps_I.2FO_Capacity.29"></a>7. If creating a table: Choose your billing mode (and perhaps I/O Capacity)</h3>
<p>Next, you need to decide whether to use On-Demand DynamoDB and its pay-per-request billing (recommended), or to explicitly request a provisioned IO capacity.</p>
<p>Before AWS offered pay-per-request billing, the sole billing mechanism, was &#x201c;provisioned capacity&#x201d;. This mechanism requires you to choose the DynamoDB read and write throughput requirements you expect to need for your expected uses of the S3Guard table. Setting higher values cost you more money -<i>even when the table was idle</i> <i>Note</i> that these settings only affect table creation when <tt>fs.s3a.s3guard.ddb.table.create</tt> is enabled. To change the throughput for an existing table, use the AWS console or CLI tool.</p>
<p>For more details on DynamoDB capacity units, see the AWS page on <a class="externalLink" href="http://docs.aws.amazon.com/amazondynamodb/latest/developerguide/WorkingWithTables.html#CapacityUnitCalculations">Capacity Unit Calculations</a>.</p>
<p>Provisioned IO capacity is billed per hour for the life of the table, <i>even when the table and the underlying S3 buckets are not being used</i>.</p>
<p>There are also charges incurred for data storage and for data I/O outside of the region of the DynamoDB instance. S3Guard only stores metadata in DynamoDB: path names and summary details of objects &#x2014;the actual data is stored in S3, so billed at S3 rates.</p>
<p>With provisioned I/O capacity, attempting to perform more I/O than the capacity requested throttles the operation and may result in operations failing. Larger I/O capacities cost more.</p>
<p>With the introduction of On-Demand DynamoDB, you can now avoid paying for provisioned capacity by creating an on-demand table. With an on-demand table you are not throttled if your DynamoDB requests exceed any pre-provisioned limit, nor do you pay per hour even when a table is idle.</p>
<p>You do, however, pay more per DynamoDB operation. Even so, the ability to cope with sudden bursts of read or write requests, combined with the elimination of charges for idle tables, suit the use patterns made of S3Guard tables by applications interacting with S3. That is: periods when the table is rarely used, with intermittent high-load operations when directory trees are scanned (query planning and similar), or updated (rename and delete operations).</p>
<p>We recommending using On-Demand DynamoDB for maximum performance in operations such as query planning, and lowest cost when S3 buckets are not being accessed.</p>
<p>This is the default, as configured in the default configuration options.</p>
<div>
<div>
<pre class="source">&lt;property&gt;
&lt;name&gt;fs.s3a.s3guard.ddb.table.capacity.read&lt;/name&gt;
&lt;value&gt;0&lt;/value&gt;
&lt;description&gt;
Provisioned throughput requirements for read operations in terms of capacity
units for the DynamoDB table. This config value will only be used when
creating a new DynamoDB table.
If set to 0 (the default), new tables are created with &quot;per-request&quot; capacity.
If a positive integer is provided for this and the write capacity, then
a table with &quot;provisioned capacity&quot; will be created.
You can change the capacity of an existing provisioned-capacity table
through the &quot;s3guard set-capacity&quot; command.
&lt;/description&gt;
&lt;/property&gt;
&lt;property&gt;
&lt;name&gt;fs.s3a.s3guard.ddb.table.capacity.write&lt;/name&gt;
&lt;value&gt;0&lt;/value&gt;
&lt;description&gt;
Provisioned throughput requirements for write operations in terms of
capacity units for the DynamoDB table.
If set to 0 (the default), new tables are created with &quot;per-request&quot; capacity.
Refer to related configuration option fs.s3a.s3guard.ddb.table.capacity.read
&lt;/description&gt;
&lt;/property&gt;
</pre></div></div>
</div>
<div class="section">
<h3><a name="a8.__If_creating_a_table:_Enable_server_side_encryption_.28SSE.29"></a>8. If creating a table: Enable server side encryption (SSE)</h3>
<p>Encryption at rest can help you protect sensitive data in your DynamoDB table. When creating a new table, you can set server side encryption on the table using the default AWS owned customer master key (CMK), AWS managed CMK, or customer managed CMK. S3Guard code accessing the table is all the same whether SSE is enabled or not. For more details on DynamoDB table server side encryption, see the AWS page on <a class="externalLink" href="https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/encryption.howitworks.html">Encryption at Rest: How It Works</a>.</p>
<p>These are the default configuration options, as configured in <tt>core-default.xml</tt>.</p>
<div>
<div>
<pre class="source">&lt;property&gt;
&lt;name&gt;fs.s3a.s3guard.ddb.table.sse.enabled&lt;/name&gt;
&lt;value&gt;false&lt;/value&gt;
&lt;description&gt;
Whether server-side encryption (SSE) is enabled or disabled on the table.
By default it's disabled, meaning SSE is set to AWS owned CMK.
&lt;/description&gt;
&lt;/property&gt;
&lt;property&gt;
&lt;name&gt;fs.s3a.s3guard.ddb.table.sse.cmk&lt;/name&gt;
&lt;value/&gt;
&lt;description&gt;
The KMS Customer Master Key (CMK) used for the KMS encryption on the table.
To specify a CMK, this config value can be its key ID, Amazon Resource Name
(ARN), alias name, or alias ARN. Users only need to provide this config if
the key is different from the default DynamoDB KMS Master Key, which is
alias/aws/dynamodb.
&lt;/description&gt;
&lt;/property&gt;
</pre></div></div>
</div></div>
<div class="section">
<h2><a name="Authenticating_with_S3Guard"></a>Authenticating with S3Guard</h2>
<p>The DynamoDB metadata store takes advantage of the fact that the DynamoDB service uses the same authentication mechanisms as S3. S3Guard gets all its credentials from the S3A client that is using it.</p>
<p>All existing S3 authentication mechanisms can be used.</p></div>
<div class="section">
<h2><a name="Per-bucket_S3Guard_configuration"></a>Per-bucket S3Guard configuration</h2>
<p>In production, it is likely only some buckets will have S3Guard enabled; those which are read-only may have disabled, for example. Equally importantly, buckets in different regions should have different tables, each in the relevant region.</p>
<p>These options can be managed through S3A&#x2019;s <a href="./index.html#Configuring_different_S3_buckets">per-bucket configuration mechanism</a>. All options with the under <tt>fs.s3a.bucket.BUCKETNAME.KEY</tt> are propagated to the options <tt>fs.s3a.KEY</tt> <i>for that bucket only</i>.</p>
<p>As an example, here is a configuration to use different metadata stores and tables for different buckets</p>
<p>First, we define shortcuts for the metadata store classnames:</p>
<div>
<div>
<pre class="source">&lt;property&gt;
&lt;name&gt;s3guard.null&lt;/name&gt;
&lt;value&gt;org.apache.hadoop.fs.s3a.s3guard.NullMetadataStore&lt;/value&gt;
&lt;/property&gt;
&lt;property&gt;
&lt;name&gt;s3guard.dynamo&lt;/name&gt;
&lt;value&gt;org.apache.hadoop.fs.s3a.s3guard.DynamoDBMetadataStore&lt;/value&gt;
&lt;/property&gt;
</pre></div></div>
<p>Next, Amazon&#x2019;s public landsat database is configured with no metadata store:</p>
<div>
<div>
<pre class="source">&lt;property&gt;
&lt;name&gt;fs.s3a.bucket.landsat-pds.metadatastore.impl&lt;/name&gt;
&lt;value&gt;${s3guard.null}&lt;/value&gt;
&lt;description&gt;The read-only landsat-pds repository isn't
managed by S3Guard&lt;/description&gt;
&lt;/property&gt;
</pre></div></div>
<p>Next the <tt>ireland-2</tt> and <tt>ireland-offline</tt> buckets are configured with DynamoDB as the store, and a shared table <tt>production-table</tt>:</p>
<div>
<div>
<pre class="source">&lt;property&gt;
&lt;name&gt;fs.s3a.bucket.ireland-2.metadatastore.impl&lt;/name&gt;
&lt;value&gt;${s3guard.dynamo}&lt;/value&gt;
&lt;/property&gt;
&lt;property&gt;
&lt;name&gt;fs.s3a.bucket.ireland-offline.metadatastore.impl&lt;/name&gt;
&lt;value&gt;${s3guard.dynamo}&lt;/value&gt;
&lt;/property&gt;
&lt;property&gt;
&lt;name&gt;fs.s3a.bucket.ireland-2.s3guard.ddb.table&lt;/name&gt;
&lt;value&gt;production-table&lt;/value&gt;
&lt;/property&gt;
</pre></div></div>
<p>The region of this table is automatically set to be that of the buckets, here <tt>eu-west-1</tt>; the same table name may actually be used in different regions.</p>
<p>Together then, this configuration enables the DynamoDB Metadata Store for two buckets with a shared table, while disabling it for the public bucket.</p>
<div class="section">
<h3><a name="Out-of-band_operations_with_S3Guard"></a>Out-of-band operations with S3Guard</h3>
<p>We call an operation out-of-band (OOB) when a bucket is used by a client with S3Guard, and another client runs a write (e.g delete, move, rename, overwrite) operation on an object in the same bucket without S3Guard.</p>
<p>The definition of behaviour in S3AFileSystem/MetadataStore in case of OOBs: * A client with S3Guard * B client without S3Guard (Directly to S3)</p>
<ul>
<li>OOB OVERWRITE, authoritative mode:</li>
<li>A client creates F1 file</li>
<li>B client overwrites F1 file with F2 (Same, or different file size)</li>
<li>A client&#x2019;s getFileStatus returns F1 metadata</li>
<li>
<p>OOB OVERWRITE, NOT authoritative mode:</p>
</li>
<li>A client creates F1 file</li>
<li>B client overwrites F1 file with F2 (Same, or different file size)</li>
<li>A client&#x2019;s getFileStatus returns F2 metadata. In not authoritative mode we check S3 for the file. If the modification time of the file in S3 is greater than in S3Guard, we can safely return the S3 file metadata and update the cache.</li>
<li>
<p>OOB DELETE, authoritative mode:</p>
</li>
<li>A client creates F file</li>
<li>B client deletes F file</li>
<li>A client&#x2019;s getFileStatus returns that the file is still there</li>
<li>
<p>OOB DELETE, NOT authoritative mode:</p>
</li>
<li>A client creates F file</li>
<li>B client deletes F file</li>
<li>A client&#x2019;s getFileStatus returns that the file is still there</li>
</ul>
<p>Note: authoritative and NOT authoritative mode behaves the same at OOB DELETE case.</p>
<p>The behaviour in case of getting directory listings: * File status in metadata store gets updated during the listing the same way as in getFileStatus.</p></div></div>
<div class="section">
<h2><a name="S3Guard_Command_Line_Interface_.28CLI.29"></a>S3Guard Command Line Interface (CLI)</h2>
<p>Note that in some cases an AWS region or <tt>s3a://</tt> URI can be provided.</p>
<p>Metadata store URIs include a scheme that designates the backing store. For example (e.g. <tt>dynamodb://table_name</tt>;). As documented above, the AWS region can be inferred if the URI to an existing bucket is provided.</p>
<p>The S3A URI must also be provided for per-bucket configuration options to be picked up. That is: when an s3a URL is provided on the command line, all its &#x201c;resolved&#x201d; per-bucket settings are used to connect to, authenticate with and configure the S3Guard table. If no such URL is provided, then the base settings are picked up.</p>
<div class="section">
<h3><a name="Create_a_table:_s3guard_init"></a>Create a table: <tt>s3guard init</tt></h3>
<div>
<div>
<pre class="source">hadoop s3guard init -meta URI ( -region REGION | s3a://BUCKET )
</pre></div></div>
<p>Creates and initializes an empty metadata store.</p>
<p>A DynamoDB metadata store can be initialized with additional parameters pertaining to capacity.</p>
<p>If these values are both zero, then an on-demand DynamoDB table is created; if positive values then they set the <a class="externalLink" href="http://docs.aws.amazon.com/amazondynamodb/latest/developerguide/HowItWorks.ProvisionedThroughput.html">Provisioned Throughput</a> of the table.</p>
<div>
<div>
<pre class="source">[-write PROVISIONED_WRITES] [-read PROVISIONED_READS]
</pre></div></div>
<p>Server side encryption (SSE) can be enabled with AWS managed customer master key (CMK), or customer managed CMK. By default the DynamoDB table will be encrypted with AWS owned CMK. To use a customer managed CMK, you can specify its KMS key ID, ARN, alias name, or alias ARN. If not specified, the default AWS managed CMK for DynamoDB &#x201c;alias/aws/dynamodb&#x201d; will be used.</p>
<div>
<div>
<pre class="source">[-sse [-cmk KMS_CMK_ID]]
</pre></div></div>
<p>Tag argument can be added with a key=value list of tags. The table for the metadata store will be created with these tags in DynamoDB.</p>
<div>
<div>
<pre class="source">[-tag key=value;]
</pre></div></div>
<p>Example 1</p>
<div>
<div>
<pre class="source">hadoop s3guard init -meta dynamodb://ireland-team -write 0 -read 0 s3a://ireland-1
</pre></div></div>
<p>Creates an on-demand table &#x201c;ireland-team&#x201d;, in the same location as the S3 bucket &#x201c;ireland-1&#x201d;.</p>
<p>Example 2</p>
<div>
<div>
<pre class="source">hadoop s3guard init -meta dynamodb://ireland-team -region eu-west-1 --read 0 --write 0
</pre></div></div>
<p>Creates a table &#x201c;ireland-team&#x201d; in the region &#x201c;eu-west-1.amazonaws.com&#x201d;</p>
<p>Example 3</p>
<div>
<div>
<pre class="source">hadoop s3guard init -meta dynamodb://ireland-team -tag tag1=first;tag2=second;
</pre></div></div>
<p>Creates a table &#x201c;ireland-team&#x201d; with tags &#x201c;first&#x201d; and &#x201c;second&#x201d;. The read and write capacity will be those of the site configuration&#x2019;s values of <tt>fs.s3a.s3guard.ddb.table.capacity.read</tt> and <tt>fs.s3a.s3guard.ddb.table.capacity.write</tt>; if these are both zero then it will be an on-demand table.</p>
<p>Example 4</p>
<div>
<div>
<pre class="source">hadoop s3guard init -meta dynamodb://ireland-team -sse
</pre></div></div>
<p>Creates a table &#x201c;ireland-team&#x201d; with server side encryption enabled. The CMK will be using the default AWS managed &#x201c;alias/aws/dynamodb&#x201d;.</p></div>
<div class="section">
<h3><a name="Import_a_bucket:_s3guard_import"></a>Import a bucket: <tt>s3guard import</tt></h3>
<div>
<div>
<pre class="source">hadoop s3guard import [-meta URI] [-authoritative] [-verbose] s3a://PATH
</pre></div></div>
<p>Pre-populates a metadata store according to the current contents of an S3 bucket/path. If the <tt>-meta</tt> option is omitted, the binding information is taken from the <tt>core-site.xml</tt> configuration.</p>
<p>Usage</p>
<div>
<div>
<pre class="source">hadoop s3guard import
import [OPTIONS] [s3a://PATH]
import metadata from existing S3 data
Common options:
-authoritative - Mark imported directory data as authoritative.
-verbose - Verbose Output.
-meta URL - Metadata repository details (implementation-specific)
Amazon DynamoDB-specific options:
-region REGION - Service region for connections
URLs for Amazon DynamoDB are of the form dynamodb://TABLE_NAME.
Specifying both the -region option and an S3A path
is not supported.
</pre></div></div>
<p>Example</p>
<p>Import all files and directories in a bucket into the S3Guard table.</p>
<div>
<div>
<pre class="source">hadoop s3guard import s3a://ireland-1
</pre></div></div>
<p>Import a directory tree, marking directories as authoritative.</p>
<div>
<div>
<pre class="source">hadoop s3guard import -authoritative -verbose s3a://ireland-1/fork-0008
2020-01-03 12:05:18,321 [main] INFO - Metadata store DynamoDBMetadataStore{region=eu-west-1,
tableName=s3guard-metadata, tableArn=arn:aws:dynamodb:eu-west-1:980678866538:table/s3guard-metadata} is initialized.
2020-01-03 12:05:18,324 [main] INFO - Starting: Importing s3a://ireland-1/fork-0008
2020-01-03 12:05:18,324 [main] INFO - Importing directory s3a://ireland-1/fork-0008
2020-01-03 12:05:18,537 [main] INFO - Dir s3a://ireland-1/fork-0008/test/doTestListFiles-0-0-0-false
2020-01-03 12:05:18,630 [main] INFO - Dir s3a://ireland-1/fork-0008/test/doTestListFiles-0-0-0-true
2020-01-03 12:05:19,142 [main] INFO - Dir s3a://ireland-1/fork-0008/test/doTestListFiles-2-0-0-false/dir-0
2020-01-03 12:05:19,191 [main] INFO - Dir s3a://ireland-1/fork-0008/test/doTestListFiles-2-0-0-false/dir-1
2020-01-03 12:05:19,240 [main] INFO - Dir s3a://ireland-1/fork-0008/test/doTestListFiles-2-0-0-true/dir-0
2020-01-03 12:05:19,289 [main] INFO - Dir s3a://ireland-1/fork-0008/test/doTestListFiles-2-0-0-true/dir-1
2020-01-03 12:05:19,314 [main] INFO - Updated S3Guard with 0 files and 6 directory entries
2020-01-03 12:05:19,315 [main] INFO - Marking directory tree s3a://ireland-1/fork-0008 as authoritative
2020-01-03 12:05:19,342 [main] INFO - Importing s3a://ireland-1/fork-0008: duration 0:01.018s
Inserted 6 items into Metadata Store
</pre></div></div>
</div>
<div class="section">
<h3><a name="Compare_a_S3Guard_table_and_the_S3_Store:_s3guard_diff"></a>Compare a S3Guard table and the S3 Store: <tt>s3guard diff</tt></h3>
<div>
<div>
<pre class="source">hadoop s3guard diff [-meta URI] s3a://BUCKET
</pre></div></div>
<p>Lists discrepancies between a metadata store and bucket. Note that depending on how S3Guard is used, certain discrepancies are to be expected.</p>
<p>Example</p>
<div>
<div>
<pre class="source">hadoop s3guard diff s3a://ireland-1
</pre></div></div>
</div>
<div class="section">
<h3><a name="Display_information_about_a_bucket.2C_s3guard_bucket-info"></a>Display information about a bucket, <tt>s3guard bucket-info</tt></h3>
<p>Prints and optionally checks the S3Guard and encryption status of a bucket.</p>
<div>
<div>
<pre class="source">hadoop s3guard bucket-info [-guarded] [-unguarded] [-auth] [-nonauth] [-magic] [-encryption ENCRYPTION] s3a://BUCKET
</pre></div></div>
<p>Options</p>
<table border="0" class="bodyTable">
<thead>
<tr class="a">
<th> argument </th>
<th> meaning </th></tr>
</thead><tbody>
<tr class="b">
<td> <tt>-guarded</tt> </td>
<td> Require S3Guard to be enabled </td></tr>
<tr class="a">
<td> <tt>-unguarded</tt> </td>
<td> Require S3Guard to be disabled </td></tr>
<tr class="b">
<td> <tt>-auth</tt> </td>
<td> Require the S3Guard mode to be &#x201c;authoritative&#x201d; </td></tr>
<tr class="a">
<td> <tt>-nonauth</tt> </td>
<td> Require the S3Guard mode to be &#x201c;non-authoritative&#x201d; </td></tr>
<tr class="b">
<td> <tt>-magic</tt> </td>
<td> Require the S3 filesystem to support the &#x201c;magic&#x201d; committer </td></tr>
<tr class="a">
<td> <tt>-encryption &lt;type&gt;</tt> </td>
<td> Require a specific server-side encryption algorithm </td></tr>
</tbody>
</table>
<p>The server side encryption options are not directly related to S3Guard, but it is often convenient to check them at the same time.</p>
<p>Example</p>
<div>
<div>
<pre class="source">hadoop s3guard bucket-info -guarded -magic s3a://ireland-1
</pre></div></div>
<p>List the details of bucket <tt>s3a://ireland-1</tt>, mandating that it must have S3Guard enabled (&#x201c;-guarded&#x201d;) and that support for the magic S3A committer is enabled (&#x201c;-magic&#x201d;).</p>
<div>
<div>
<pre class="source">Filesystem s3a://ireland-1
Location: eu-west-1
Filesystem s3a://ireland-1 is using S3Guard with store DynamoDBMetadataStore{region=eu-west-1, tableName=ireland-1}
Authoritative S3Guard: fs.s3a.metadatastore.authoritative=false
Metadata Store Diagnostics:
ARN=arn:aws:dynamodb:eu-west-1:00000000:table/ireland-1
billing-mode=provisioned
description=S3Guard metadata store in DynamoDB
name=ireland-1
read-capacity=20
region=eu-west-1
retryPolicy=ExponentialBackoffRetry(maxRetries=9, sleepTime=100 MILLISECONDS)
size=12812
status=ACTIVE
table={AttributeDefinitions: [{AttributeName: child,AttributeType: S},
{AttributeName: parent,AttributeType: S}],TableName: ireland-1,
KeySchema: [{AttributeName: parent,KeyType: HASH}, {AttributeName: child,KeyType: RANGE}],
TableStatus: ACTIVE,
CreationDateTime: Fri Aug 25 19:07:25 BST 2017,
ProvisionedThroughput: {LastIncreaseDateTime: Tue Aug 29 11:45:18 BST 2017,
LastDecreaseDateTime: Wed Aug 30 15:37:51 BST 2017,
NumberOfDecreasesToday: 1,
ReadCapacityUnits: 20,WriteCapacityUnits: 20},
TableSizeBytes: 12812,ItemCount: 91,
TableArn: arn:aws:dynamodb:eu-west-1:00000000:table/ireland-1,}
write-capacity=20
The &quot;magic&quot; committer is supported
S3A Client
Signing Algorithm: fs.s3a.signing-algorithm=(unset)
Endpoint: fs.s3a.endpoint=s3-eu-west-1.amazonaws.com
Encryption: fs.s3a.server-side-encryption-algorithm=none
Input seek policy: fs.s3a.experimental.input.fadvise=normal
Change Detection Source: fs.s3a.change.detection.source=etag
Change Detection Mode: fs.s3a.change.detection.mode=server
Delegation token support is disabled
</pre></div></div>
<p>This listing includes all the information about the table supplied by the metadata store&#x2019;s diagnostics.</p>
<div>
<div>
<pre class="source">hadoop s3guard bucket-info -unguarded -encryption none s3a://landsat-pds
</pre></div></div>
<p>Lists the S3Guard status of clients of the public <tt>landsat-pds</tt> bucket, and verifies that the data is neither tracked with S3Guard nor encrypted.</p>
<div>
<div>
<pre class="source">Filesystem s3a://landsat-pdsLocation: us-west-2
Filesystem s3a://landsat-pds is not using S3Guard
Endpoint: fs.s3a.endpoints3.amazonaws.com
Encryption: fs.s3a.server-side-encryption-algorithm=none
Input seek policy: fs.s3a.experimental.input.fadvise=normal
</pre></div></div>
<p>Note that other clients may have a S3Guard table set up to store metadata on this bucket; the checks are all done from the perspective of the configuration settings of the current client.</p>
<div>
<div>
<pre class="source">hadoop s3guard bucket-info -guarded -auth s3a://landsat-pds
</pre></div></div>
<p>Require the bucket to be using S3Guard in authoritative mode. This check will normally fail against this specific public bucket.</p>
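<p>As failed checks surface as a non-zero process exit code, <tt>bucket-info</tt> can gate scripts; a minimal sketch, with an illustrative bucket name:</p>
<div>
<div>
<pre class="source">if hadoop s3guard bucket-info -guarded -auth s3a://example-bucket ; then
  echo "S3Guard is enabled and authoritative"
else
  echo "check failed; see the output for details"
fi
</pre></div></div>
</div>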
<div class="section">
<h3><a name="List_or_Delete_Leftover_Multipart_Uploads:_s3guard_uploads"></a>List or Delete Leftover Multipart Uploads: <tt>s3guard uploads</tt></h3>
<p>Lists or deletes all pending (uncompleted) multipart uploads older than a given age.</p>
<div>
<div>
<pre class="source">hadoop s3guard uploads (-list | -abort | -expect &lt;num-uploads&gt;) [-verbose] \
[-days &lt;days&gt;] [-hours &lt;hours&gt;] [-minutes &lt;minutes&gt;] [-seconds &lt;seconds&gt;] \
[-force] s3a://bucket/prefix
</pre></div></div>
<p>The command lists or deletes all multipart uploads which are older than the given age, and that match the prefix supplied, if any.</p>
<p>For example, to delete all uncompleted multipart uploads older than two days in the folder at <tt>s3a://my-bucket/path/to/stuff</tt>, use the following command:</p>
<div>
<div>
<pre class="source">hadoop s3guard uploads -abort -days 2 s3a://my-bucket/path/to/stuff
</pre></div></div>
<p>We recommend running with <tt>-list</tt> first to confirm the parts shown are those that you wish to delete. Note that the command will ask &#x201c;Are you sure?&#x201d; unless you specify the <tt>-force</tt> option. This is to safeguard against accidental deletion of data, which is especially risky without a long age parameter, as it can affect in-flight uploads.</p>
<p>The <tt>-expect</tt> option is similar to <tt>-list</tt>, except that it is silent by default and terminates with a success or failure exit code, depending on whether the supplied number matches the number of uploads found matching the given options (path, age).</p>
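<p>For example, to assert that a path has no pending uploads at all (a sketch; the bucket name is illustrative):</p>
<div>
<div>
<pre class="source">hadoop s3guard uploads -expect 0 s3a://my-bucket/path/to/stuff
</pre></div></div>
</div>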
<div class="section">
<h3><a name="Delete_a_table:_s3guard_destroy"></a>Delete a table: <tt>s3guard destroy</tt></h3>
<p>Deletes a metadata store. With DynamoDB as the store, this means the specific DynamoDB table used to store the metadata.</p>
<div>
<div>
<pre class="source">hadoop s3guard destroy [-meta URI] ( -region REGION | s3a://BUCKET )
</pre></div></div>
<p>This <i>does not</i> delete the bucket, only the S3Guard table which it is bound to.</p>
<p>Examples</p>
<div>
<div>
<pre class="source">hadoop s3guard destroy s3a://ireland-1
</pre></div></div>
<p>Deletes the table which the bucket ireland-1 is configured to use as its MetadataStore.</p>
<div>
<div>
<pre class="source">hadoop s3guard destroy -meta dynamodb://ireland-team -region eu-west-1
</pre></div></div>
</div>
<div class="section">
<h3><a name="Clean_up_a_table.2C_s3guard_prune"></a>Clean up a table, <tt>s3guard prune</tt></h3>
<p>Delete all file entries in the MetadataStore table whose object &#x201c;modification time&#x201d; is older than the specified age.</p>
<div>
<div>
<pre class="source">hadoop s3guard prune [-days DAYS] [-hours HOURS] [-minutes MINUTES]
[-seconds SECONDS] [-tombstone] [-meta URI] ( -region REGION | s3a://BUCKET )
</pre></div></div>
<p>A time value of days, hours, minutes and/or seconds must be supplied.</p>
<ol style="list-style-type: decimal">
<li>This does not delete the entries in the bucket itself.</li>
<li>The modification time is effectively the creation time of the objects in the S3 Bucket.</li>
<li>If an S3A URI is supplied, only the entries in the table specified by the URI and older than a specific age are deleted.</li>
</ol>
<p>The <tt>-tombstone</tt> option instructs the operation to only purge &#x201c;tombstones&#x201d;, markers of deleted files. These tombstone markers are only used briefly, to indicate that a recently deleted file should not be found in listings. As a result, there are no adverse consequences to regularly pruning old tombstones.</p>
<p>Example</p>
<div>
<div>
<pre class="source">hadoop s3guard prune -days 7 s3a://ireland-1
</pre></div></div>
<p>Deletes all entries in the S3Guard table for files older than seven days from the table associated with <tt>s3a://ireland-1</tt>.</p>
<div>
<div>
<pre class="source">hadoop s3guard prune -tombstone -days 7 s3a://ireland-1/path_prefix/
</pre></div></div>
<p>Deletes all entries in the S3Guard table for tombstones older than seven days from the table associated with <tt>s3a://ireland-1</tt> and with the prefix <tt>path_prefix</tt>.</p>
<div>
<div>
<pre class="source">hadoop s3guard prune -hours 1 -minutes 30 -meta dynamodb://ireland-team -region eu-west-1
</pre></div></div>
<p>Deletes all file entries more than 90 minutes old from the table <tt>ireland-team</tt> in the region <tt>eu-west-1</tt>.</p></div>
<div class="section">
<h3><a name="Audit_the_.22authoritative_state_of_a_DynamoDB_Table.2C_s3guard_authoritative"></a>Audit the &quot;authoritative state of a DynamoDB Table, <tt>s3guard authoritative</tt></h3>
<p>This recursively checks a S3Guard table to verify that all directories underneath are marked as &#x201c;authoritative&#x201d;, and/or that the configuration is set for the S3A client to treat files and directories under the path as authoritative.</p>
<div>
<div>
<pre class="source">hadoop s3guard authoritative
authoritative [OPTIONS] [s3a://PATH]
Audits a DynamoDB S3Guard repository for all the entries being 'authoritative'
Options:
-required Require directories under the path to be authoritative.
-check-config Check the configuration for the path to be authoritative
-verbose Verbose Output.
</pre></div></div>
<p>Verify that a path under an object store is declared to be authoritative in the cluster configuration -and therefore that file entries will not be validated against S3, and that directories marked as &#x201c;authoritative&#x201d; in the S3Guard table will be treated as complete.</p>
<div>
<div>
<pre class="source">hadoop s3guard authoritative -check-config s3a:///ireland-1/fork-0003/test/
2020-01-03 11:42:29,147 [main] INFO Metadata store DynamoDBMetadataStore{
region=eu-west-1, tableName=s3guard-metadata, tableArn=arn:aws:dynamodb:eu-west-1:980678866538:table/s3guard-metadata} is initialized.
Path /fork-0003/test is not configured to be authoritative
</pre></div></div>
<p>Scan a store and report which directories are not marked as authoritative.</p>
<div>
<div>
<pre class="source">hadoop s3guard authoritative s3a://ireland-1/
2020-01-03 11:51:58,416 [main] INFO - Metadata store DynamoDBMetadataStore{region=eu-west-1, tableName=s3guard-metadata, tableArn=arn:aws:dynamodb:eu-west-1:980678866538:table/s3guard-metadata} is initialized.
2020-01-03 11:51:58,419 [main] INFO - Starting: audit s3a://ireland-1/
2020-01-03 11:51:58,422 [main] INFO - Root directory s3a://ireland-1/
2020-01-03 11:51:58,469 [main] INFO - files 4; directories 12
2020-01-03 11:51:58,469 [main] INFO - Directory s3a://ireland-1/Users
2020-01-03 11:51:58,521 [main] INFO - files 0; directories 1
2020-01-03 11:51:58,522 [main] INFO - Directory s3a://ireland-1/fork-0007
2020-01-03 11:51:58,573 [main] INFO - Directory s3a://ireland-1/fork-0001
2020-01-03 11:51:58,626 [main] INFO - files 0; directories 1
2020-01-03 11:51:58,626 [main] INFO - Directory s3a://ireland-1/fork-0006
2020-01-03 11:51:58,676 [main] INFO - Directory s3a://ireland-1/path
2020-01-03 11:51:58,734 [main] INFO - files 0; directories 1
2020-01-03 11:51:58,735 [main] INFO - Directory s3a://ireland-1/fork-0008
2020-01-03 11:51:58,802 [main] INFO - files 0; directories 1
2020-01-03 11:51:58,802 [main] INFO - Directory s3a://ireland-1/fork-0004
2020-01-03 11:51:58,854 [main] INFO - files 0; directories 1
2020-01-03 11:51:58,855 [main] WARN - Directory s3a://ireland-1/fork-0003 is not authoritative
2020-01-03 11:51:58,905 [main] INFO - files 0; directories 1
2020-01-03 11:51:58,906 [main] INFO - Directory s3a://ireland-1/fork-0005
2020-01-03 11:51:58,955 [main] INFO - Directory s3a://ireland-1/customsignerpath2
2020-01-03 11:51:59,006 [main] INFO - Directory s3a://ireland-1/fork-0002
2020-01-03 11:51:59,063 [main] INFO - files 0; directories 1
2020-01-03 11:51:59,064 [main] INFO - Directory s3a://ireland-1/customsignerpath1
2020-01-03 11:51:59,121 [main] INFO - Directory s3a://ireland-1/Users/stevel
2020-01-03 11:51:59,170 [main] INFO - files 0; directories 1
2020-01-03 11:51:59,171 [main] INFO - Directory s3a://ireland-1/fork-0001/test
2020-01-03 11:51:59,233 [main] INFO - Directory s3a://ireland-1/path/style
2020-01-03 11:51:59,282 [main] INFO - files 0; directories 1
2020-01-03 11:51:59,282 [main] INFO - Directory s3a://ireland-1/fork-0008/test
2020-01-03 11:51:59,338 [main] INFO - files 15; directories 10
2020-01-03 11:51:59,339 [main] INFO - Directory s3a://ireland-1/fork-0004/test
2020-01-03 11:51:59,394 [main] WARN - Directory s3a://ireland-1/fork-0003/test is not authoritative
2020-01-03 11:51:59,451 [main] INFO - files 35; directories 1
2020-01-03 11:51:59,451 [main] INFO - Directory s3a://ireland-1/fork-0002/test
2020-01-03 11:51:59,508 [main] INFO - Directory s3a://ireland-1/Users/stevel/Projects
2020-01-03 11:51:59,558 [main] INFO - files 0; directories 1
2020-01-03 11:51:59,559 [main] INFO - Directory s3a://ireland-1/path/style/access
2020-01-03 11:51:59,610 [main] INFO - Directory s3a://ireland-1/fork-0008/test/doTestListFiles-0-2-0-false
2020-01-03 11:51:59,660 [main] INFO - Directory s3a://ireland-1/fork-0008/test/doTestListFiles-0-2-1-false
2020-01-03 11:51:59,719 [main] INFO - Directory s3a://ireland-1/fork-0008/test/doTestListFiles-0-0-0-true
2020-01-03 11:51:59,773 [main] INFO - Directory s3a://ireland-1/fork-0008/test/doTestListFiles-2-0-0-true
2020-01-03 11:51:59,824 [main] INFO - files 0; directories 2
2020-01-03 11:51:59,824 [main] INFO - Directory s3a://ireland-1/fork-0008/test/doTestListFiles-0-2-1-true
2020-01-03 11:51:59,879 [main] INFO - Directory s3a://ireland-1/fork-0008/test/doTestListFiles-0-0-1-false
2020-01-03 11:51:59,939 [main] INFO - Directory s3a://ireland-1/fork-0008/test/doTestListFiles-0-0-0-false
2020-01-03 11:51:59,990 [main] INFO - Directory s3a://ireland-1/fork-0008/test/doTestListFiles-0-2-0-true
2020-01-03 11:52:00,042 [main] INFO - Directory s3a://ireland-1/fork-0008/test/doTestListFiles-2-0-0-false
2020-01-03 11:52:00,094 [main] INFO - files 0; directories 2
2020-01-03 11:52:00,094 [main] INFO - Directory s3a://ireland-1/fork-0008/test/doTestListFiles-0-0-1-true
2020-01-03 11:52:00,144 [main] WARN - Directory s3a://ireland-1/fork-0003/test/ancestor is not authoritative
2020-01-03 11:52:00,197 [main] INFO - Directory s3a://ireland-1/Users/stevel/Projects/hadoop-trunk
2020-01-03 11:52:00,245 [main] INFO - files 0; directories 1
2020-01-03 11:52:00,245 [main] INFO - Directory s3a://ireland-1/fork-0008/test/doTestListFiles-2-0-0-true/dir-0
2020-01-03 11:52:00,296 [main] INFO - Directory s3a://ireland-1/fork-0008/test/doTestListFiles-2-0-0-true/dir-1
2020-01-03 11:52:00,346 [main] INFO - Directory s3a://ireland-1/fork-0008/test/doTestListFiles-2-0-0-false/dir-0
2020-01-03 11:52:00,397 [main] INFO - Directory s3a://ireland-1/fork-0008/test/doTestListFiles-2-0-0-false/dir-1
2020-01-03 11:52:00,479 [main] INFO - Directory s3a://ireland-1/Users/stevel/Projects/hadoop-trunk/hadoop-tools
2020-01-03 11:52:00,530 [main] INFO - files 0; directories 1
2020-01-03 11:52:00,530 [main] INFO - Directory s3a://ireland-1/Users/stevel/Projects/hadoop-trunk/hadoop-tools/hadoop-aws
2020-01-03 11:52:00,582 [main] INFO - files 0; directories 1
2020-01-03 11:52:00,582 [main] INFO - Directory s3a://ireland-1/Users/stevel/Projects/hadoop-trunk/hadoop-tools/hadoop-aws/target
2020-01-03 11:52:00,636 [main] INFO - files 0; directories 1
2020-01-03 11:52:00,637 [main] INFO - Directory s3a://ireland-1/Users/stevel/Projects/hadoop-trunk/hadoop-tools/hadoop-aws/target/test-dir
2020-01-03 11:52:00,691 [main] INFO - files 0; directories 3
2020-01-03 11:52:00,691 [main] INFO - Directory s3a://ireland-1/Users/stevel/Projects/hadoop-trunk/hadoop-tools/hadoop-aws/target/test-dir/2
2020-01-03 11:52:00,752 [main] INFO - Directory s3a://ireland-1/Users/stevel/Projects/hadoop-trunk/hadoop-tools/hadoop-aws/target/test-dir/5
2020-01-03 11:52:00,807 [main] INFO - Directory s3a://ireland-1/Users/stevel/Projects/hadoop-trunk/hadoop-tools/hadoop-aws/target/test-dir/8
2020-01-03 11:52:00,862 [main] INFO - Scanned 45 directories - 3 were not marked as authoritative
2020-01-03 11:52:00,863 [main] INFO - audit s3a://ireland-1/: duration 0:02.444s
</pre></div></div>
<p>Scan the path/bucket and fail if any entry is non-authoritative.</p>
<div>
<div>
<pre class="source">hadoop s3guard authoritative -verbose -required s3a://ireland-1/
2020-01-03 11:47:40,288 [main] INFO - Metadata store DynamoDBMetadataStore{region=eu-west-1, tableName=s3guard-metadata, tableArn=arn:aws:dynamodb:eu-west-1:980678866538:table/s3guard-metadata} is initialized.
2020-01-03 11:47:40,291 [main] INFO - Starting: audit s3a://ireland-1/
2020-01-03 11:47:40,295 [main] INFO - Root directory s3a://ireland-1/
2020-01-03 11:47:40,336 [main] INFO - files 4; directories 12
2020-01-03 11:47:40,336 [main] INFO - Directory s3a://ireland-1/Users
2020-01-03 11:47:40,386 [main] INFO - files 0; directories 1
2020-01-03 11:47:40,386 [main] INFO - Directory s3a://ireland-1/fork-0007
2020-01-03 11:47:40,435 [main] INFO - files 1; directories 0
2020-01-03 11:47:40,435 [main] INFO - Directory s3a://ireland-1/fork-0001
2020-01-03 11:47:40,486 [main] INFO - files 0; directories 1
2020-01-03 11:47:40,486 [main] INFO - Directory s3a://ireland-1/fork-0006
2020-01-03 11:47:40,534 [main] INFO - files 1; directories 0
2020-01-03 11:47:40,535 [main] INFO - Directory s3a://ireland-1/path
2020-01-03 11:47:40,587 [main] INFO - files 0; directories 1
2020-01-03 11:47:40,588 [main] INFO - Directory s3a://ireland-1/fork-0008
2020-01-03 11:47:40,641 [main] INFO - files 0; directories 1
2020-01-03 11:47:40,642 [main] INFO - Directory s3a://ireland-1/fork-0004
2020-01-03 11:47:40,692 [main] INFO - files 0; directories 1
2020-01-03 11:47:40,693 [main] WARN - Directory s3a://ireland-1/fork-0003 is not authoritative
2020-01-03 11:47:40,693 [main] INFO - audit s3a://ireland-1/: duration 0:00.402s
2020-01-03 11:47:40,698 [main] INFO - Exiting with status 46: `s3a://ireland-1/fork-0003': Directory is not marked as authoritative in the S3Guard store
</pre></div></div>
<p>This command is primarily for testing.</p></div>
<div class="section">
<h3><a name="Tune_the_I.2FO_capacity_of_the_DynamoDB_Table.2C_s3guard_set-capacity"></a>Tune the I/O capacity of the DynamoDB Table, <tt>s3guard set-capacity</tt></h3>
<p>Alter the read and/or write capacity of a s3guard table created with provisioned I/O capacity.</p>
<div>
<div>
<pre class="source">hadoop s3guard set-capacity [--read UNIT] [--write UNIT] ( -region REGION | s3a://BUCKET )
</pre></div></div>
<p>The <tt>--read</tt> and <tt>--write</tt> units are those of <tt>s3guard init</tt>.</p>
<p>It cannot be used to change the I/O capacity of an on-demand table (there is no need), nor can it be used to convert an existing table to being on-demand. For that, the AWS console must be used.</p>
<p>Example</p>
<div>
<div>
<pre class="source">hadoop s3guard set-capacity -read 20 -write 20 s3a://ireland-1
</pre></div></div>
<p>Set the capacity of the table used by bucket <tt>s3a://ireland-1</tt> to 20 read units and 20 write units. (This is a low number, incidentally.)</p>
<div>
<div>
<pre class="source">2017-08-30 16:21:26,343 [main] INFO s3guard.S3GuardTool (S3GuardTool.java:initMetadataStore(229)) - Metadata store DynamoDBMetadataStore{region=eu-west-1, tableName=ireland-1} is initialized.
2017-08-30 16:21:26,344 [main] INFO s3guard.DynamoDBMetadataStore (DynamoDBMetadataStore.java:updateParameters(1084)) - Current table capacity is read: 25, write: 25
2017-08-30 16:21:26,344 [main] INFO s3guard.DynamoDBMetadataStore (DynamoDBMetadataStore.java:updateParameters(1086)) - Changing capacity of table to read: 20, write: 20
Metadata Store Diagnostics:
ARN=arn:aws:dynamodb:eu-west-1:00000000000:table/ireland-1
billing-mode=provisioned
description=S3Guard metadata store in DynamoDB
name=ireland-1
read-capacity=25
region=eu-west-1
retryPolicy=ExponentialBackoffRetry(maxRetries=9, sleepTime=100 MILLISECONDS)
size=12812
status=UPDATING
table={ ... }
write-capacity=25
</pre></div></div>
<p>After the update, the table status changes to <tt>UPDATING</tt>; this is a sign that the capacity has been changed.</p>
<p>Repeating the same command will not change the capacity, as both read and write values match that already in use.</p>
<div>
<div>
<pre class="source">2017-08-30 16:24:35,337 [main] INFO s3guard.DynamoDBMetadataStore (DynamoDBMetadataStore.java:updateParameters(1090)) - Table capacity unchanged at read: 20, write: 20
Metadata Store Diagnostics:
ARN=arn:aws:dynamodb:eu-west-1:00000000000:table/ireland-1
billing-mode=provisioned
description=S3Guard metadata store in DynamoDB
name=ireland-1
read-capacity=20
region=eu-west-1
retryPolicy=ExponentialBackoffRetry(maxRetries=9, sleepTime=100 MILLISECONDS)
size=12812
status=ACTIVE
table={ ... }
write-capacity=20
</pre></div></div>
<p><i>Note</i>: There is a limit to how many times in a 24 hour period the capacity of a table can be changed, either through this command or the AWS console.</p></div>
<div class="section">
<h3><a name="Check_the_consistency_of_the_metadata_store.2C_s3guard_fsck"></a>Check the consistency of the metadata store, <tt>s3guard fsck</tt></h3>
<p>Compares S3 with MetadataStore, and returns a failure status if any rules or invariants are violated. Only works with DynamoDB metadata stores.</p>
<div>
<div>
<pre class="source">hadoop s3guard fsck [-check | -internal] [-fix] (s3a://BUCKET | s3a://PATH_PREFIX)
</pre></div></div>
<p>The <tt>-check</tt> operation checks the metadata store from the S3 perspective, but does not fix any issues.</p>
<p>The <tt>-internal</tt> operation checks the internal consistency of the metadata store, but does not fix any issues.</p>
<p>The <tt>-fix</tt> operation fixes consistency issues between the metadata store and the S3 bucket. This parameter is optional, and can be used together with the <tt>-check</tt> or <tt>-internal</tt> parameters, but not alone. The following fix is implemented: removing orphan entries from DynamoDB.</p>
<p>The errors found will be logged at the ERROR log level.</p>
<p><i>Note</i>: <tt>-check</tt> and <tt>-internal</tt> operations can be used only as separate commands. Running <tt>fsck</tt> with both will result in an error.</p>
<p>Example</p>
<div>
<div>
<pre class="source">hadoop s3guard fsck -check s3a://ireland-1/path_prefix/
</pre></div></div>
<p>Checks the metadata store while iterating through the S3 bucket. The path_prefix will be used as the root element of the check.</p>
<div>
<div>
<pre class="source">hadoop s3guard fsck -internal s3a://ireland-1/path_prefix/
</pre></div></div>
<p>Checks the metadata store internal consistency. The path_prefix will be used as the root element of the check.</p></div></div>
<div class="section">
<h2><a name="Debugging_and_Error_Handling"></a>Debugging and Error Handling</h2>
<p>If you run into network connectivity issues, or have a machine failure in the middle of an operation, you may end up with your metadata store having state that differs from S3. The S3Guard CLI commands, covered in the CLI section above, can be used to diagnose and repair these issues.</p>
<p>There are some logs whose log level can be increased to provide more information.</p>
<div>
<div>
<pre class="source"># Log S3Guard classes
log4j.logger.org.apache.hadoop.fs.s3a.s3guard=DEBUG
# Log all S3A classes
log4j.logger.org.apache.hadoop.fs.s3a=DEBUG
# Enable debug logging of AWS DynamoDB client
log4j.logger.com.amazonaws.services.dynamodbv2.AmazonDynamoDB=DEBUG
# Log all HTTP requests made; includes S3 interaction. This may
# include sensitive information such as account IDs in HTTP headers.
log4j.logger.com.amazonaws.request=DEBUG
</pre></div></div>
<p>If all else fails, S3Guard is designed to allow for easy recovery by deleting the metadata store data. In DynamoDB, this can be accomplished by simply deleting the table, and allowing S3Guard to recreate it from scratch. Note that S3Guard tracks recent changes to file metadata to implement consistency. Deleting the metadata store table will simply result in a period of eventual consistency for any file modifications that were made right before the table was deleted.</p>
<div class="section">
<h3><a name="Enabling_a_log_message_whenever_S3Guard_is_disabled"></a>Enabling a log message whenever S3Guard is <i>disabled</i></h3>
<p>When dealing with support calls related to the S3A connector, &#x201c;is S3Guard on?&#x201d; is the usual opening question. This can be determined by looking at the application logs for messages about S3Guard starting -the absence of S3Guard can only be inferred by the absence of such messages.</p>
<p>There is another strategy: have the S3A connector log whenever <i>S3Guard is not enabled</i>.</p>
<p>This is controlled by the configuration option <tt>fs.s3a.s3guard.disabled.warn.level</tt>:</p>
<div>
<div>
<pre class="source">&lt;property&gt;
&lt;name&gt;fs.s3a.s3guard.disabled.warn.level&lt;/name&gt;
&lt;value&gt;silent&lt;/value&gt;
&lt;description&gt;
Level to print a message when S3Guard is disabled.
Values:
&quot;warn&quot;: log at WARN level
&quot;inform&quot;: log at INFO level
&quot;silent&quot;: log at DEBUG level
&quot;fail&quot;: raise an exception
&lt;/description&gt;
&lt;/property&gt;
</pre></div></div>
<p>The <tt>fail</tt> option is clearly more than logging; it exists as an extreme debugging tool. Use with care.</p></div>
<div class="section">
<h3><a name="Failure_Semantics"></a>Failure Semantics</h3>
<p>Operations which modify metadata will make changes to S3 first. If, and only if, those operations succeed, the equivalent changes will be made to the Metadata Store.</p>
<p>These changes to S3 and the Metadata Store are not fully transactional: if the S3 operations succeed and the subsequent Metadata Store updates fail, the S3 changes will <i>not</i> be rolled back. In this case, an error message will be logged.</p>
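<p>Whether such a failure raises an error in the client or is merely logged is governed by the <tt>fs.s3a.metadatastore.fail.on.write.error</tt> option covered in the Fail on Error section; a sketch of relaxing it to log-only:</p>
<div>
<div>
<pre class="source">&lt;property&gt;
  &lt;name&gt;fs.s3a.metadatastore.fail.on.write.error&lt;/name&gt;
  &lt;value&gt;false&lt;/value&gt;
&lt;/property&gt;
</pre></div></div>
</div>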
<div class="section">
<h3><a name="Versioning"></a>Versioning</h3>
<p>S3Guard tables are created with a version marker entry and a table tag. The entry is created with the primary key and child entry of <tt>../VERSION</tt>; the use of a relative path guarantees that it will not be resolved. The table tag key is named <tt>s3guard_version</tt>.</p>
<p>When the table is initialized by S3Guard, it will be tagged during creation and the version marker entry will be created in the table. If the table lacks the version marker entry or tag, S3Guard will try to create them according to the following rules:</p>
<ol style="list-style-type: decimal">
<li>If the table lacks both version markers AND it&#x2019;s empty, both markers will be added. If the table is not empty, the check throws an IOException.</li>
<li>If there&#x2019;s no version marker ITEM, the compatibility with the TAG will be checked, and the version marker ITEM will be added if the TAG version is compatible. If the TAG version is not compatible, the check throws an IOException.</li>
<li>If there&#x2019;s no version marker TAG, the compatibility with the ITEM version marker will be checked, and the version marker TAG will be added if the ITEM version is compatible. If the ITEM version is not compatible, the check throws an IOException.</li>
<li>If the TAG and ITEM versions are both present, then both will be checked for compatibility. If the ITEM or TAG version marker is not compatible, the check throws an IOException.</li>
</ol>
<p><i>Note</i>: If the user does not have sufficient rights to tag the table, the initialization of S3Guard will not fail, but there will be no version marker tag on the DynamoDB table.</p>
<p><i>Versioning policy</i></p>
<ol style="list-style-type: decimal">
<li>The version number of an S3Guard table will only be incremented when an incompatible change is made to the table structure &#x2014;that is, the structure has changed so that it is no longer readable by older versions, or because it has added new mandatory fields which older versions do not create.</li>
<li>The version number of S3Guard tables will only be changed by incrementing the value.</li>
<li>Updated versions of S3Guard MAY continue to support older version tables.</li>
<li>If an incompatible change is made such that existing tables are not compatible, then a means shall be provided to update existing tables. For example: an option in the Command Line Interface, or an option to upgrade tables during S3Guard initialization.</li>
</ol>
<p><i>Note</i>: this policy does not indicate any intent to upgrade table structures in an incompatible manner. The version marker in tables exists to support such an option if it ever becomes necessary, by ensuring that all S3Guard clients can recognise any version mismatch.</p></div></div>
<div class="section">
<h2><a name="Security"></a>Security</h2>
<p>All users of the DynamoDB table must have write access to it. This effectively means they must have write access to the entire object store.</p>
<p>There has not been much testing of using a S3Guard Metadata Store with a read-only S3 bucket. It <i>should</i> work, provided all users have write access to the DynamoDB table. And since updates to the Metadata Store are only made after successful file creation, deletion and rename, the store is <i>unlikely</i> to get out of sync; even so, this merits more testing before it can be considered reliable.</p></div>
<div class="section">
<h2><a name="Managing_DynamoDB_I.2FO_Capacity"></a>Managing DynamoDB I/O Capacity</h2>
<p>Historically, DynamoDB has been billed not only on use (data and I/O requests) but also on provisioned I/O capacity.</p>
<p>With Provisioned IO, when an application makes more requests than the allocated capacity permits, the request is rejected; it is up to the calling application to detect when it is being so throttled and react. S3Guard does this, but as a result: when the client is being throttled, operations are slower. This capacity throttling is averaged over a few minutes: a briefly overloaded table will not be throttled, but the rate cannot be sustained.</p>
<p>The load on a table is visible in the AWS console: go to the DynamoDB page for the table and select the &#x201c;metrics&#x201d; tab. If the graphs of throttled read or write requests show that a lot of throttling has taken place, then there is not enough allocated capacity for the applications making use of the table.</p>
<p>Similarly, if the capacity graphs show that the read or write loads are low compared to the allocated capacities, then the table <i>may</i> be overprovisioned for the current workload.</p>
<p>The S3Guard connector to DynamoDB can be configured to make multiple attempts to repeat a throttled request, with an exponential backoff between them.</p>
<p>The relevant settings for managing retries in the connector are:</p>
<div>
<div>
<pre class="source">&lt;property&gt;
&lt;name&gt;fs.s3a.s3guard.ddb.max.retries&lt;/name&gt;
&lt;value&gt;9&lt;/value&gt;
&lt;description&gt;
    Max retries on throttled/incomplete DynamoDB operations
before giving up and throwing an IOException.
Each retry is delayed with an exponential
backoff timer which starts at 100 milliseconds and approximately
doubles each time. The minimum wait before throwing an exception is
    sum(100, 200, 400, 800, .. 100*2^(N-1)) == 100 * ((2^N)-1)
&lt;/description&gt;
&lt;/property&gt;
&lt;property&gt;
&lt;name&gt;fs.s3a.s3guard.ddb.throttle.retry.interval&lt;/name&gt;
&lt;value&gt;100ms&lt;/value&gt;
&lt;description&gt;
    Initial interval to wait before retrying a throttled request;
the back-off policy is exponential until the number of retries of
fs.s3a.s3guard.ddb.max.retries is reached.
&lt;/description&gt;
&lt;/property&gt;
&lt;property&gt;
&lt;name&gt;fs.s3a.s3guard.ddb.background.sleep&lt;/name&gt;
&lt;value&gt;25ms&lt;/value&gt;
&lt;description&gt;
Length (in milliseconds) of pause between each batch of deletes when
pruning metadata. Prevents prune operations (which can typically be low
priority background operations) from overly interfering with other I/O
operations.
&lt;/description&gt;
&lt;/property&gt;
</pre></div></div>
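<p>For example, with the default of 9 retries and a 100 millisecond initial interval, the minimum cumulative wait before an exception is thrown is 100 * ((2^9)-1) = 51,100 milliseconds, or roughly 51 seconds.</p>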
<p>Having a large value for <tt>fs.s3a.s3guard.ddb.max.retries</tt> will ensure that clients of an overloaded table do not fail immediately. However, queries may be unexpectedly slow.</p>
<p>If operations, especially directory operations, are slow, check the AWS console. It is also possible to set up AWS alerts for capacity limits being exceeded.</p>
<div class="section">
<h3><a name="On-Demand_Dynamo_Capacity"></a><a name="on-demand"></a> On-Demand Dynamo Capacity</h3>
<p><a class="externalLink" href="https://aws.amazon.com/blogs/aws/amazon-dynamodb-on-demand-no-capacity-planning-and-pay-per-request-pricing/">Amazon DynamoDB On-Demand</a> removes the need to pre-allocate I/O capacity for S3Guard tables. Instead the caller is <i>only</i> charged per I/O Operation.</p>
<ul>
<li>There are no SLA capacity guarantees. This is generally not an issue for S3Guard applications.</li>
<li>There&#x2019;s no explicit limit on I/O capacity, so operations which make heavy use of S3Guard tables (for example: SQL query planning) do not get throttled.</li>
<li>You are charged more per DynamoDB API call, in exchange for paying nothing when you are not interacting with DynamoDB.</li>
<li>There&#x2019;s no way to put a limit on the I/O; you may unintentionally run up large bills through sustained heavy load.</li>
<li>The <tt>s3guard set-capacity</tt> command fails: it does not make sense any more.</li>
</ul>
<p>When idle, S3Guard tables are only billed for the data stored, not for any unused capacity. For this reason, there is no cost benefit from sharing a single S3Guard table across multiple buckets.</p>
<p><i>Creating a S3Guard Table with On-Demand Tables</i></p>
<p>The default settings for S3Guard are to create on-demand tables; this can also be done explicitly in the <tt>s3guard init</tt> command by setting the read and write capacities to zero.</p>
<div>
<div>
<pre class="source">hadoop s3guard init -meta dynamodb://ireland-team -write 0 -read 0 s3a://ireland-1
</pre></div></div>
<p><i>Enabling DynamoDB On-Demand for an existing S3Guard table</i></p>
<p>You cannot currently convert an existing S3Guard table to being an on-demand table through the <tt>s3guard</tt> command.</p>
<p>It can be done through the AWS console or <a class="externalLink" href="https://docs.aws.amazon.com/cli/latest/reference/dynamodb/update-table.html">the CLI</a>. From the Web console or the command line, switch the billing to pay-per-request.</p>
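<p>A sketch using the AWS CLI, with an illustrative table name:</p>
<div>
<div>
<pre class="source">aws dynamodb update-table --table-name example-bucket --billing-mode PAY_PER_REQUEST
</pre></div></div>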
<p>Once enabled, the read and write capacities of the table listed in the <tt>hadoop s3guard bucket-info</tt> command become &#x201c;0&#x201d;, and the &#x201c;billing-mode&#x201d; attribute changes to &#x201c;per-request&#x201d;:</p>
<div>
<div>
<pre class="source">&gt; hadoop s3guard bucket-info s3a://example-bucket/
Filesystem s3a://example-bucket
Location: eu-west-1
Filesystem s3a://example-bucket is using S3Guard with store
DynamoDBMetadataStore{region=eu-west-1, tableName=example-bucket,
tableArn=arn:aws:dynamodb:eu-west-1:11111122223333:table/example-bucket}
Authoritative S3Guard: fs.s3a.metadatastore.authoritative=false
Metadata Store Diagnostics:
ARN=arn:aws:dynamodb:eu-west-1:11111122223333:table/example-bucket
billing-mode=per-request
description=S3Guard metadata store in DynamoDB
name=example-bucket
persist.authoritative.bit=true
read-capacity=0
region=eu-west-1
retryPolicy=ExponentialBackoffRetry(maxRetries=9, sleepTime=250 MILLISECONDS)
size=66797
status=ACTIVE
table={AttributeDefinitions:
[{AttributeName: child,AttributeType: S},
{AttributeName: parent,AttributeType: S}],
TableName: example-bucket,
KeySchema: [{
AttributeName: parent,KeyType: HASH},
{AttributeName: child,KeyType: RANGE}],
TableStatus: ACTIVE,
CreationDateTime: Thu Oct 11 18:51:14 BST 2018,
ProvisionedThroughput: {
LastIncreaseDateTime: Tue Oct 30 16:48:45 GMT 2018,
LastDecreaseDateTime: Tue Oct 30 18:00:03 GMT 2018,
NumberOfDecreasesToday: 0,
ReadCapacityUnits: 0,
WriteCapacityUnits: 0},
TableSizeBytes: 66797,
ItemCount: 415,
TableArn: arn:aws:dynamodb:eu-west-1:11111122223333:table/example-bucket,
TableId: a7b0728a-f008-4260-b2a0-aaaaabbbbb,}
write-capacity=0
The &quot;magic&quot; committer is supported
</pre></div></div>
</div>
<div class="section">
<h3><a name="Autoscaling_.28Provisioned_Capacity.29_S3Guard_tables."></a><a name="autoscaling"></a> Autoscaling (Provisioned Capacity) S3Guard tables.</h3>
<p><a class="externalLink" href="https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/AutoScaling.html">DynamoDB Auto Scaling</a> can automatically increase and decrease the allocated capacity.</p>
<p>Before DynamoDB On-Demand was introduced, autoscaling was the sole form of dynamic scaling.</p>
<p>Experiments with S3Guard and DynamoDB Auto Scaling have shown that any Auto Scaling operation will only take place after callers have been throttled for a period of time. The clients will still need to be configured to retry when overloaded until any extra capacity is allocated. Furthermore, as this retrying will block the threads from performing other operations -including more I/O- the autoscaling may not scale fast enough.</p>
<p>This is why DynamoDB On-Demand appears to be a better option for workloads running Hadoop, Spark, Hive and other applications.</p>
<p>If autoscaling is to be used, we recommend experimenting with the option, based on usage information collected from previous days, and choosing a combination of retry counts and an interval which allow for the clients to cope with some throttling, but not to time-out other applications.</p></div></div>
<div class="section">
<h2><a name="Read-After-Overwrite_Consistency"></a>Read-After-Overwrite Consistency</h2>
<p>S3Guard provides read-after-overwrite consistency through ETags (default) or object versioning checked either on the server (default) or client. This works such that a reader reading a file after an overwrite either sees the new version of the file or an error. Without S3Guard, new readers may see the original version. Once S3 reaches eventual consistency, new readers will see the new version.</p>
<p>Readers using S3Guard will usually see the new file version, but may in rare cases see <tt>RemoteFileChangedException</tt> instead. This would occur if an S3 object read cannot provide the version tracked in S3Guard metadata.</p>
<p>S3Guard achieves this behavior by storing ETags and object version IDs in the S3Guard metadata store (e.g. DynamoDB). On opening a file, S3AFileSystem will look in S3 for the version of the file indicated by the ETag or object version ID stored in the metadata store. If that version is unavailable, <tt>RemoteFileChangedException</tt> is thrown. Whether ETag or version ID, and server or client mode, is used is determined by the <a href="./index.html#Handling_Read-During-Overwrite">fs.s3a.change.detection configuration options</a>.</p>
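<p>As a sketch, switching to object version IDs checked on the client (the defaults are &#x201c;etag&#x201d; and &#x201c;server&#x201d;):</p>
<div>
<div>
<pre class="source">&lt;property&gt;
  &lt;name&gt;fs.s3a.change.detection.source&lt;/name&gt;
  &lt;value&gt;versionid&lt;/value&gt;
&lt;/property&gt;
&lt;property&gt;
  &lt;name&gt;fs.s3a.change.detection.mode&lt;/name&gt;
  &lt;value&gt;client&lt;/value&gt;
&lt;/property&gt;
</pre></div></div>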
<div class="section">
<h3><a name="No_Versioning_Metadata_Available"></a>No Versioning Metadata Available</h3>
<p>When the first S3AFileSystem clients are upgraded to a version of <tt>S3AFileSystem</tt> that contains these change tracking features, any existing S3Guard metadata will not contain ETags or object version IDs. Reads of files tracked in such S3Guard metadata will access whatever version of the file is available in S3 at the time of read. Only if the file is subsequently updated will S3Guard start tracking ETags and object version IDs, and thus generate <tt>RemoteFileChangedException</tt> if an inconsistency is detected.</p>
<p>Similarly, when S3Guard metadata is pruned, S3Guard will no longer be able to detect an inconsistent read. S3Guard metadata should be retained for at least as long as the perceived possible read-after-overwrite temporary inconsistency window. That window is expected to be short, but there are no guarantees so it is at the administrator&#x2019;s discretion to weigh the risk.</p></div>
<div class="section">
<h3><a name="Known_Limitations"></a>Known Limitations</h3>
<div class="section">
<h4><a name="S3_Select"></a>S3 Select</h4>
<p>S3 Select does not provide a capability for server-side ETag or object version ID qualification. Whether <tt>fs.s3a.change.detection.mode</tt> is &#x201c;client&#x201d; or &#x201c;server&#x201d;, S3Guard will cause a client-side check of the file version before opening the file with S3 Select. If the current version does not match the version tracked in S3Guard, <tt>RemoteFileChangedException</tt> is thrown.</p>
<p>It is still possible that the S3 Select read will access a different version of the file, if the visible file version changes between the version check and the opening of the file. This can happen due to eventual consistency or an overwrite of the file between the version check and the open of the file.</p></div>
<div class="section">
<h4><a name="Rename"></a>Rename</h4>
<p>Rename is implemented via copy in S3. With <tt>fs.s3a.change.detection.mode</tt> set to &#x201c;client&#x201d;, a fully reliable mechanism for ensuring the copied content is the expected content is not possible. This is because there isn&#x2019;t necessarily a way to know the expected ETag or version ID to appear on the object resulting from the copy.</p>
<p>Furthermore, if <tt>fs.s3a.change.detection.mode</tt> is &#x201c;server&#x201d; and a third-party S3 implementation is used that doesn&#x2019;t honor the provided ETag or version ID, S3AFileSystem and S3Guard cannot detect it.</p>
<p>When <tt>fs.s3a.change.detection.mode</tt> is &#x201c;client&#x201d;, a client-side check will be performed before the copy to ensure the current version of the file matches S3Guard metadata. If not, <tt>RemoteFileChangedException</tt> is thrown. As discussed with regard to S3 Select, this is not sufficient to guarantee that the same version is the version copied.</p>
<p>When <tt>fs.s3a.change.detection.mode</tt> is &#x201c;server&#x201d;, the expected version is also specified in the underlying S3 <tt>CopyObjectRequest</tt>. As long as the server honors it, the copied object will be correct.</p>
<p>All this said, with the defaults of <tt>fs.s3a.change.detection.mode</tt> of &#x201c;server&#x201d; and <tt>fs.s3a.change.detection.source</tt> of &#x201c;etag&#x201d;, when working with Amazon&#x2019;s S3, copy should in fact either copy the expected file version or, in the case of an eventual consistency anomaly, generate <tt>RemoteFileChangedException</tt>. The same should be true when <tt>fs.s3a.change.detection.source</tt> = &#x201c;versionid&#x201d;.</p></div>
<div class="section">
<h4><a name="Out_of_Sync_Metadata"></a>Out of Sync Metadata</h4>
<p>The S3Guard version tracking metadata (ETag or object version ID) could become out of sync with the true current object metadata in S3. For example, S3Guard is still tracking v1 of some file after v2 has been written. This could occur for reasons such as a writer writing without utilizing S3Guard and/or S3AFileSystem or simply due to a write with S3AFileSystem and S3Guard that wrote successfully to S3, but failed in communication with S3Guard&#x2019;s metadata store (e.g. DynamoDB).</p>
<p>If this happens, reads of the affected file(s) will result in <tt>RemoteFileChangedException</tt> until one of:</p>
<ul>
<li>the S3Guard metadata is corrected out-of-band</li>
<li>the file is overwritten (causing an S3Guard metadata update)</li>
<li>the S3Guard metadata is pruned</li>
</ul>
<p>The S3Guard metadata for a file can be corrected with the <tt>s3guard import</tt> command as discussed above. The command can take a file URI instead of a bucket URI to correct the metadata for a single file. For example:</p>
<div>
<div>
<pre class="source">hadoop s3guard import [-meta URI] s3a://my-bucket/file-with-bad-metadata
</pre></div></div>
</div></div></div>
<div class="section">
<h2><a name="Troubleshooting"></a>Troubleshooting</h2>
<div class="section">
<h3><a name="Error:_S3Guard_table_lacks_version_marker."></a>Error: <tt>S3Guard table lacks version marker.</tt></h3>
<p>The table which was intended to be used as a S3Guard metadata store does not have any version marker indicating that it is a S3Guard table.</p>
<p>It may be that this is not a S3Guard table.</p>
<ul>
<li>Make sure that this is the correct table name.</li>
<li>Delete the table, so it can be rebuilt.</li>
</ul></div>
<div class="section">
<h3><a name="Error:_Database_table_is_from_an_incompatible_S3Guard_version"></a>Error: <tt>Database table is from an incompatible S3Guard version</tt></h3>
<p>This indicates that the version of S3Guard which created (or possibly updated) the database table is from a different version than that expected by the S3A client.</p>
<p>This error will also include the expected and actual version numbers.</p>
<p>If the expected version is lower than the actual version, then the version of the S3A client library is too old to interact with this S3Guard-managed bucket. Upgrade the application/library.</p>
<p>If the expected version is higher than the actual version, then the table itself will need upgrading.</p></div>
<div class="section">
<h3><a name="Error_.22DynamoDB_table_TABLE_does_not_exist_in_region_REGION.3B_auto-creation_is_turned_off.22"></a>Error <tt>&quot;DynamoDB table TABLE does not exist in region REGION; auto-creation is turned off&quot;</tt></h3>
<p>S3Guard could not find the DynamoDB table for the Metadata Store, and it was not configured to create it. Either the table was missing, or the configuration is preventing S3Guard from finding the table.</p>
<ol style="list-style-type: decimal">
<li>Verify that the value of <tt>fs.s3a.s3guard.ddb.table</tt> is correct.</li>
<li>If the region for an existing table has been set in <tt>fs.s3a.s3guard.ddb.region</tt>, verify that the value is correct.</li>
<li>If the region is not set, verify that the table exists in the same region as the bucket being used.</li>
<li>Create the table if necessary, or enable automatic table creation (see the sketch after this list).</li>
</ol>
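<p>Automatic table creation is controlled by the <tt>fs.s3a.s3guard.ddb.table.create</tt> option; a sketch of enabling it:</p>
<div>
<div>
<pre class="source">&lt;property&gt;
  &lt;name&gt;fs.s3a.s3guard.ddb.table.create&lt;/name&gt;
  &lt;value&gt;true&lt;/value&gt;
&lt;/property&gt;
</pre></div></div>
</div>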
<div class="section">
<h3><a name="Error_.22The_level_of_configured_provisioned_throughput_for_the_table_was_exceeded.22"></a>Error <tt>&quot;The level of configured provisioned throughput for the table was exceeded&quot;</tt></h3>
<div>
<div>
<pre class="source">org.apache.hadoop.fs.s3a.AWSServiceThrottledException: listFiles on s3a://bucket/10/d1/d2/d3:
com.amazonaws.services.dynamodbv2.model.ProvisionedThroughputExceededException:
The level of configured provisioned throughput for the table was exceeded.
Consider increasing your provisioning level with the UpdateTable API.
(Service: AmazonDynamoDBv2; Status Code: 400;
Error Code: ProvisionedThroughputExceededException;
</pre></div></div>
<p>The I/O load from clients of the (shared) DynamoDB table exceeded its provisioned capacity.</p>
<ol style="list-style-type: decimal">
<li>Switch to On-Demand DynamoDB tables (AWS console).</li>
<li>Increase the capacity of the DynamoDB table (AWS console or <tt>s3guard set-capacity</tt>).</li>
<li>Increase the retry count and/or sleep time of S3Guard on throttle events (Hadoop configuration).</li>
</ol></div>
<div class="section">
<h3><a name="Error_Max_retries_exceeded"></a>Error <tt>Max retries exceeded</tt></h3>
<p>The I/O load from clients of the (shared) DynamoDB table exceeded its provisioned capacity, and the number of attempts to retry the operation exceeded the configured amount.</p>
<ol style="list-style-type: decimal">
<li>Switch to On-Demand DynamoDB tables (AWS console).</li>
<li>Increase the capacity of the DynamoDB table.</li>
<li>Increase the retry count and/or sleep time of S3Guard on throttle events.</li>
</ol></div>
<div class="section">
<h3><a name="Error_when_running_set-capacity:_org.apache.hadoop.fs.s3a.AWSServiceThrottledException:_ProvisionTable"></a>Error when running <tt>set-capacity</tt>: <tt>org.apache.hadoop.fs.s3a.AWSServiceThrottledException: ProvisionTable</tt></h3>
<div>
<div>
<pre class="source">org.apache.hadoop.fs.s3a.AWSServiceThrottledException: ProvisionTable on s3guard-example:
com.amazonaws.services.dynamodbv2.model.LimitExceededException:
Subscriber limit exceeded: Provisioned throughput decreases are limited within a given UTC day.
After the first 4 decreases, each subsequent decrease in the same UTC day can be performed at most once every 3600 seconds.
Number of decreases today: 6.
Last decrease at Wednesday, July 25, 2018 8:48:14 PM UTC.
Next decrease can be made at Wednesday, July 25, 2018 9:48:14 PM UTC
</pre></div></div>
<p>There are limits on how often you can change the capacity of a DynamoDB table; if you call <tt>set-capacity</tt> too often, it fails. Wait until after the time indicated and try again.</p></div>
<div class="section">
<h3><a name="Error_Invalid_region_specified"></a>Error <tt>Invalid region specified</tt></h3>
<div>
<div>
<pre class="source">java.io.IOException: Invalid region specified &quot;iceland-2&quot;:
Region can be configured with fs.s3a.s3guard.ddb.region:
us-gov-west-1, us-east-1, us-east-2, us-west-1, us-west-2,
eu-west-1, eu-west-2, eu-west-3, eu-central-1, ap-south-1,
ap-southeast-1, ap-southeast-2, ap-northeast-1, ap-northeast-2,
sa-east-1, cn-north-1, cn-northwest-1, ca-central-1
at org.apache.hadoop.fs.s3a.s3guard.DynamoDBClientFactory$DefaultDynamoDBClientFactory.getRegion
at org.apache.hadoop.fs.s3a.s3guard.DynamoDBClientFactory$DefaultDynamoDBClientFactory.createDynamoDBClient
</pre></div></div>
<p>The region specified in <tt>fs.s3a.s3guard.ddb.region</tt> is invalid.</p></div>
<div class="section">
<h3><a name="a.E2.80.9CNeither_ReadCapacityUnits_nor_WriteCapacityUnits_can_be_specified_when_BillingMode_is_PAY_PER_REQUEST.E2.80.9D"></a>&#x201c;Neither ReadCapacityUnits nor WriteCapacityUnits can be specified when BillingMode is PAY_PER_REQUEST&#x201d;</h3>
<div>
<div>
<pre class="source">ValidationException; One or more parameter values were invalid:
Neither ReadCapacityUnits nor WriteCapacityUnits can be specified when
BillingMode is PAY_PER_REQUEST
(Service: AmazonDynamoDBv2; Status Code: 400; Error Code: ValidationException)
</pre></div></div>
<p>On-Demand DynamoDB tables do not have any fixed capacity -it is an error to try to change it with the <tt>set-capacity</tt> command.</p></div>
<div class="section">
<h3><a name="MetadataPersistenceException"></a><tt>MetadataPersistenceException</tt></h3>
<p>A filesystem write operation failed to persist metadata to S3Guard. The file was successfully written to S3 and now the S3Guard metadata is likely to be out of sync.</p>
<p>See <a href="#fail-on-error">Fail on Error</a> for more detail.</p></div>
<div class="section">
<h3><a name="Error_RemoteFileChangedException"></a>Error <tt>RemoteFileChangedException</tt></h3>
<p>An exception like the following could occur for a couple of reasons:</p>
<ul>
<li>
<p>the S3Guard metadata is out of sync with the true S3 metadata. For example, the S3Guard DynamoDB table is tracking a different ETag than the ETag shown in the exception. This may suggest the object was updated in S3 without involvement from S3Guard or there was a transient failure when S3Guard tried to write to DynamoDB.</p>
</li>
<li>
<p>S3 is exhibiting read-after-overwrite temporary inconsistency. The S3Guard metadata was updated with a new ETag during a recent write, but the current read is not seeing that ETag due to S3 eventual consistency. This exception prevents the reader from an inconsistent read where the reader sees an older version of the file.</p>
</li>
</ul>
<div>
<div>
<pre class="source">org.apache.hadoop.fs.s3a.RemoteFileChangedException: open 's3a://my-bucket/test/file.txt':
Change reported by S3 while reading at position 0.
ETag 4e886e26c072fef250cfaf8037675405 was unavailable
at org.apache.hadoop.fs.s3a.impl.ChangeTracker.processResponse(ChangeTracker.java:167)
at org.apache.hadoop.fs.s3a.S3AInputStream.reopen(S3AInputStream.java:207)
at org.apache.hadoop.fs.s3a.S3AInputStream.lambda$lazySeek$1(S3AInputStream.java:355)
at org.apache.hadoop.fs.s3a.Invoker.lambda$retry$2(Invoker.java:195)
at org.apache.hadoop.fs.s3a.Invoker.once(Invoker.java:109)
at org.apache.hadoop.fs.s3a.Invoker.lambda$retry$3(Invoker.java:265)
at org.apache.hadoop.fs.s3a.Invoker.retryUntranslated(Invoker.java:322)
at org.apache.hadoop.fs.s3a.Invoker.retry(Invoker.java:261)
at org.apache.hadoop.fs.s3a.Invoker.retry(Invoker.java:193)
at org.apache.hadoop.fs.s3a.Invoker.retry(Invoker.java:215)
at org.apache.hadoop.fs.s3a.S3AInputStream.lazySeek(S3AInputStream.java:348)
at org.apache.hadoop.fs.s3a.S3AInputStream.read(S3AInputStream.java:381)
at java.io.FilterInputStream.read(FilterInputStream.java:83)
</pre></div></div>
</div>
<div class="section">
<h3><a name="Error_AWSClientIOException:_copyFile_caused_by_NullPointerException"></a>Error <tt>AWSClientIOException: copyFile</tt> caused by <tt>NullPointerException</tt></h3>
<p>The AWS SDK has an <a class="externalLink" href="https://github.com/aws/aws-sdk-java/issues/1644">issue</a> where it will throw a relatively generic <tt>AmazonClientException</tt> caused by <tt>NullPointerException</tt> when copying a file and specifying a precondition that cannot be met. This can bubble up from <tt>S3AFileSystem.rename()</tt>. It suggests that the file in S3 is inconsistent with the metadata in S3Guard.</p>
<div>
<div>
<pre class="source">org.apache.hadoop.fs.s3a.AWSClientIOException: copyFile(test/rename-eventually2.dat, test/dest2.dat) on test/rename-eventually2.dat: com.amazonaws.AmazonClientException: Unable to complete transfer: null: Unable to complete transfer: null
at org.apache.hadoop.fs.s3a.S3AUtils.translateException(S3AUtils.java:201)
at org.apache.hadoop.fs.s3a.Invoker.once(Invoker.java:111)
at org.apache.hadoop.fs.s3a.Invoker.lambda$retry$4(Invoker.java:314)
at org.apache.hadoop.fs.s3a.Invoker.retryUntranslated(Invoker.java:406)
at org.apache.hadoop.fs.s3a.Invoker.retry(Invoker.java:310)
at org.apache.hadoop.fs.s3a.Invoker.retry(Invoker.java:285)
at org.apache.hadoop.fs.s3a.S3AFileSystem.copyFile(S3AFileSystem.java:3034)
at org.apache.hadoop.fs.s3a.S3AFileSystem.innerRename(S3AFileSystem.java:1258)
at org.apache.hadoop.fs.s3a.S3AFileSystem.rename(S3AFileSystem.java:1119)
at org.apache.hadoop.fs.s3a.ITestS3ARemoteFileChanged.lambda$testRenameEventuallyConsistentFile2$6(ITestS3ARemoteFileChanged.java:556)
at org.apache.hadoop.test.LambdaTestUtils.intercept(LambdaTestUtils.java:498)
at org.apache.hadoop.fs.s3a.ITestS3ARemoteFileChanged.testRenameEventuallyConsistentFile2(ITestS3ARemoteFileChanged.java:554)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:50)
at org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:12)
at org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:47)
at org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:17)
at org.junit.internal.runners.statements.RunBefores.evaluate(RunBefores.java:26)
at org.junit.internal.runners.statements.RunAfters.evaluate(RunAfters.java:27)
at org.junit.rules.TestWatcher$1.evaluate(TestWatcher.java:55)
at org.junit.internal.runners.statements.FailOnTimeout$CallableStatement.call(FailOnTimeout.java:298)
at org.junit.internal.runners.statements.FailOnTimeout$CallableStatement.call(FailOnTimeout.java:292)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.lang.Thread.run(Thread.java:748)
Caused by: com.amazonaws.AmazonClientException: Unable to complete transfer: null
at com.amazonaws.services.s3.transfer.internal.AbstractTransfer.unwrapExecutionException(AbstractTransfer.java:286)
at com.amazonaws.services.s3.transfer.internal.AbstractTransfer.rethrowExecutionException(AbstractTransfer.java:265)
at com.amazonaws.services.s3.transfer.internal.CopyImpl.waitForCopyResult(CopyImpl.java:67)
at org.apache.hadoop.fs.s3a.impl.CopyOutcome.waitForCopy(CopyOutcome.java:72)
at org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$copyFile$14(S3AFileSystem.java:3047)
at org.apache.hadoop.fs.s3a.Invoker.once(Invoker.java:109)
... 25 more
Caused by: java.lang.NullPointerException
at com.amazonaws.services.s3.transfer.internal.CopyCallable.copyInOneChunk(CopyCallable.java:154)
at com.amazonaws.services.s3.transfer.internal.CopyCallable.call(CopyCallable.java:134)
at com.amazonaws.services.s3.transfer.internal.CopyMonitor.call(CopyMonitor.java:132)
at com.amazonaws.services.s3.transfer.internal.CopyMonitor.call(CopyMonitor.java:43)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
... 1 more
</pre></div></div>
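<p>To confirm whether the metadata store and S3 have actually diverged for the paths involved, the <tt>diff</tt> command can be run against the bucket (name hypothetical here, as the stack trace does not show it):</p>
<div>
<div>
<pre class="source">hadoop s3guard diff s3a://example-bucket/
</pre></div></div>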
</div>
<div class="section">
<h3><a name="Error_Attempt_to_change_a_resource_which_is_still_in_use:_Table_is_being_deleted"></a>Error <tt>Attempt to change a resource which is still in use: Table is being deleted</tt></h3>
<div>
<div>
<pre class="source">com.amazonaws.services.dynamodbv2.model.ResourceInUseException:
Attempt to change a resource which is still in use: Table is being deleted:
s3guard.test.testDynamoDBInitDestroy351245027
(Service: AmazonDynamoDBv2; Status Code: 400; Error Code: ResourceInUseException;)
</pre></div></div>
<p>You have attempted to call <tt>hadoop s3guard destroy</tt> on a table which is already being deleted. Wait for the deletion to complete; no further action is needed.</p>
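<p>If in doubt, the table status can be checked directly; a query such as the following, using the table name from the example above, reports <tt>DELETING</tt> while the deletion is in progress:</p>
<div>
<div>
<pre class="source">aws dynamodb describe-table \
  --table-name s3guard.test.testDynamoDBInitDestroy351245027 \
  --query Table.TableStatus
</pre></div></div></div></div>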
<div class="section">
<h2><a name="Other_Topics"></a>Other Topics</h2>
<p>For details on how to test S3Guard, see <a href="./testing.html#s3guard">Testing S3Guard</a>.</p></div>
</div>
</div>
<div class="clear">
<hr/>
</div>
<div id="footer">
<div class="xright">
&#169; 2008-2021
Apache Software Foundation
- <a href="http://maven.apache.org/privacy-policy.html">Privacy Policy</a>.
Apache Maven, Maven, Apache, the Apache feather logo, and the Apache Maven project logos are trademarks of The Apache Software Foundation.
</div>
<div class="clear">
<hr/>
</div>
</div>
</body>
</html>