blob: 77d7f9640e4e4b3d4d3bacef760efb1c752b64db [file] [log] [blame]
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<!--
| Generated by Apache Maven Doxia at 2024-05-29
| Rendered using Apache Maven Stylus Skin 1.5
-->
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Apache Hadoop 3.5.0-SNAPSHOT &#x2013; class org.apache.hadoop.fs.FutureDataInputStreamBuilder</title>
<style type="text/css" media="all">
@import url("../css/maven-base.css");
@import url("../css/maven-theme.css");
@import url("../css/site.css");
</style>
<link rel="stylesheet" href="../css/print.css" type="text/css" media="print" />
<meta name="Date-Revision-yyyymmdd" content="20240529" />
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
</head>
<body class="composite">
<div id="banner">
<a href="http://hadoop.apache.org/" id="bannerLeft">
<img src="http://hadoop.apache.org/images/hadoop-logo.jpg" alt="Apache Hadoop" />
</a>
<a href="http://www.apache.org/" id="bannerRight">
<img src="http://www.apache.org/images/asf_logo_wide.png" alt="The Apache Software Foundation" />
</a>
<div class="clear">
<hr/>
</div>
</div>
<div id="breadcrumbs">
<div class="xright"> <a href="http://wiki.apache.org/hadoop" class="externalLink">Wiki</a>
|
<a href="https://gitbox.apache.org/repos/asf/hadoop.git" class="externalLink">git</a>
|
<a href="http://hadoop.apache.org/" class="externalLink">Apache Hadoop</a>
&nbsp;| Last Published: 2024-05-29
&nbsp;| Version: 3.5.0-SNAPSHOT
</div>
<div class="clear">
<hr/>
</div>
</div>
<div id="leftColumn">
<div id="navcolumn">
<h5>General</h5>
<ul>
<li class="none">
<a href="../../../index.html">Overview</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/SingleCluster.html">Single Node Setup</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/ClusterSetup.html">Cluster Setup</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/CommandsManual.html">Commands Reference</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/FileSystemShell.html">FileSystem Shell</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/Compatibility.html">Compatibility Specification</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/DownstreamDev.html">Downstream Developer's Guide</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/AdminCompatibilityGuide.html">Admin Compatibility Guide</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/InterfaceClassification.html">Interface Classification</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/filesystem/index.html">FileSystem Specification</a>
</li>
</ul>
<h5>Common</h5>
<ul>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/CLIMiniCluster.html">CLI Mini Cluster</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/FairCallQueue.html">Fair Call Queue</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/NativeLibraries.html">Native Libraries</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/Superusers.html">Proxy User</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/RackAwareness.html">Rack Awareness</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/SecureMode.html">Secure Mode</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/ServiceLevelAuth.html">Service Level Authorization</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/HttpAuthentication.html">HTTP Authentication</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/CredentialProviderAPI.html">Credential Provider API</a>
</li>
<li class="none">
<a href="../../../hadoop-kms/index.html">Hadoop KMS</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/Tracing.html">Tracing</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/UnixShellGuide.html">Unix Shell Guide</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/registry/index.html">Registry</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/AsyncProfilerServlet.html">Async Profiler</a>
</li>
</ul>
<h5>HDFS</h5>
<ul>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HdfsDesign.html">Architecture</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HdfsUserGuide.html">User Guide</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HDFSCommands.html">Commands Reference</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HDFSHighAvailabilityWithQJM.html">NameNode HA With QJM</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HDFSHighAvailabilityWithNFS.html">NameNode HA With NFS</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/ObserverNameNode.html">Observer NameNode</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/Federation.html">Federation</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/ViewFs.html">ViewFs</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/ViewFsOverloadScheme.html">ViewFsOverloadScheme</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HdfsSnapshots.html">Snapshots</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HdfsEditsViewer.html">Edits Viewer</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HdfsImageViewer.html">Image Viewer</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HdfsPermissionsGuide.html">Permissions and HDFS</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HdfsQuotaAdminGuide.html">Quotas and HDFS</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/LibHdfs.html">libhdfs (C API)</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/WebHDFS.html">WebHDFS (REST API)</a>
</li>
<li class="none">
<a href="../../../hadoop-hdfs-httpfs/index.html">HttpFS</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/ShortCircuitLocalReads.html">Short Circuit Local Reads</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/CentralizedCacheManagement.html">Centralized Cache Management</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HdfsNfsGateway.html">NFS Gateway</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HdfsRollingUpgrade.html">Rolling Upgrade</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/ExtendedAttributes.html">Extended Attributes</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/TransparentEncryption.html">Transparent Encryption</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HdfsMultihoming.html">Multihoming</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/ArchivalStorage.html">Storage Policies</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/MemoryStorage.html">Memory Storage Support</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/SLGUserGuide.html">Synthetic Load Generator</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HDFSErasureCoding.html">Erasure Coding</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HDFSDiskbalancer.html">Disk Balancer</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HdfsUpgradeDomain.html">Upgrade Domain</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HdfsDataNodeAdminGuide.html">DataNode Admin</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs-rbf/HDFSRouterFederation.html">Router Federation</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/HdfsProvidedStorage.html">Provided Storage</a>
</li>
</ul>
<h5>MapReduce</h5>
<ul>
<li class="none">
<a href="../../../hadoop-mapreduce-client/hadoop-mapreduce-client-core/MapReduceTutorial.html">Tutorial</a>
</li>
<li class="none">
<a href="../../../hadoop-mapreduce-client/hadoop-mapreduce-client-core/MapredCommands.html">Commands Reference</a>
</li>
<li class="none">
<a href="../../../hadoop-mapreduce-client/hadoop-mapreduce-client-core/MapReduce_Compatibility_Hadoop1_Hadoop2.html">Compatibility with 1.x</a>
</li>
<li class="none">
<a href="../../../hadoop-mapreduce-client/hadoop-mapreduce-client-core/EncryptedShuffle.html">Encrypted Shuffle</a>
</li>
<li class="none">
<a href="../../../hadoop-mapreduce-client/hadoop-mapreduce-client-core/PluggableShuffleAndPluggableSort.html">Pluggable Shuffle/Sort</a>
</li>
<li class="none">
<a href="../../../hadoop-mapreduce-client/hadoop-mapreduce-client-core/DistributedCacheDeploy.html">Distributed Cache Deploy</a>
</li>
<li class="none">
<a href="../../../hadoop-mapreduce-client/hadoop-mapreduce-client-core/SharedCacheSupport.html">Support for YARN Shared Cache</a>
</li>
</ul>
<h5>MapReduce REST APIs</h5>
<ul>
<li class="none">
<a href="../../../hadoop-mapreduce-client/hadoop-mapreduce-client-core/MapredAppMasterRest.html">MR Application Master</a>
</li>
<li class="none">
<a href="../../../hadoop-mapreduce-client/hadoop-mapreduce-client-hs/HistoryServerRest.html">MR History Server</a>
</li>
</ul>
<h5>YARN</h5>
<ul>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/YARN.html">Architecture</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/YarnCommands.html">Commands Reference</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/CapacityScheduler.html">Capacity Scheduler</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/FairScheduler.html">Fair Scheduler</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/ResourceManagerRestart.html">ResourceManager Restart</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/ResourceManagerHA.html">ResourceManager HA</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/ResourceModel.html">Resource Model</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/NodeLabel.html">Node Labels</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/NodeAttributes.html">Node Attributes</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/WebApplicationProxy.html">Web Application Proxy</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/TimelineServer.html">Timeline Server</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/TimelineServiceV2.html">Timeline Service V.2</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/WritingYarnApplications.html">Writing YARN Applications</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/YarnApplicationSecurity.html">YARN Application Security</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/NodeManager.html">NodeManager</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/DockerContainers.html">Running Applications in Docker Containers</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/RuncContainers.html">Running Applications in runC Containers</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/NodeManagerCgroups.html">Using CGroups</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/SecureContainer.html">Secure Containers</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/ReservationSystem.html">Reservation System</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/GracefulDecommission.html">Graceful Decommission</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/OpportunisticContainers.html">Opportunistic Containers</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/Federation.html">YARN Federation</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/SharedCache.html">Shared Cache</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/UsingGpus.html">Using GPU</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/UsingFPGA.html">Using FPGA</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/PlacementConstraints.html">Placement Constraints</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/YarnUI2.html">YARN UI2</a>
</li>
</ul>
<h5>YARN REST APIs</h5>
<ul>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/WebServicesIntro.html">Introduction</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/ResourceManagerRest.html">Resource Manager</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/NodeManagerRest.html">Node Manager</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/TimelineServer.html#Timeline_Server_REST_API_v1">Timeline Server</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/TimelineServiceV2.html#Timeline_Service_v.2_REST_API">Timeline Service V.2</a>
</li>
</ul>
<h5>YARN Service</h5>
<ul>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/yarn-service/Overview.html">Overview</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/yarn-service/QuickStart.html">QuickStart</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/yarn-service/Concepts.html">Concepts</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/yarn-service/YarnServiceAPI.html">Yarn Service API</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/yarn-service/ServiceDiscovery.html">Service Discovery</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-site/yarn-service/SystemServices.html">System Services</a>
</li>
</ul>
<h5>Hadoop Compatible File Systems</h5>
<ul>
<li class="none">
<a href="../../../hadoop-aliyun/tools/hadoop-aliyun/index.html">Aliyun OSS</a>
</li>
<li class="none">
<a href="../../../hadoop-aws/tools/hadoop-aws/index.html">Amazon S3</a>
</li>
<li class="none">
<a href="../../../hadoop-azure/index.html">Azure Blob Storage</a>
</li>
<li class="none">
<a href="../../../hadoop-azure-datalake/index.html">Azure Data Lake Storage</a>
</li>
<li class="none">
<a href="../../../hadoop-cos/cloud-storage/index.html">Tencent COS</a>
</li>
<li class="none">
<a href="../../../hadoop-huaweicloud/cloud-storage/index.html">Huaweicloud OBS</a>
</li>
</ul>
<h5>Auth</h5>
<ul>
<li class="none">
<a href="../../../hadoop-auth/index.html">Overview</a>
</li>
<li class="none">
<a href="../../../hadoop-auth/Examples.html">Examples</a>
</li>
<li class="none">
<a href="../../../hadoop-auth/Configuration.html">Configuration</a>
</li>
<li class="none">
<a href="../../../hadoop-auth/BuildingIt.html">Building</a>
</li>
</ul>
<h5>Tools</h5>
<ul>
<li class="none">
<a href="../../../hadoop-streaming/HadoopStreaming.html">Hadoop Streaming</a>
</li>
<li class="none">
<a href="../../../hadoop-archives/HadoopArchives.html">Hadoop Archives</a>
</li>
<li class="none">
<a href="../../../hadoop-archive-logs/HadoopArchiveLogs.html">Hadoop Archive Logs</a>
</li>
<li class="none">
<a href="../../../hadoop-distcp/DistCp.html">DistCp</a>
</li>
<li class="none">
<a href="../../../hadoop-federation-balance/HDFSFederationBalance.html">HDFS Federation Balance</a>
</li>
<li class="none">
<a href="../../../hadoop-gridmix/GridMix.html">GridMix</a>
</li>
<li class="none">
<a href="../../../hadoop-rumen/Rumen.html">Rumen</a>
</li>
<li class="none">
<a href="../../../hadoop-resourceestimator/ResourceEstimator.html">Resource Estimator Service</a>
</li>
<li class="none">
<a href="../../../hadoop-sls/SchedulerLoadSimulator.html">Scheduler Load Simulator</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/Benchmarking.html">Hadoop Benchmarking</a>
</li>
<li class="none">
<a href="../../../hadoop-dynamometer/Dynamometer.html">Dynamometer</a>
</li>
</ul>
<h5>Reference</h5>
<ul>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/release/">Changelog and Release Notes</a>
</li>
<li class="none">
<a href="../../../api/index.html">Java API docs</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/UnixShellAPI.html">Unix Shell API</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/Metrics.html">Metrics</a>
</li>
</ul>
<h5>Configuration</h5>
<ul>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/core-default.xml">core-default.xml</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs/hdfs-default.xml">hdfs-default.xml</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-hdfs-rbf/hdfs-rbf-default.xml">hdfs-rbf-default.xml</a>
</li>
<li class="none">
<a href="../../../hadoop-mapreduce-client/hadoop-mapreduce-client-core/mapred-default.xml">mapred-default.xml</a>
</li>
<li class="none">
<a href="../../../hadoop-yarn/hadoop-yarn-common/yarn-default.xml">yarn-default.xml</a>
</li>
<li class="none">
<a href="../../../hadoop-kms/kms-default.html">kms-default.xml</a>
</li>
<li class="none">
<a href="../../../hadoop-hdfs-httpfs/httpfs-default.html">httpfs-default.xml</a>
</li>
<li class="none">
<a href="../../../hadoop-project-dist/hadoop-common/DeprecatedProperties.html">Deprecated Properties</a>
</li>
</ul>
<a href="http://maven.apache.org/" title="Built by Maven" class="poweredBy">
<img alt="Built by Maven" src="../images/logos/maven-feather.png"/>
</a>
</div>
</div>
<div id="bodyColumn">
<div id="contentBox">
<!---
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- ============================================================= -->
<!-- CLASS: FutureDataInputStreamBuilder -->
<!-- ============================================================= -->
<h1>class <code>org.apache.hadoop.fs.FutureDataInputStreamBuilder</code></h1>
<ul>
<li><a href="#History"> History</a>
<ul>
<li><a href="#Hadoop_3.3.0:_API_introduced">Hadoop 3.3.0: API introduced</a></li>
<li><a href="#Hadoop_3.3.5:_standardization_and_expansion">Hadoop 3.3.5: standardization and expansion</a></li>
<li><a href="#Hadoop_3.3.6:_API_change_to_address_operator_overload_bugs.">Hadoop 3.3.6: API change to address operator overload bugs.</a></li></ul></li>
<li><a href="#Invariants">Invariants</a></li>
<li><a href="#a.60Implementation-agnostic_parameters."> Implementation-agnostic parameters.</a>
<ul>
<li><a href="#FutureDataInputStreamBuilder_bufferSize.28int_bufSize.29"> FutureDataInputStreamBuilder bufferSize(int bufSize)</a></li>
<li><a href="#FutureDataInputStreamBuilder_withFileStatus.28FileStatus_status.29"> FutureDataInputStreamBuilder withFileStatus(FileStatus status)</a></li>
<li><a href="#Set_optional_or_mandatory_parameters"> Set optional or mandatory parameters</a></li>
<li><a href="#When_to_use_opt_versus_must"> When to use opt versus must</a></li></ul></li>
<li><a href="#Implementation_Notes"> Implementation Notes</a></li>
<li><a href="#Builder_interface"> Builder interface</a>
<ul>
<li><a href="#CompletableFuture.3CFSDataInputStream.3E_build.28.29"> CompletableFuture&lt;FSDataInputStream&gt; build()</a></li></ul></li>
<li><a href="#Standard_openFile.28.29_options_since_hadoop_branch-3.3"> Standard openFile() options since hadoop branch-3.3</a>
<ul>
<li><a href="#Option:_fs.option.openfile.buffer.size"> Option: fs.option.openfile.buffer.size</a></li>
<li><a href="#Option:_fs.option.openfile.read.policy"> Option: fs.option.openfile.read.policy</a></li>
<li><a href="#Option:_fs.option.openfile.length"> Option: fs.option.openfile.length</a></li>
<li><a href="#Options:_fs.option.openfile.split.start_and_fs.option.openfile.split.end"> Options: fs.option.openfile.split.start and fs.option.openfile.split.end</a></li></ul></li>
<li><a href="#S3A-specific_options"> S3A-specific options</a></li>
<li><a href="#ABFS-specific_options"> ABFS-specific options</a></li>
<li><a href="#Examples"> Examples</a>
<ul>
<li><a href="#a"></a></li>
<li><a href="#Opening_a_file_with_both_standard_and_non-standard_options">Opening a file with both standard and non-standard options</a></li>
<li><a href="#Opening_a_file_with_older_releases">Opening a file with older releases</a></li>
<li><a href="#Passing_options_in_to_MapReduce">Passing options in to MapReduce</a></li>
<li><a href="#MapReduce_input_format_propagating_options">MapReduce input format propagating options</a></li>
<li><a href="#FileContext.openFile">FileContext.openFile</a></li>
<li><a href="#Example:_reading_a_whole_file">Example: reading a whole file</a></li></ul></li></ul>
<p>An interface offering the Builder pattern for creating Java <code>Future</code> references to <code>FSDataInputStream</code> and its subclasses. It is used to initiate a (potentially asynchronous) operation to open an existing file for reading.</p><section>
<h2><a name="History"></a> History</h2><section>
<h3><a name="Hadoop_3.3.0:_API_introduced"></a>Hadoop 3.3.0: API introduced</h3>
<p><a class="externalLink" href="https://issues.apache.org/jira/browse/HADOOP-15229">HADOOP-15229</a> <i>Add FileSystem builder-based openFile() API to match createFile()</i></p>
<ul>
<li>No <code>opt(String key, long value)</code> method was available.</li>
<li>the <code>withFileStatus(status)</code> call required a non-null parameter.</li>
<li>Sole Filesystem to process options and file status was S3A;</li>
<li>The only S3A-specific options were those for S3 Select and <code>fs.s3a.experimental.input.fadvise</code>.</li>
<li>S3A Filesystem raised <code>IllegalArgumentException</code> if a file status was passed in and the path of the filestatus did not match the path of the <code>openFile(path)</code> call.</li>
</ul>
<p>This is the baseline implementation. To write code guaranteed to compile against this version, use the <code>opt(String, String)</code> and <code>must(String, String)</code> methods, converting numbers to string explicitly.</p>
<div class="source">
<div class="source">
<pre>fs.openFile(new Path(&quot;s3a://bucket/file&quot;))
  .opt(&quot;fs.option.openfile.length&quot;, Long.toString(length))
  .build().get()
</pre></div></div>
</section><section>
<h3><a name="Hadoop_3.3.5:_standardization_and_expansion"></a>Hadoop 3.3.5: standardization and expansion</h3>
<p><a class="externalLink" href="https://issues.apache.org/jira/browse/HADOOP-16202">HADOOP-16202</a> <i>Enhance openFile() for better read performance against object stores</i></p>
<ul>
<li><code>withFileStatus(null)</code> required to be accepted (and ignored)</li>
<li>only the filename part of any supplied FileStatus path must match the filename passed in on <code>openFile(path)</code>.</li>
<li>An <code>opt(String key, long value)</code> option was added. <i>This is now deprecated as it caused regressions.</i></li>
<li>Standard <code>fs.option.openfile</code> options defined.</li>
<li>S3A FS to use openfile length option, seek start/end options not <i>yet</i> used.</li>
<li>Azure ABFS connector takes a supplied <code>VersionedFileStatus</code> and omits any HEAD probe for the object.</li>
</ul></section><section>
<h3><a name="Hadoop_3.3.6:_API_change_to_address_operator_overload_bugs."></a>Hadoop 3.3.6: API change to address operator overload bugs.</h3>
<p>new <code>optLong()</code>, <code>optDouble()</code>, <code>mustLong()</code> and <code>mustDouble()</code> builder methods.</p>
<ul>
<li>See <a class="externalLink" href="https://issues.apache.org/jira/browse/HADOOP-18724">HADOOP-18724</a> <i>Open file fails with NumberFormatException for S3AFileSystem</i>, which was somehow caused by the overloaded <code>opt(long)</code>.</li>
<li>Specification updated to declare that unparseable numbers MUST be treated as &#x201c;unset&#x201d; and the default value used instead.</li>
</ul></section></section><section>
<h2><a name="Invariants"></a>Invariants</h2>
<p>The <code>FutureDataInputStreamBuilder</code> interface does not require parameters or the state of <code>FileSystem</code> to be validated until <a href="#build"><code>build()</code></a> is invoked and/or during the asynchronous open operation itself.</p>
<p>Some aspects of the state of the filesystem MAY be checked in the initial <code>openFile()</code> call, provided they are known to be invariants which will not change between <code>openFile()</code> and the <code>build().get()</code> sequence. For example, path validation.</p></section><section>
<h2><a name="a.60Implementation-agnostic_parameters."></a><a name="parameters"></a> Implementation-agnostic parameters.</h2><section>
<h3><a name="FutureDataInputStreamBuilder_bufferSize.28int_bufSize.29"></a><a name="Builder.bufferSize"></a> <code>FutureDataInputStreamBuilder bufferSize(int bufSize)</code></h3>
<p>Set the size of the buffer to be used.</p></section><section>
<h3><a name="FutureDataInputStreamBuilder_withFileStatus.28FileStatus_status.29"></a><a name="Builder.withFileStatus"></a> <code>FutureDataInputStreamBuilder withFileStatus(FileStatus status)</code></h3>
<p>A <code>FileStatus</code> instance which refers to the file being opened.</p>
<p>This MAY be used by implementations to short-circuit checks for the file, so potentially saving on remote calls, especially to object stores.</p>
<p>Requirements:</p>
<ul>
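<li><code>status</code> MAY be <code>null</code>, in which case it is ignored (releases before Hadoop 3.3.5 required <code>status != null</code>).</li>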
<li><code>status.getPath().getName()</code> == the name of the file being opened.</li>
</ul>
<p>The path validation MUST take place if the store uses the <code>FileStatus</code> when it opens files, and MAY be performed otherwise. The validation SHOULD be postponed until the <code>build()</code> operation.</p>
<p>This operation should be considered a hint to the filesystem.</p>
<p>If a filesystem implementation extends the <code>FileStatus</code> returned in its listing and status operations, the implementation MAY use this extra information when opening the file.</p>
<p>This is relevant with those stores which return version/etag information: they MAY use this to guarantee that the file they opened is exactly the one returned in the listing.</p>
<p>The final <code>status.getPath().getName()</code> element of the supplied status MUST equal the name value of the path supplied to the <code>openFile(path)</code> call.</p>
<p>Filesystems MUST NOT validate the rest of the path. This is needed to support viewfs and other mount-point wrapper filesystems where schemas and paths are different. These often create their own <code>FileStatus</code> results.</p>
<p>Preconditions</p>
<div class="source">
<div class="source">
<pre>status == null or status.getPath().getName() == path.getName()
</pre></div></div>
<p>Filesystems MUST NOT require the class of <code>status</code> to equal that of any specific subclass their implementation returns in filestatus/list operations. This is to support wrapper filesystems and serialization/deserialization of the status.</p></section><section>
<h3><a name="Set_optional_or_mandatory_parameters"></a><a name="optional"></a> Set optional or mandatory parameters</h3>
<div class="source">
<div class="source">
<pre>FutureDataInputStreamBuilder opt(String key, String value)
FutureDataInputStreamBuilder opt(String key, int value)
FutureDataInputStreamBuilder opt(String key, boolean value)
FutureDataInputStreamBuilder optLong(String key, long value)
FutureDataInputStreamBuilder optDouble(String key, double value)
FutureDataInputStreamBuilder must(String key, String value)
FutureDataInputStreamBuilder must(String key, int value)
FutureDataInputStreamBuilder must(String key, boolean value)
FutureDataInputStreamBuilder mustLong(String key, long value)
FutureDataInputStreamBuilder mustDouble(String key, double value)
</pre></div></div>
<p>Set optional or mandatory parameters to the builder. Using <code>opt()</code> or <code>must()</code>, a client can specify FS-specific parameters without inspecting the concrete type of <code>FileSystem</code>.</p>
<p>Example:</p>
<div class="source">
<div class="source">
<pre>out = fs.openFile(path)
    .must(&quot;fs.option.openfile.read.policy&quot;, &quot;random&quot;)
    .optLong(&quot;fs.http.connection.timeout&quot;, 30_000L)
    .withFileStatus(statusFromListing)
    .build()
    .get();
</pre></div></div>
<p>Here the read policy of <code>random</code> has been specified, with the requirement that the filesystem implementation must understand the option. An http-specific option has been supplied which may be interpreted by any store; if the filesystem opening the file does not recognize the option, it can safely be ignored.</p></section><section>
<h3><a name="When_to_use_opt_versus_must"></a><a name="usage"></a> When to use <code>opt</code> versus <code>must</code></h3>
<p>The difference between <code>opt</code> and <code>must</code> is how the FileSystem opening the file must react to an option which it does not recognize.</p>
<div class="source">
<div class="source">
<pre>def must(name, value):
  if not name in known_keys:
    raise IllegalArgumentException
  if not name in supported_keys:
    raise UnsupportedException

def opt(name, value):
  if not name in known_keys:
    # ignore option
</pre></div></div>
<p>For any known key, the validation of the <code>value</code> argument MUST be the same irrespective of how the (key, value) pair was declared.</p>
<ol style="list-style-type: decimal">
<li>For a filesystem-specific option, it is the choice of the implementation how to validate the entry.</li>
<li>For standard options, the specification of what is a valid <code>value</code> is defined in this filesystem specification, validated through contract tests.</li>
</ol></section></section><section>
<h2><a name="Implementation_Notes"></a><a name="implementation"></a> Implementation Notes</h2>
<p>Checking for supported options must be performed in the <code>build()</code> operation.</p>
<ol style="list-style-type: decimal">
<li>
<p>If a mandatory parameter declared via <code>must(key, value)</code> is not recognized, <code>IllegalArgumentException</code> MUST be thrown.</p>
</li>
<li>
<p>If a mandatory parameter declared via <code>must(key, value)</code> relies on a feature which is recognized but not supported in the specific <code>FileSystem</code>/<code>FileContext</code> instance, <code>UnsupportedException</code> MUST be thrown.</p>
</li>
</ol>
<p>Parsing of numeric values SHOULD trim any string and if the value cannot be parsed as a number, downgrade to any default value supplied. This is to address <a class="externalLink" href="https://issues.apache.org/jira/browse/HADOOP-18724">HADOOP-18724</a> <i>Open file fails with NumberFormatException for S3AFileSystem</i>, which was caused by the overloaded <code>opt()</code> builder parameter binding to <code>opt(String, double)</code> rather than <code>opt(String, long)</code> when a long value was passed in.</p>
<p>The behavior of resolving the conflicts between the parameters set by builder methods (i.e., <code>bufferSize()</code>) and <code>opt()</code>/<code>must()</code> is as follows:</p>
<blockquote>
<p>The last option specified defines the value and its optional/mandatory state.</p>
</blockquote>
<p>If the <code>FileStatus</code> option passed in <code>withFileStatus()</code> is used, implementations MUST accept all subclasses of <code>FileStatus</code>, including <code>LocatedFileStatus</code>, rather than just any FS-specific subclass implemented by the implementation (e.g. <code>S3AFileStatus</code>). They MAY simply ignore those which are not the custom subclasses.</p>
<p>This is critical to ensure safe use of the feature: directory listing/ status serialization/deserialization can result in the <code>withFileStatus()</code> argument not being the custom subclass returned by the Filesystem instance&#x2019;s own <code>getFileStatus()</code>, <code>listFiles()</code>, <code>listLocatedStatus()</code> calls, etc.</p>
<p>In such a situation the implementations must:</p>
<ol style="list-style-type: decimal">
<li>Verify that <code>status.getPath().getName()</code> matches the current <code>path.getName()</code> value. The rest of the path MUST NOT be validated.</li>
<li>Use any status fields as desired -for example the file length.</li>
</ol>
<p>Even if no values of the status are used, the presence of the argument can be interpreted as the caller declaring that they believe the file to be present and of the given size.</p></section><section>
<h2><a name="Builder_interface"></a><a name="builder"></a> Builder interface</h2><section>
<h3><a name="CompletableFuture.3CFSDataInputStream.3E_build.28.29"></a><a name="build"></a> <code>CompletableFuture&lt;FSDataInputStream&gt; build()</code></h3>
<p>Return a <code>CompletableFuture&lt;FSDataInputStream&gt;</code> which, when successfully completed, returns an input stream which can read data from the filesystem.</p>
<p>The <code>build()</code> operation MAY perform the validation of the file&#x2019;s existence, its kind, so rejecting attempts to read from a directory or non-existent file. Alternatively:</p>
<ul>
<li>file existence/status checks MAY be performed asynchronously within the returned <code>CompletableFuture&lt;&gt;</code>.</li>
<li>file existence/status checks MAY be postponed until the first byte is read in any of the read operations such as <code>read()</code> or <code>PositionedRead</code>.</li>
</ul>
<p>That is, the preconditions <code>exists(FS, path)</code> and <code>isFile(FS, path)</code> are only guaranteed to have been met after <code>get()</code> has been called on the returned future and an attempt has been made to read the stream.</p>
<p>Thus, even when the file does not exist, or is a directory rather than a file, the following call MUST succeed, returning a <code>CompletableFuture</code> to be evaluated.</p>
<div class="source">
<div class="source">
<pre>Path p = new Path(&quot;file://tmp/file-which-does-not-exist&quot;);
CompletableFuture&lt;FSDataInputStream&gt; future = p.getFileSystem(conf)
.openFile(p)
.build();
</pre></div></div>
<p>The inability to access/read a file MUST raise an <code>IOException</code> or subclass in either the future&#x2019;s <code>get()</code> call, or, for late binding operations, when an operation to read data is invoked.</p>
<p>Therefore the following sequence SHALL fail when invoked on the <code>future</code> returned by the previous example.</p>
<div class="source">
<div class="source">
<pre> future.get().read();
</pre></div></div>
<p>Access permission checks have the same visibility requirements: permission failures MUST be delayed until the <code>get()</code> call and MAY be delayed into subsequent operations.</p>
<p>Note: some operations on the input stream, such as <code>seek()</code> may not attempt any IO at all. Such operations MAY NOT raise exceptions when interacting with nonexistent/unreadable files.</p></section></section><section>
<h2><a name="Standard_openFile.28.29_options_since_hadoop_branch-3.3"></a><a name="options"></a> Standard <code>openFile()</code> options since hadoop branch-3.3</h2>
<p>These are options which <code>FileSystem</code> and <code>FileContext</code> implementations MUST recognise and MAY support by changing the behavior of their input streams as appropriate.</p>
<p>Hadoop 3.3.0 added the <code>openFile()</code> API; these standard options were defined in a later release. Therefore, although they are &#x201c;well known&#x201d;, unless confident that the application will only be executed against releases of Hadoop which know of the options, applications SHOULD set the options via <code>opt()</code> calls rather than <code>must()</code>.</p>
<p>When opening a file through the <code>openFile()</code> builder API, callers MAY use both <code>.opt(key, value)</code> and <code>.must(key, value)</code> calls to set standard and filesystem-specific options.</p>
<p>If set as an <code>opt()</code> parameter, unsupported &#x201c;standard&#x201d; options MUST be ignored, as MUST unrecognized standard options.</p>
<p>If set as a <code>must()</code> parameter, unsupported &#x201c;standard&#x201d; options MUST be ignored. Unrecognized standard options MUST be rejected.</p>
<p>The standard <code>openFile()</code> options are defined in <code>org.apache.hadoop.fs.Options.OpenFileOptions</code>; they all SHALL start with <code>fs.option.openfile.</code>.</p>
<p>Note that while all <code>FileSystem</code>/<code>FileContext</code> instances SHALL support these options to the extent that <code>must()</code> declarations SHALL NOT fail, the implementations MAY support them to the extent of interpreting the values. This means that it is not a requirement for the stores to actually read the read policy or file length values and use them when opening files.</p>
<p>Unless otherwise stated, they SHOULD be viewed as hints.</p>
<p>Note: if a standard option is added such that if set but not supported would be an error, then implementations SHALL reject it. For example, the S3A filesystem client supports the ability to push down SQL commands. If something like that were ever standardized, then the use of the option, either in <code>opt()</code> or <code>must()</code> argument MUST be rejected for filesystems which don&#x2019;t support the feature.</p><section>
<h3><a name="Option:_fs.option.openfile.buffer.size"></a><a name="buffer.size"></a> Option: <code>fs.option.openfile.buffer.size</code></h3>
<p>Read buffer size in bytes.</p>
<p>This overrides the default value set in the configuration with the option <code>io.file.buffer.size</code>.</p>
<p>It is supported by all filesystem clients which allow for stream-specific buffer sizes to be set via <code>FileSystem.open(path, buffersize)</code>.</p></section><section>
<h3><a name="Option:_fs.option.openfile.read.policy"></a><a name="read.policy"></a> Option: <code>fs.option.openfile.read.policy</code></h3>
<p>Declare the read policy of the input stream. This is a hint as to what the expected read pattern of an input stream will be. This MAY control readahead, buffering and other optimizations.</p>
<p>Sequential reads may be optimized with prefetching data and/or reading data in larger blocks. Some applications (e.g. distCp) perform sequential IO even over columnar data.</p>
<p>In contrast, random IO reads data in different parts of the file using a sequence of <code>seek()/read()</code> or via the <code>PositionedReadable</code> or <code>ByteBufferPositionedReadable</code> APIs.</p>
<p>Random IO performance may be best if little/no prefetching takes place, along with other possible optimizations.</p>
<p>Queries over columnar formats such as Apache ORC and Apache Parquet perform such random IO; other data formats may be best read with sequential or whole-file policies.</p>
<p>What is key is that optimizing reads for sequential reads may impair random performance -and vice versa.</p>
<ol style="list-style-type: decimal">
<li>The seek policy is a hint; even if declared as a <code>must()</code> option, the filesystem MAY ignore it.</li>
<li>The interpretation/implementation of a policy is a filesystem specific behavior -and it may change with Hadoop releases and/or specific storage subsystems.</li>
<li>If a policy is not recognized, the filesystem client MUST ignore it.</li>
</ol>
<table border="0" class="bodyTable">
<thead>
<tr class="a">
<th> Policy </th>
<th> Meaning </th></tr>
</thead><tbody>
<tr class="b">
<td> <code>adaptive</code> </td>
<td> Any adaptive policy implemented by the store. </td></tr>
<tr class="a">
<td> <code>default</code> </td>
<td> The default policy for this store. Generally &#x201c;adaptive&#x201d;. </td></tr>
<tr class="b">
<td> <code>random</code> </td>
<td> Optimize for random access. </td></tr>
<tr class="a">
<td> <code>sequential</code> </td>
<td> Optimize for sequential access. </td></tr>
<tr class="b">
<td> <code>vector</code> </td>
<td> The Vectored IO API is intended to be used. </td></tr>
<tr class="a">
<td> <code>whole-file</code> </td>
<td> The whole file will be read. </td></tr>
</tbody>
</table>
<p>Choosing the wrong read policy for an input source may be inefficient.</p>
<p>A list of read policies MAY be supplied; the first one recognized/supported by the filesystem SHALL be the one used. This allows for custom policies to be supported, for example an <code>hbase-hfile</code> policy optimized for HBase HFiles.</p>
<p>The S3A and ABFS input streams both implement the <a href="iostatistics.html">IOStatisticsSource</a> API, and can be queried for their IO Performance.</p>
<p><i>Tip:</i> log the <code>toString()</code> value of input streams at <code>DEBUG</code>. The S3A and ABFS Input Streams log read statistics, which can provide insight about whether reads are being performed efficiently or not.</p>
<p><i>Further reading</i></p>
<ul>
<li><a class="externalLink" href="https://linux.die.net/man/2/fadvise">Linux fadvise()</a>.</li>
<li><a class="externalLink" href="https://docs.microsoft.com/en-us/windows/win32/api/fileapi/nf-fileapi-createfilea#caching-behavior">Windows <code>CreateFile()</code></a></li>
</ul><section>
<h4><a name="Read_Policy_adaptive"></a><a name="read.policy.adaptive"></a> Read Policy <code>adaptive</code></h4>
<p>Try to adapt the seek policy to the read pattern of the application.</p>
<p>The <code>normal</code> policy of the S3A client and the sole policy supported by the <code>wasb:</code> client are both adaptive -they assume sequential IO, but once a backwards seek/positioned read call is made the stream switches to random IO.</p>
<p>Other filesystem implementations may wish to adopt similar strategies, and/or extend the algorithms to detect forward seeks and/or switch from random to sequential IO if that is considered more efficient.</p>
<p>Adaptive read policies compensate for the absence of the ability to declare the seek policy in the <code>open()</code> API, which requires it to be declared, if configurable, in the cluster/application configuration. However, the switch from sequential to random seek policies may be expensive.</p>
<p>When applications explicitly set the <code>fs.option.openfile.read.policy</code> option, if they know their read plan, they SHOULD declare which policy is most appropriate.</p></section><section>
<h4><a name="Read_Policy_.60.60"></a><a name="read.policy.default"></a> Read Policy <code>default</code></h4>
<p>The default policy for the filesystem instance. Implementation/installation-specific.</p></section><section>
<h4><a name="Read_Policy_sequential"></a><a name="read.policy.sequential"></a> Read Policy <code>sequential</code></h4>
<p>Expect sequential reads from the first byte read to the end of the file/until the stream is closed.</p></section><section>
<h4><a name="Read_Policy_random"></a><a name="read.policy.random"></a> Read Policy <code>random</code></h4>
<p>Expect <code>seek()/read()</code> sequences, or use of <code>PositionedReadable</code> or <code>ByteBufferPositionedReadable</code> APIs.</p></section><section>
<h4><a name="Read_Policy_vector"></a><a name="read.policy.vector"></a> Read Policy <code>vector</code></h4>
<p>This declares that the caller intends to use the Vectored read API of <a class="externalLink" href="https://issues.apache.org/jira/browse/HADOOP-11867">HADOOP-11867</a> <i>Add a high-performance vectored read API</i>.</p>
<p>This is a hint: it is not a requirement when using the API. It does inform the implementations that the stream should be configured for optimal vectored IO performance, if such a feature has been implemented.</p>
<p>It is <i>not</i> exclusive: the same stream may still be used for classic <code>InputStream</code> and <code>PositionedRead</code> API calls. Implementations SHOULD use the <code>random</code> read policy with these operations.</p></section><section>
<h4><a name="Read_Policy_whole-file"></a><a name="read.policy.whole-file"></a> Read Policy <code>whole-file</code></h4>
<p>This declares that the whole file is to be read end-to-end; the file system client is free to enable whatever strategies maximise performance for this. In particular, larger ranged reads/GETs can deliver high bandwidth by reducing socket/TLS setup costs and providing a connection long-lived enough for TCP flow control to determine the optimal download rate.</p>
<p>Strategies can include:</p>
<ul>
<li>Initiate an HTTP GET of the entire file in <code>openFile()</code> operation.</li>
<li>Prefetch data in large blocks, possibly in parallel read operations.</li>
</ul>
<p>Applications which know that the entire file is to be read from an opened stream SHOULD declare this read policy.</p></section></section><section>
<h3><a name="Option:_fs.option.openfile.length"></a><a name="openfile.length"></a> Option: <code>fs.option.openfile.length</code></h3>
<p>Declare the length of a file.</p>
<p>This can be used by clients to skip querying a remote store for the size of/existence of a file when opening it, similar to declaring a file status through the <code>withFileStatus()</code> option.</p>
<p>If supported by a filesystem connector, this option MUST be interpreted as declaring the minimum length of the file:</p>
<ol style="list-style-type: decimal">
<li>If the value is negative, the option SHALL be considered unset.</li>
<li>It SHALL NOT be an error if the actual length of the file is greater than this value.</li>
<li><code>read()</code>, <code>seek()</code> and positioned read calls MAY use a position across/beyond this length but below the actual length of the file. Implementations MAY raise <code>EOFExceptions</code> in such cases, or they MAY return data.</li>
</ol>
<p>If this option is used by the FileSystem implementation, the following notes apply:</p>
<p><i>Implementor&#x2019;s Notes</i></p>
<ul>
<li>A value of <code>fs.option.openfile.length</code> &lt; 0 MUST be ignored.</li>
<li>If a file status is supplied along with a value in <code>fs.option.openfile.length</code>, the file status values take precedence.</li>
</ul></section><section>
<h3><a name="Options:_fs.option.openfile.split.start_and_fs.option.openfile.split.end"></a><a name="split.start"></a> Options: <code>fs.option.openfile.split.start</code> and <code>fs.option.openfile.split.end</code></h3>
<p>Declare the start and end of the split when a file has been split for processing in pieces.</p>
<ol style="list-style-type: decimal">
<li>If a value is negative, the option SHALL be considered unset.</li>
<li>Filesystems MAY assume that the length of the file is greater than or equal to the value of <code>fs.option.openfile.split.end</code>.</li>
<li>And that they MAY raise an exception if the client application reads past the value set in <code>fs.option.openfile.split.end</code>.</li>
<li>The pair of options MAY be used to optimise the read plan, such as setting the content range for GET requests, or using the split end as an implicit declaration of the guaranteed minimum length of the file.</li>
<li>If both options are set, and the split start is declared as greater than the split end, then the split start SHOULD just be reset to zero, rather than rejecting the operation.</li>
</ol>
<p>The split end value can provide a hint as to the end of the input stream. The split start can be used to optimize any initial read offset for filesystem clients.</p>
<p><i>Note for implementors:</i> applications will read past the end of a split when they need to read to the end of a record/line which begins before the end of the split.</p>
<p>Therefore clients MUST be allowed to <code>seek()</code>/<code>read()</code> past the length set in <code>fs.option.openfile.split.end</code> if the file is actually longer than that value.</p></section></section><section>
<h2><a name="S3A-specific_options"></a><a name="s3a"></a> S3A-specific options</h2>
<p>The S3A Connector supports custom options for readahead and seek policy.</p>
<table border="0" class="bodyTable">
<thead>
<tr class="a">
<th> Name </th>
<th> Type </th>
<th> Meaning </th></tr>
</thead><tbody>
<tr class="b">
<td> <code>fs.s3a.readahead.range</code> </td>
<td> <code>long</code> </td>
<td> readahead range in bytes </td></tr>
<tr class="a">
<td> <code>fs.s3a.experimental.input.fadvise</code> </td>
<td> <code>String</code> </td>
<td> seek policy. Superseded by <code>fs.option.openfile.read.policy</code> </td></tr>
<tr class="b">
<td> <code>fs.s3a.input.async.drain.threshold</code> </td>
<td> <code>long</code> </td>
<td> threshold to switch to asynchronous draining of the stream. (Since 3.3.5) </td></tr>
</tbody>
</table>
<p>If the option set contains a SQL statement in the <code>fs.s3a.select.sql</code> option, then the file is opened as an S3 Select query. Consult the S3A documentation for more details.</p></section><section>
<h2><a name="ABFS-specific_options"></a><a name="abfs"></a> ABFS-specific options</h2>
<p>The ABFS Connector supports custom input stream options.</p>
<table border="0" class="bodyTable">
<thead>
<tr class="a">
<th> Name </th>
<th> Type </th>
<th> Meaning </th></tr>
</thead><tbody>
<tr class="b">
<td> <code>fs.azure.buffered.pread.disable</code> </td>
<td> <code>boolean</code> </td>
<td> disable caching on the positioned read operations. </td></tr>
</tbody>
</table>
<p>Disables caching on data read through the <a href="fsdatainputstream.html#PositionedReadable">PositionedReadable</a> APIs.</p>
<p>Consult the ABFS Documentation for more details.</p></section><section>
<h2><a name="Examples"></a><a name="examples"></a> Examples</h2><section><section>
<h4><a name="Declaring_seek_policy_and_split_limits_when_opening_a_file."></a>Declaring seek policy and split limits when opening a file.</h4>
<p>Here is an example from a proof of concept <code>org.apache.parquet.hadoop.util.HadoopInputFile</code> reader which uses a (nullable) file status and a split start/end.</p>
<p>The <code>FileStatus</code> value is always passed in -but if it is null, then the split end is used to declare the length of the file.</p>
<div class="source">
<div class="source">
<pre>protected SeekableInputStream newStream(Path path, FileStatus stat,
long splitStart, long splitEnd)
throws IOException {
FutureDataInputStreamBuilder builder = fs.openFile(path)
.opt(&quot;fs.option.openfile.read.policy&quot;, &quot;vector, random&quot;)
.withFileStatus(stat);
builder.optLong(&quot;fs.option.openfile.split.start&quot;, splitStart);
builder.optLong(&quot;fs.option.openfile.split.end&quot;, splitEnd);
CompletableFuture&lt;FSDataInputStream&gt; streamF = builder.build();
return HadoopStreams.wrap(FutureIO.awaitFuture(streamF));
}
</pre></div></div>
<p>As a result, whether driven directly by a file listing, or when opening a file from a query plan of <code>(path, splitStart, splitEnd)</code>, there is no need to probe the remote store for the length of the file. When working with remote object stores, this can save tens to hundreds of milliseconds, even if such a probe is done asynchronously.</p>
<p>If both the file length and the split end are set, then the file length MUST be considered &#x201c;more&#x201d; authoritative, that is it really SHOULD be defining the file length. If the split end is set, the caller MAY not read past it.</p>
<p>The <code>CompressedSplitLineReader</code> can read past the end of a split if it is partway through processing a compressed record. That is: it assumes an incomplete record read means that the file length is greater than the split length, and that it MUST read the entirety of the partially read record. Other readers may behave similarly.</p>
<p>Therefore</p>
<ol style="list-style-type: decimal">
<li>File length as supplied in a <code>FileStatus</code> or in <code>fs.option.openfile.length</code> SHALL set the strict upper limit on the length of a file</li>
<li>The split end as set in <code>fs.option.openfile.split.end</code> MUST be viewed as a hint, rather than the strict end of the file.</li>
</ol></section></section><section>
<h3><a name="Opening_a_file_with_both_standard_and_non-standard_options"></a>Opening a file with both standard and non-standard options</h3>
<p>Standard and non-standard options MAY be combined in the same <code>openFile()</code> operation.</p>
<div class="source">
<div class="source">
<pre>Future&lt;FSDataInputStream&gt; f = openFile(path)
.must(&quot;fs.option.openfile.read.policy&quot;, &quot;random, adaptive&quot;)
.opt(&quot;fs.s3a.readahead.range&quot;, 1024 * 1024)
.build();
FSDataInputStream is = f.get();
</pre></div></div>
<p>The option set in <code>must()</code> MUST be understood, or at least recognized and ignored by all filesystems. In this example, the S3A-specific option MAY be ignored by all other filesystem clients.</p></section><section>
<h3><a name="Opening_a_file_with_older_releases"></a>Opening a file with older releases</h3>
<p>Not all hadoop releases recognize the <code>fs.option.openfile.read.policy</code> option.</p>
<p>The option can be safely used in application code if it is added via the <code>opt()</code> builder argument, as it will be treated as an unknown optional key which can then be discarded.</p>
<div class="source">
<div class="source">
<pre>Future&lt;FSDataInputStream&gt; f = openFile(path)
.opt(&quot;fs.option.openfile.read.policy&quot;, &quot;vector, random, adaptive&quot;)
.build();
FSDataInputStream is = f.get();
</pre></div></div>
<p><i>Note 1</i> if the option name is set by a reference to a constant in <code>org.apache.hadoop.fs.Options.OpenFileOptions</code>, then the program will not link against versions of Hadoop without the specific option. Therefore for resilient linking against older releases -use a copy of the value.</p>
<p><i>Note 2</i> as option validation is performed in the FileSystem connector, a third-party connector designed to work with multiple hadoop versions MAY NOT support the option.</p></section><section>
<h3><a name="Passing_options_in_to_MapReduce"></a>Passing options in to MapReduce</h3>
<p>Hadoop MapReduce will automatically read MR Job Options with the prefixes <code>mapreduce.job.input.file.option.</code> and <code>mapreduce.job.input.file.must.</code>, and apply these values as <code>.opt()</code> and <code>.must()</code> respectively, after removing the mapreduce-specific prefixes.</p>
<p>This makes passing options in to MR jobs straightforward. For example, to declare that a job should read its data using random IO:</p>
<div class="source">
<div class="source">
<pre>JobConf jobConf = (JobConf) job.getConfiguration();
jobConf.set(
&quot;mapreduce.job.input.file.option.fs.option.openfile.read.policy&quot;,
&quot;random&quot;);
</pre></div></div>
</section><section>
<h3><a name="MapReduce_input_format_propagating_options"></a>MapReduce input format propagating options</h3>
<p>An example of a record reader passing in options to the file it opens.</p>
<div class="source">
<div class="source">
<pre> public void initialize(InputSplit genericSplit,
TaskAttemptContext context) throws IOException {
FileSplit split = (FileSplit)genericSplit;
Configuration job = context.getConfiguration();
start = split.getStart();
end = start + split.getLength();
Path file = split.getPath();
// open the file and seek to the start of the split
FutureDataInputStreamBuilder builder =
file.getFileSystem(job).openFile(file);
// the start and end of the split may be used to build
// an input strategy.
builder.optLong(&quot;fs.option.openfile.split.start&quot;, start);
builder.optLong(&quot;fs.option.openfile.split.end&quot;, end);
FutureIO.propagateOptions(builder, job,
&quot;mapreduce.job.input.file.option&quot;,
&quot;mapreduce.job.input.file.must&quot;);
fileIn = FutureIO.awaitFuture(builder.build());
fileIn.seek(start);
/* Rest of the operation on the opened stream */
}
</pre></div></div>
</section><section>
<h3><a name="FileContext.openFile"></a><code>FileContext.openFile</code></h3>
<p>From <code>org.apache.hadoop.fs.AvroFSInput</code>; a file is opened with sequential input. Because the file length has already been probed for, the length is passed down</p>
<div class="source">
<div class="source">
<pre> public AvroFSInput(FileContext fc, Path p) throws IOException {
FileStatus status = fc.getFileStatus(p);
this.len = status.getLen();
this.stream = awaitFuture(fc.openFile(p)
.opt(&quot;fs.option.openfile.read.policy&quot;,
&quot;sequential&quot;)
.opt(&quot;fs.option.openfile.length&quot;,
Long.toString(status.getLen()))
.build());
fc.open(p);
}
</pre></div></div>
<p>In this example, the length is passed down as a string (via <code>Long.toString()</code>) rather than directly as a long. This is to ensure that the input format will link against versions of Hadoop which do not have the <code>opt(String, long)</code> and <code>must(String, long)</code> builder parameters. Similarly, the values are passed as optional, so that if unrecognized the application will still succeed.</p></section><section>
<h3><a name="Example:_reading_a_whole_file"></a>Example: reading a whole file</h3>
<p>This is from <code>org.apache.hadoop.util.JsonSerialization</code>.</p>
<p>Its <code>load(FileSystem, Path, FileStatus)</code> method:</p>
<ul>
<li>declares the whole file is to be read end to end.</li>
<li>passes down the file status.</li>
</ul>
<div class="source">
<div class="source">
<pre>public T load(FileSystem fs,
Path path,
FileStatus status)
throws IOException {
try (FSDataInputStream dataInputStream =
awaitFuture(fs.openFile(path)
.opt(&quot;fs.option.openfile.read.policy&quot;, &quot;whole-file&quot;)
.withFileStatus(status)
.build())) {
return fromJsonStream(dataInputStream);
} catch (JsonProcessingException e) {
throw new PathIOException(path.toString(),
&quot;Failed to read JSON file &quot; + e, e);
}
}
</pre></div></div></section></section>
</div>
</div>
<div class="clear">
<hr/>
</div>
<div id="footer">
<div class="xright">
&#169; 2008-2024
Apache Software Foundation
- <a href="http://maven.apache.org/privacy-policy.html">Privacy Policy</a>.
Apache Maven, Maven, Apache, the Apache feather logo, and the Apache Maven project logos are trademarks of The Apache Software Foundation.
</div>
<div class="clear">
<hr/>
</div>
</div>
</body>
</html>