<!-- blob: f6298dc5cb1e88baa02e5e72867f0e54419d45d3 [file] [log] [blame] -->
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<!-- NewPage -->
<html lang="en">
<head>
<!-- Generated by javadoc -->
<title>ContextualTextIO (Apache Beam 2.38.0-SNAPSHOT)</title>
<link rel="stylesheet" type="text/css" href="../../../../../../stylesheet.css" title="Style">
<script type="text/javascript" src="../../../../../../script.js"></script>
</head>
<body>
<script type="text/javascript"><!--
// Copy this page's title onto the parent window's document, unless the URL
// contains 'is-external=true'.
try {
    if (location.href.indexOf('is-external=true') < 0) {
        parent.document.title="ContextualTextIO (Apache Beam 2.38.0-SNAPSHOT)";
    }
} catch (ignored) {
    // Intentionally empty: any failure while updating the parent title is
    // swallowed (presumably the parent document may be inaccessible; confirm).
}
//-->
// Global data for the "Method Summary" table on this page. These names are not
// read anywhere else in this file; presumably script.js consumes them (TODO confirm).
//   methods: keyed by the table row ids ("i0", "i1").
//   tabs:    maps a numeric key to [tab element id, tab caption]; keys 1 and 8
//            match the show(1) / show(8) links in the table caption.
var methods = {
        "i0": 9,
        "i1": 9
    },
    tabs = {
        65535: ["t0", "All Methods"],
        1: ["t1", "Static Methods"],
        8: ["t4", "Concrete Methods"]
    },
    // CSS class names used for table rows and tabs.
    altColor = "altColor",
    rowColor = "rowColor",
    tableTab = "tableTab",
    activeTableTab = "activeTableTab";
</script>
<noscript>
<div>JavaScript is disabled on your browser.</div>
</noscript>
<!-- ========= START OF TOP NAVBAR ======= -->
<div class="topNav"><a name="navbar.top">
<!-- -->
</a>
<div class="skipNav"><a href="#skip.navbar.top" title="Skip navigation links">Skip navigation links</a></div>
<a name="navbar.top.firstrow">
<!-- -->
</a>
<ul class="navList" title="Navigation">
<li><a href="../../../../../../overview-summary.html">Overview</a></li>
<li><a href="package-summary.html">Package</a></li>
<li class="navBarCell1Rev">Class</li>
<li><a href="package-tree.html">Tree</a></li>
<li><a href="../../../../../../deprecated-list.html">Deprecated</a></li>
<li><a href="../../../../../../index-all.html">Index</a></li>
<li><a href="../../../../../../help-doc.html">Help</a></li>
</ul>
</div>
<div class="subNav">
<ul class="navList">
<li>Prev&nbsp;Class</li>
<li><a href="../../../../../../org/apache/beam/sdk/io/contextualtextio/ContextualTextIO.Read.html" title="class in org.apache.beam.sdk.io.contextualtextio"><span class="typeNameLink">Next&nbsp;Class</span></a></li>
</ul>
<ul class="navList">
<li><a href="../../../../../../index.html?org/apache/beam/sdk/io/contextualtextio/ContextualTextIO.html" target="_top">Frames</a></li>
<li><a href="ContextualTextIO.html" target="_top">No&nbsp;Frames</a></li>
</ul>
<ul class="navList" id="allclasses_navbar_top">
<li><a href="../../../../../../allclasses-noframe.html">All&nbsp;Classes</a></li>
</ul>
<div>
<script type="text/javascript"><!--
// Show the top navbar's "All Classes" list only when this page is the
// top-level window; hide it when the page is displayed inside a frame.
allClassesLink = document.getElementById("allclasses_navbar_top");
allClassesLink.style.display = (window==top) ? "block" : "none";
//-->
</script>
</div>
<div>
<ul class="subNavList">
<li>Summary:&nbsp;</li>
<li><a href="#nested.class.summary">Nested</a>&nbsp;|&nbsp;</li>
<li>Field&nbsp;|&nbsp;</li>
<li>Constr&nbsp;|&nbsp;</li>
<li><a href="#method.summary">Method</a></li>
</ul>
<ul class="subNavList">
<li>Detail:&nbsp;</li>
<li>Field&nbsp;|&nbsp;</li>
<li>Constr&nbsp;|&nbsp;</li>
<li><a href="#method.detail">Method</a></li>
</ul>
</div>
<a name="skip.navbar.top">
<!-- -->
</a></div>
<!-- ========= END OF TOP NAVBAR ========= -->
<!-- ======== START OF CLASS DATA ======== -->
<div class="header">
<div class="subTitle">org.apache.beam.sdk.io.contextualtextio</div>
<h2 title="Class ContextualTextIO" class="title">Class ContextualTextIO</h2>
</div>
<div class="contentContainer">
<ul class="inheritance">
<li>java.lang.Object</li>
<li>
<ul class="inheritance">
<li>org.apache.beam.sdk.io.contextualtextio.ContextualTextIO</li>
</ul>
</li>
</ul>
<div class="description">
<ul class="blockList">
<li class="blockList">
<hr>
<br>
<pre>public class <span class="typeNameLabel">ContextualTextIO</span>
extends java.lang.Object</pre>
<div class="block"><a href="../../../../../../org/apache/beam/sdk/transforms/PTransform.html" title="class in org.apache.beam.sdk.transforms"><code>PTransform</code></a>s that read text files and collect contextual information of the elements in
the input.
<p>Prefer <a href="../../../../../../org/apache/beam/sdk/io/TextIO.html" title="class in org.apache.beam.sdk.io"><code>TextIO</code></a> when not reading files with multi-line records or when additional record
metadata is not required.
<h2>Reading from text files</h2>
<p>To read a <a href="../../../../../../org/apache/beam/sdk/values/PCollection.html" title="class in org.apache.beam.sdk.values"><code>PCollection</code></a> from one or more text files, use <code>ContextualTextIO.read()</code>. To instantiate a transform use <a href="../../../../../../org/apache/beam/sdk/io/contextualtextio/ContextualTextIO.Read.html#from-java.lang.String-"><code>ContextualTextIO.Read.from(String)</code></a> and specify the path of the file(s) to be read.
Alternatively, if the filenames to be read are themselves in a <a href="../../../../../../org/apache/beam/sdk/values/PCollection.html" title="class in org.apache.beam.sdk.values"><code>PCollection</code></a> you can use
<a href="../../../../../../org/apache/beam/sdk/io/FileIO.html" title="class in org.apache.beam.sdk.io"><code>FileIO</code></a> to match them and <a href="../../../../../../org/apache/beam/sdk/io/contextualtextio/ContextualTextIO.html#readFiles--"><code>readFiles()</code></a> to read them.
<p><a href="../../../../../../org/apache/beam/sdk/io/contextualtextio/ContextualTextIO.html#read--"><code>read()</code></a> returns a <a href="../../../../../../org/apache/beam/sdk/values/PCollection.html" title="class in org.apache.beam.sdk.values"><code>PCollection</code></a> of <a href="../../../../../../org/apache/beam/sdk/values/Row.html" title="class in org.apache.beam.sdk.values"><code>Row</code></a>s with schema <a href="../../../../../../org/apache/beam/sdk/io/contextualtextio/RecordWithMetadata.html#getSchema--"><code>RecordWithMetadata.getSchema()</code></a>, each corresponding to one line of an input UTF-8 text file
(split into lines delimited by '\n', '\r', '\r\n', or specified delimiter via <a href="../../../../../../org/apache/beam/sdk/io/contextualtextio/ContextualTextIO.Read.html#withDelimiter-byte:A-"><code>ContextualTextIO.Read.withDelimiter(byte[])</code></a>).
<h3>Filepattern expansion and watching</h3>
<p>By default, the filepatterns are expanded only once. The combination of <code>FileIO.Match#continuously(Duration, TerminationCondition)</code> and <a href="../../../../../../org/apache/beam/sdk/io/contextualtextio/ContextualTextIO.html#readFiles--"><code>readFiles()</code></a> allows
streaming of new files matching the filepattern(s).
<p>By default, <a href="../../../../../../org/apache/beam/sdk/io/contextualtextio/ContextualTextIO.html#read--"><code>read()</code></a> prohibits filepatterns that match no files, and <a href="../../../../../../org/apache/beam/sdk/io/contextualtextio/ContextualTextIO.html#readFiles--"><code>readFiles()</code></a>
allows them in case the filepattern contains a glob wildcard character. Use <a href="../../../../../../org/apache/beam/sdk/io/contextualtextio/ContextualTextIO.Read.html#withEmptyMatchTreatment-org.apache.beam.sdk.io.fs.EmptyMatchTreatment-"><code>ContextualTextIO.Read.withEmptyMatchTreatment(org.apache.beam.sdk.io.fs.EmptyMatchTreatment)</code></a> or <code>FileIO.Match#withEmptyMatchTreatment(EmptyMatchTreatment)</code> plus <a href="../../../../../../org/apache/beam/sdk/io/contextualtextio/ContextualTextIO.html#readFiles--"><code>readFiles()</code></a> to configure
this behavior.
<p>Example 1: reading a file or filepattern.
<pre><code>
Pipeline p = ...;
// A simple Read of a file:
PCollection&lt;Row&gt; records = p.apply(ContextualTextIO.read().from("/local/path/to/file.txt"));
</code></pre>
<p>Example 2: reading a PCollection of filenames.
<pre><code>
Pipeline p = ...;
// E.g. the filenames might be computed from other data in the pipeline, or
// read from a data source.
PCollection&lt;String&gt; filenames = ...;
// Read all files in the collection.
PCollection&lt;Row&gt; records =
filenames
.apply(FileIO.matchAll())
.apply(FileIO.readMatches())
.apply(ContextualTextIO.readFiles());
</code></pre>
<p>Example 3: streaming new files matching a filepattern.
<pre><code>
Pipeline p = ...;
PCollection&lt;Row&gt; records = p.apply(ContextualTextIO.read()
.from("/local/path/to/files/*")
.watchForNewFiles(
// Check for new files every minute
Duration.standardMinutes(1),
// Stop watching the filepattern if no new files appear within an hour
afterTimeSinceNewOutput(Duration.standardHours(1))));
</code></pre>
<p>Example 4: reading a file or file pattern of RFC4180-compliant CSV files with fields that may
contain line breaks.
<p>Example of such a file could be:
<p>"aaa","b CRLF bb","ccc" CRLF zzz,yyy,xxx
<pre><code>
Pipeline p = ...;
PCollection&lt;Row&gt; records = p.apply(ContextualTextIO.read()
.from("/local/path/to/files/*.csv")
.withHasMultilineCSVRecords(true));
</code></pre>
<p>Example 5: reading while watching for new files.
<pre><code>
Pipeline p = ...;
PCollection&lt;Row&gt; records = p.apply(FileIO.match()
.filepattern("filepattern")
.continuously(
Duration.millis(100),
Watch.Growth.afterTimeSinceNewOutput(Duration.standardSeconds(3))))
.apply(FileIO.readMatches())
.apply(ContextualTextIO.readFiles());
</code></pre>
<p>Example 6: reading with recordNum metadata.
<pre><code>
Pipeline p = ...;
PCollection&lt;Row&gt; records = p.apply(ContextualTextIO.read()
.from("/local/path/to/files/*.csv")
.withRecordNumMetadata());
</code></pre>
<p>NOTE: When using <a href="../../../../../../org/apache/beam/sdk/io/contextualtextio/ContextualTextIO.Read.html#withHasMultilineCSVRecords-java.lang.Boolean-"><code>ContextualTextIO.Read.withHasMultilineCSVRecords(Boolean)</code></a>, a single
reader will be used to process the file, rather than multiple readers which can read from
different offsets. For a large file this can result in lower performance.
<p>NOTE: Use <a href="../../../../../../org/apache/beam/sdk/io/contextualtextio/ContextualTextIO.Read.html#withRecordNumMetadata--"><code>ContextualTextIO.Read.withRecordNumMetadata()</code></a> when recordNum metadata is required. Computing
absolute record positions currently introduces a grouping step, which increases the resources
used by the pipeline. By default withRecordNumMetadata is set to false; in this case record
objects will not contain absolute record positions within the entire file, but will still contain
relative positions in respective offsets.
<h3>Reading a very large number of files</h3>
<p>If it is known that the filepattern will match a very large number of files (e.g. tens of
thousands or more), use <a href="../../../../../../org/apache/beam/sdk/io/contextualtextio/ContextualTextIO.Read.html#withHintMatchesManyFiles--"><code>ContextualTextIO.Read.withHintMatchesManyFiles()</code></a> for better
performance and scalability. Note that it may decrease performance if the filepattern matches
only a small number of files.</div>
</li>
</ul>
</div>
<div class="summary">
<ul class="blockList">
<li class="blockList">
<!-- ======== NESTED CLASS SUMMARY ======== -->
<ul class="blockList">
<li class="blockList"><a name="nested.class.summary">
<!-- -->
</a>
<h3>Nested Class Summary</h3>
<table class="memberSummary" border="0" cellpadding="3" cellspacing="0" summary="Nested Class Summary table, listing nested classes, and an explanation">
<caption><span>Nested Classes</span><span class="tabEnd">&nbsp;</span></caption>
<tr>
<th class="colFirst" scope="col">Modifier and Type</th>
<th class="colLast" scope="col">Class and Description</th>
</tr>
<tr class="altColor">
<td class="colFirst"><code>static class&nbsp;</code></td>
<td class="colLast"><code><span class="memberNameLink"><a href="../../../../../../org/apache/beam/sdk/io/contextualtextio/ContextualTextIO.Read.html" title="class in org.apache.beam.sdk.io.contextualtextio">ContextualTextIO.Read</a></span></code>
<div class="block">Implementation of <a href="../../../../../../org/apache/beam/sdk/io/contextualtextio/ContextualTextIO.html#read--"><code>read()</code></a>.</div>
</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><code>static class&nbsp;</code></td>
<td class="colLast"><code><span class="memberNameLink"><a href="../../../../../../org/apache/beam/sdk/io/contextualtextio/ContextualTextIO.ReadFiles.html" title="class in org.apache.beam.sdk.io.contextualtextio">ContextualTextIO.ReadFiles</a></span></code>
<div class="block">Implementation of <a href="../../../../../../org/apache/beam/sdk/io/contextualtextio/ContextualTextIO.html#readFiles--"><code>readFiles()</code></a>.</div>
</td>
</tr>
</table>
</li>
</ul>
<!-- ========== METHOD SUMMARY =========== -->
<ul class="blockList">
<li class="blockList"><a name="method.summary">
<!-- -->
</a>
<h3>Method Summary</h3>
<table class="memberSummary" border="0" cellpadding="3" cellspacing="0" summary="Method Summary table, listing methods, and an explanation">
<caption><span id="t0" class="activeTableTab"><span>All Methods</span><span class="tabEnd">&nbsp;</span></span><span id="t1" class="tableTab"><span><a href="javascript:show(1);">Static Methods</a></span><span class="tabEnd">&nbsp;</span></span><span id="t4" class="tableTab"><span><a href="javascript:show(8);">Concrete Methods</a></span><span class="tabEnd">&nbsp;</span></span></caption>
<tr>
<th class="colFirst" scope="col">Modifier and Type</th>
<th class="colLast" scope="col">Method and Description</th>
</tr>
<tr id="i0" class="altColor">
<td class="colFirst"><code>static <a href="../../../../../../org/apache/beam/sdk/io/contextualtextio/ContextualTextIO.Read.html" title="class in org.apache.beam.sdk.io.contextualtextio">ContextualTextIO.Read</a></code></td>
<td class="colLast"><code><span class="memberNameLink"><a href="../../../../../../org/apache/beam/sdk/io/contextualtextio/ContextualTextIO.html#read--">read</a></span>()</code>
<div class="block">A <a href="../../../../../../org/apache/beam/sdk/transforms/PTransform.html" title="class in org.apache.beam.sdk.transforms"><code>PTransform</code></a> that reads from one or more text files and returns a bounded <a href="../../../../../../org/apache/beam/sdk/values/PCollection.html" title="class in org.apache.beam.sdk.values"><code>PCollection</code></a> containing one <a href="../../../../../../org/apache/beam/sdk/values/Row.html" title="class in org.apache.beam.sdk.values"><code>element</code></a> for each line in the input files.</div>
</td>
</tr>
<tr id="i1" class="rowColor">
<td class="colFirst"><code>static <a href="../../../../../../org/apache/beam/sdk/io/contextualtextio/ContextualTextIO.ReadFiles.html" title="class in org.apache.beam.sdk.io.contextualtextio">ContextualTextIO.ReadFiles</a></code></td>
<td class="colLast"><code><span class="memberNameLink"><a href="../../../../../../org/apache/beam/sdk/io/contextualtextio/ContextualTextIO.html#readFiles--">readFiles</a></span>()</code>
<div class="block">Like <a href="../../../../../../org/apache/beam/sdk/io/contextualtextio/ContextualTextIO.html#read--"><code>read()</code></a>, but reads each file in a <a href="../../../../../../org/apache/beam/sdk/values/PCollection.html" title="class in org.apache.beam.sdk.values"><code>PCollection</code></a> of <code>FileIO.ReadableFile</code>, returned by <a href="../../../../../../org/apache/beam/sdk/io/FileIO.html#readMatches--"><code>FileIO.readMatches()</code></a>.</div>
</td>
</tr>
</table>
<ul class="blockList">
<li class="blockList"><a name="methods.inherited.from.class.java.lang.Object">
<!-- -->
</a>
<h3>Methods inherited from class&nbsp;java.lang.Object</h3>
<code>clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait</code></li>
</ul>
</li>
</ul>
</li>
</ul>
</div>
<div class="details">
<ul class="blockList">
<li class="blockList">
<!-- ============ METHOD DETAIL ========== -->
<ul class="blockList">
<li class="blockList"><a name="method.detail">
<!-- -->
</a>
<h3>Method Detail</h3>
<a name="read--">
<!-- -->
</a>
<ul class="blockList">
<li class="blockList">
<h4>read</h4>
<pre>public static&nbsp;<a href="../../../../../../org/apache/beam/sdk/io/contextualtextio/ContextualTextIO.Read.html" title="class in org.apache.beam.sdk.io.contextualtextio">ContextualTextIO.Read</a>&nbsp;read()</pre>
<div class="block">A <a href="../../../../../../org/apache/beam/sdk/transforms/PTransform.html" title="class in org.apache.beam.sdk.transforms"><code>PTransform</code></a> that reads from one or more text files and returns a bounded <a href="../../../../../../org/apache/beam/sdk/values/PCollection.html" title="class in org.apache.beam.sdk.values"><code>PCollection</code></a> containing one <a href="../../../../../../org/apache/beam/sdk/values/Row.html" title="class in org.apache.beam.sdk.values"><code>element</code></a> for each line in the input files.</div>
</li>
</ul>
<a name="readFiles--">
<!-- -->
</a>
<ul class="blockListLast">
<li class="blockList">
<h4>readFiles</h4>
<pre>public static&nbsp;<a href="../../../../../../org/apache/beam/sdk/io/contextualtextio/ContextualTextIO.ReadFiles.html" title="class in org.apache.beam.sdk.io.contextualtextio">ContextualTextIO.ReadFiles</a>&nbsp;readFiles()</pre>
<div class="block">Like <a href="../../../../../../org/apache/beam/sdk/io/contextualtextio/ContextualTextIO.html#read--"><code>read()</code></a>, but reads each file in a <a href="../../../../../../org/apache/beam/sdk/values/PCollection.html" title="class in org.apache.beam.sdk.values"><code>PCollection</code></a> of <code>FileIO.ReadableFile</code>, returned by <a href="../../../../../../org/apache/beam/sdk/io/FileIO.html#readMatches--"><code>FileIO.readMatches()</code></a>.</div>
</li>
</ul>
</li>
</ul>
</li>
</ul>
</div>
</div>
<!-- ========= END OF CLASS DATA ========= -->
<!-- ======= START OF BOTTOM NAVBAR ====== -->
<div class="bottomNav"><a name="navbar.bottom">
<!-- -->
</a>
<div class="skipNav"><a href="#skip.navbar.bottom" title="Skip navigation links">Skip navigation links</a></div>
<a name="navbar.bottom.firstrow">
<!-- -->
</a>
<ul class="navList" title="Navigation">
<li><a href="../../../../../../overview-summary.html">Overview</a></li>
<li><a href="package-summary.html">Package</a></li>
<li class="navBarCell1Rev">Class</li>
<li><a href="package-tree.html">Tree</a></li>
<li><a href="../../../../../../deprecated-list.html">Deprecated</a></li>
<li><a href="../../../../../../index-all.html">Index</a></li>
<li><a href="../../../../../../help-doc.html">Help</a></li>
</ul>
</div>
<div class="subNav">
<ul class="navList">
<li>Prev&nbsp;Class</li>
<li><a href="../../../../../../org/apache/beam/sdk/io/contextualtextio/ContextualTextIO.Read.html" title="class in org.apache.beam.sdk.io.contextualtextio"><span class="typeNameLink">Next&nbsp;Class</span></a></li>
</ul>
<ul class="navList">
<li><a href="../../../../../../index.html?org/apache/beam/sdk/io/contextualtextio/ContextualTextIO.html" target="_top">Frames</a></li>
<li><a href="ContextualTextIO.html" target="_top">No&nbsp;Frames</a></li>
</ul>
<ul class="navList" id="allclasses_navbar_bottom">
<li><a href="../../../../../../allclasses-noframe.html">All&nbsp;Classes</a></li>
</ul>
<div>
<script type="text/javascript"><!--
// Show the bottom navbar's "All Classes" list only when this page is the
// top-level window; hide it when the page is displayed inside a frame.
allClassesLink = document.getElementById("allclasses_navbar_bottom");
allClassesLink.style.display = (window==top) ? "block" : "none";
//-->
</script>
</div>
<div>
<ul class="subNavList">
<li>Summary:&nbsp;</li>
<li><a href="#nested.class.summary">Nested</a>&nbsp;|&nbsp;</li>
<li>Field&nbsp;|&nbsp;</li>
<li>Constr&nbsp;|&nbsp;</li>
<li><a href="#method.summary">Method</a></li>
</ul>
<ul class="subNavList">
<li>Detail:&nbsp;</li>
<li>Field&nbsp;|&nbsp;</li>
<li>Constr&nbsp;|&nbsp;</li>
<li><a href="#method.detail">Method</a></li>
</ul>
</div>
<a name="skip.navbar.bottom">
<!-- -->
</a></div>
<!-- ======== END OF BOTTOM NAVBAR ======= -->
</body>
</html>