<!-- blob: 8789a0a9a7a4c54e6b02775528d05281790630f2 [file] [log] [blame] -->
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<!-- NewPage -->
<html lang="en">
<head>
<!-- Generated by javadoc -->
<title>Deduplicate (Apache Beam 2.38.0-SNAPSHOT)</title>
<link rel="stylesheet" type="text/css" href="../../../../../stylesheet.css" title="Style">
<script type="text/javascript" src="../../../../../script.js"></script>
</head>
<body>
<script type="text/javascript"><!--
// Best-effort: copy this page's title onto the parent document, unless the
// URL marks the page as externally linked (contains 'is-external=true').
try {
    if (location.href.indexOf('is-external=true') < 0) {
        parent.document.title = "Deduplicate (Apache Beam 2.38.0-SNAPSHOT)";
    }
} catch (err) {
    // Intentionally ignored: a failure here only means the parent title stays unchanged.
}
//-->
// Per-row data for the Method Summary table: keys are the ids of the table
// rows ("i0".."i2"). Every value is 9 here, which equals 1 + 8, i.e. the keys of
// the "Static Methods" and "Concrete Methods" entries in tabs below.
// NOTE(review): presumably a bit mask read by show() in script.js; confirm there.
var methods = {"i0":9,"i1":9,"i2":9};
// Tab definitions keyed by number; each entry is [caption span id, visible label].
// The ids t0/t1/t4 match the span ids in the Method Summary table caption.
var tabs = {65535:["t0","All Methods"],1:["t1","Static Methods"],8:["t4","Concrete Methods"]};
// CSS class names: altColor/rowColor are the two classes alternated on the
// summary table rows of this page.
var altColor = "altColor";
var rowColor = "rowColor";
// CSS class names carried by the inactive and the active caption tab spans.
var tableTab = "tableTab";
var activeTableTab = "activeTableTab";
</script>
<noscript>
<div>JavaScript is disabled on your browser.</div>
</noscript>
<!-- ========= START OF TOP NAVBAR ======= -->
<div class="topNav"><a name="navbar.top">
<!-- -->
</a>
<div class="skipNav"><a href="#skip.navbar.top" title="Skip navigation links">Skip navigation links</a></div>
<a name="navbar.top.firstrow">
<!-- -->
</a>
<ul class="navList" title="Navigation">
<li><a href="../../../../../overview-summary.html">Overview</a></li>
<li><a href="package-summary.html">Package</a></li>
<li class="navBarCell1Rev">Class</li>
<li><a href="package-tree.html">Tree</a></li>
<li><a href="../../../../../deprecated-list.html">Deprecated</a></li>
<li><a href="../../../../../index-all.html">Index</a></li>
<li><a href="../../../../../help-doc.html">Help</a></li>
</ul>
</div>
<div class="subNav">
<ul class="navList">
<li><a href="../../../../../org/apache/beam/sdk/transforms/Create.Values.html" title="class in org.apache.beam.sdk.transforms"><span class="typeNameLink">Prev&nbsp;Class</span></a></li>
<li><a href="../../../../../org/apache/beam/sdk/transforms/Deduplicate.KeyedValues.html" title="class in org.apache.beam.sdk.transforms"><span class="typeNameLink">Next&nbsp;Class</span></a></li>
</ul>
<ul class="navList">
<li><a href="../../../../../index.html?org/apache/beam/sdk/transforms/Deduplicate.html" target="_top">Frames</a></li>
<li><a href="Deduplicate.html" target="_top">No&nbsp;Frames</a></li>
</ul>
<ul class="navList" id="allclasses_navbar_top">
<li><a href="../../../../../allclasses-noframe.html">All&nbsp;Classes</a></li>
</ul>
<div>
<script type="text/javascript"><!--
// Show the top "All Classes" list only when this page is the topmost window;
// hide it when the page is displayed inside another window (e.g. a frame).
allClassesLink = document.getElementById("allclasses_navbar_top");
allClassesLink.style.display = (window==top) ? "block" : "none";
//-->
</script>
</div>
<div>
<ul class="subNavList">
<li>Summary:&nbsp;</li>
<li><a href="#nested.class.summary">Nested</a>&nbsp;|&nbsp;</li>
<li><a href="#field.summary">Field</a>&nbsp;|&nbsp;</li>
<li>Constr&nbsp;|&nbsp;</li>
<li><a href="#method.summary">Method</a></li>
</ul>
<ul class="subNavList">
<li>Detail:&nbsp;</li>
<li><a href="#field.detail">Field</a>&nbsp;|&nbsp;</li>
<li>Constr&nbsp;|&nbsp;</li>
<li><a href="#method.detail">Method</a></li>
</ul>
</div>
<a name="skip.navbar.top">
<!-- -->
</a></div>
<!-- ========= END OF TOP NAVBAR ========= -->
<!-- ======== START OF CLASS DATA ======== -->
<div class="header">
<div class="subTitle">org.apache.beam.sdk.transforms</div>
<h2 title="Class Deduplicate" class="title">Class Deduplicate</h2>
</div>
<div class="contentContainer">
<ul class="inheritance">
<li>java.lang.Object</li>
<li>
<ul class="inheritance">
<li>org.apache.beam.sdk.transforms.Deduplicate</li>
</ul>
</li>
</ul>
<div class="description">
<ul class="blockList">
<li class="blockList">
<hr>
<br>
<pre>public final class <span class="typeNameLabel">Deduplicate</span>
extends java.lang.Object</pre>
<div class="block">A set of <a href="../../../../../org/apache/beam/sdk/transforms/PTransform.html" title="class in org.apache.beam.sdk.transforms"><code>PTransform</code></a>s which deduplicate input records over a time domain and threshold.
Values in different windows will not be considered duplicates of each other. Deduplication is
best effort.
<p>Two values of type <code>T</code> are compared for equality <b>not</b> by regular Java <code>Object.equals(java.lang.Object)</code>, but instead by first encoding each of the elements using the <code>PCollection</code>'s <code>Coder</code>, and then comparing the encoded bytes. This admits efficient
parallel evaluation.
<p>These PTransforms are different from <a href="../../../../../org/apache/beam/sdk/transforms/Distinct.html" title="class in org.apache.beam.sdk.transforms"><code>Distinct</code></a> since <a href="../../../../../org/apache/beam/sdk/transforms/Distinct.html" title="class in org.apache.beam.sdk.transforms"><code>Distinct</code></a> guarantees
uniqueness of values within a <a href="../../../../../org/apache/beam/sdk/values/PCollection.html" title="class in org.apache.beam.sdk.values"><code>PCollection</code></a> but may support a narrower set of <a href="../../../../../org/apache/beam/sdk/values/WindowingStrategy.html" title="class in org.apache.beam.sdk.values"><code>windowing strategies</code></a> or may delay when output is
produced.
<p>The durations specified may impose memory and/or storage requirements within a runner, and care
may be needed to ensure that the deduplication time limit is long enough to remove
duplicates but short enough not to cause performance problems within a runner. Each runner may
provide an optimized implementation of its choice using the deduplication time domain and
threshold specified.
<p>Does not preserve any order the input PCollection might have had.
<p>Example of use:
<pre><code>
PCollection&lt;String&gt; words = ...;
PCollection&lt;String&gt; deduplicatedWords =
words.apply(Deduplicate.&lt;String&gt;values());
</code></pre></div>
</li>
</ul>
</div>
<div class="summary">
<ul class="blockList">
<li class="blockList">
<!-- ======== NESTED CLASS SUMMARY ======== -->
<ul class="blockList">
<li class="blockList"><a name="nested.class.summary">
<!-- -->
</a>
<h3>Nested Class Summary</h3>
<table class="memberSummary" border="0" cellpadding="3" cellspacing="0" summary="Nested Class Summary table, listing nested classes, and an explanation">
<caption><span>Nested Classes</span><span class="tabEnd">&nbsp;</span></caption>
<tr>
<th class="colFirst" scope="col">Modifier and Type</th>
<th class="colLast" scope="col">Class and Description</th>
</tr>
<tr class="altColor">
<td class="colFirst"><code>static class&nbsp;</code></td>
<td class="colLast"><code><span class="memberNameLink"><a href="../../../../../org/apache/beam/sdk/transforms/Deduplicate.KeyedValues.html" title="class in org.apache.beam.sdk.transforms">Deduplicate.KeyedValues</a>&lt;<a href="../../../../../org/apache/beam/sdk/transforms/Deduplicate.KeyedValues.html" title="type parameter in Deduplicate.KeyedValues">K</a>,<a href="../../../../../org/apache/beam/sdk/transforms/Deduplicate.KeyedValues.html" title="type parameter in Deduplicate.KeyedValues">V</a>&gt;</span></code>
<div class="block">Deduplicates keyed values using the key over a specified time domain and threshold.</div>
</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><code>static class&nbsp;</code></td>
<td class="colLast"><code><span class="memberNameLink"><a href="../../../../../org/apache/beam/sdk/transforms/Deduplicate.Values.html" title="class in org.apache.beam.sdk.transforms">Deduplicate.Values</a>&lt;<a href="../../../../../org/apache/beam/sdk/transforms/Deduplicate.Values.html" title="type parameter in Deduplicate.Values">T</a>&gt;</span></code>
<div class="block">Deduplicates values over a specified time domain and threshold.</div>
</td>
</tr>
<tr class="altColor">
<td class="colFirst"><code>static class&nbsp;</code></td>
<td class="colLast"><code><span class="memberNameLink"><a href="../../../../../org/apache/beam/sdk/transforms/Deduplicate.WithRepresentativeValues.html" title="class in org.apache.beam.sdk.transforms">Deduplicate.WithRepresentativeValues</a>&lt;<a href="../../../../../org/apache/beam/sdk/transforms/Deduplicate.WithRepresentativeValues.html" title="type parameter in Deduplicate.WithRepresentativeValues">T</a>,<a href="../../../../../org/apache/beam/sdk/transforms/Deduplicate.WithRepresentativeValues.html" title="type parameter in Deduplicate.WithRepresentativeValues">IdT</a>&gt;</span></code>
<div class="block">A <a href="../../../../../org/apache/beam/sdk/transforms/PTransform.html" title="class in org.apache.beam.sdk.transforms"><code>PTransform</code></a> that uses a <a href="../../../../../org/apache/beam/sdk/transforms/SerializableFunction.html" title="interface in org.apache.beam.sdk.transforms"><code>SerializableFunction</code></a> to obtain a representative value
for each input element used for deduplication.</div>
</td>
</tr>
</table>
</li>
</ul>
<!-- =========== FIELD SUMMARY =========== -->
<ul class="blockList">
<li class="blockList"><a name="field.summary">
<!-- -->
</a>
<h3>Field Summary</h3>
<table class="memberSummary" border="0" cellpadding="3" cellspacing="0" summary="Field Summary table, listing fields, and an explanation">
<caption><span>Fields</span><span class="tabEnd">&nbsp;</span></caption>
<tr>
<th class="colFirst" scope="col">Modifier and Type</th>
<th class="colLast" scope="col">Field and Description</th>
</tr>
<tr class="altColor">
<td class="colFirst"><code>static <a href="https://static.javadoc.io/joda-time/joda-time/2.10.10/org/joda/time/Duration.html?is-external=true" title="class or interface in org.joda.time">Duration</a></code></td>
<td class="colLast"><code><span class="memberNameLink"><a href="../../../../../org/apache/beam/sdk/transforms/Deduplicate.html#DEFAULT_DURATION">DEFAULT_DURATION</a></span></code>
<div class="block">The default duration is 10 mins.</div>
</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><code>static <a href="../../../../../org/apache/beam/sdk/state/TimeDomain.html" title="enum in org.apache.beam.sdk.state">TimeDomain</a></code></td>
<td class="colLast"><code><span class="memberNameLink"><a href="../../../../../org/apache/beam/sdk/transforms/Deduplicate.html#DEFAULT_TIME_DOMAIN">DEFAULT_TIME_DOMAIN</a></span></code>
<div class="block">The default is the <a href="../../../../../org/apache/beam/sdk/state/TimeDomain.html#PROCESSING_TIME"><code>processing time domain</code></a>.</div>
</td>
</tr>
</table>
</li>
</ul>
<!-- ========== METHOD SUMMARY =========== -->
<ul class="blockList">
<li class="blockList"><a name="method.summary">
<!-- -->
</a>
<h3>Method Summary</h3>
<table class="memberSummary" border="0" cellpadding="3" cellspacing="0" summary="Method Summary table, listing methods, and an explanation">
<caption><span id="t0" class="activeTableTab"><span>All Methods</span><span class="tabEnd">&nbsp;</span></span><span id="t1" class="tableTab"><span><a href="javascript:show(1);">Static Methods</a></span><span class="tabEnd">&nbsp;</span></span><span id="t4" class="tableTab"><span><a href="javascript:show(8);">Concrete Methods</a></span><span class="tabEnd">&nbsp;</span></span></caption>
<tr>
<th class="colFirst" scope="col">Modifier and Type</th>
<th class="colLast" scope="col">Method and Description</th>
</tr>
<tr id="i0" class="altColor">
<td class="colFirst"><code>static &lt;K,V&gt;&nbsp;<a href="../../../../../org/apache/beam/sdk/transforms/Deduplicate.KeyedValues.html" title="class in org.apache.beam.sdk.transforms">Deduplicate.KeyedValues</a>&lt;K,V&gt;</code></td>
<td class="colLast"><code><span class="memberNameLink"><a href="../../../../../org/apache/beam/sdk/transforms/Deduplicate.html#keyedValues--">keyedValues</a></span>()</code>
<div class="block">Returns a deduplication transform that deduplicates keyed values using the key for up to 10
mins within the <a href="../../../../../org/apache/beam/sdk/state/TimeDomain.html#PROCESSING_TIME"><code>processing time domain</code></a>.</div>
</td>
</tr>
<tr id="i1" class="rowColor">
<td class="colFirst"><code>static &lt;T&gt;&nbsp;<a href="../../../../../org/apache/beam/sdk/transforms/Deduplicate.Values.html" title="class in org.apache.beam.sdk.transforms">Deduplicate.Values</a>&lt;T&gt;</code></td>
<td class="colLast"><code><span class="memberNameLink"><a href="../../../../../org/apache/beam/sdk/transforms/Deduplicate.html#values--">values</a></span>()</code>
<div class="block">Returns a deduplication transform that deduplicates values for up to 10 mins within the <a href="../../../../../org/apache/beam/sdk/state/TimeDomain.html#PROCESSING_TIME"><code>processing time domain</code></a>.</div>
</td>
</tr>
<tr id="i2" class="altColor">
<td class="colFirst"><code>static &lt;T,IdT&gt;&nbsp;<a href="../../../../../org/apache/beam/sdk/transforms/Deduplicate.WithRepresentativeValues.html" title="class in org.apache.beam.sdk.transforms">Deduplicate.WithRepresentativeValues</a>&lt;T,IdT&gt;</code></td>
<td class="colLast"><code><span class="memberNameLink"><a href="../../../../../org/apache/beam/sdk/transforms/Deduplicate.html#withRepresentativeValueFn-org.apache.beam.sdk.transforms.SerializableFunction-">withRepresentativeValueFn</a></span>(<a href="../../../../../org/apache/beam/sdk/transforms/SerializableFunction.html" title="interface in org.apache.beam.sdk.transforms">SerializableFunction</a>&lt;T,IdT&gt;&nbsp;representativeValueFn)</code>
<div class="block">Returns a deduplication transform that deduplicates values using the supplied representative
value for up to 10 mins within the <a href="../../../../../org/apache/beam/sdk/state/TimeDomain.html#PROCESSING_TIME"><code>processing time domain</code></a>.</div>
</td>
</tr>
</table>
<ul class="blockList">
<li class="blockList"><a name="methods.inherited.from.class.java.lang.Object">
<!-- -->
</a>
<h3>Methods inherited from class&nbsp;java.lang.Object</h3>
<code>clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait</code></li>
</ul>
</li>
</ul>
</li>
</ul>
</div>
<div class="details">
<ul class="blockList">
<li class="blockList">
<!-- ============ FIELD DETAIL =========== -->
<ul class="blockList">
<li class="blockList"><a name="field.detail">
<!-- -->
</a>
<h3>Field Detail</h3>
<a name="DEFAULT_TIME_DOMAIN">
<!-- -->
</a>
<ul class="blockList">
<li class="blockList">
<h4>DEFAULT_TIME_DOMAIN</h4>
<pre>public static final&nbsp;<a href="../../../../../org/apache/beam/sdk/state/TimeDomain.html" title="enum in org.apache.beam.sdk.state">TimeDomain</a> DEFAULT_TIME_DOMAIN</pre>
<div class="block">The default is the <a href="../../../../../org/apache/beam/sdk/state/TimeDomain.html#PROCESSING_TIME"><code>processing time domain</code></a>.</div>
</li>
</ul>
<a name="DEFAULT_DURATION">
<!-- -->
</a>
<ul class="blockListLast">
<li class="blockList">
<h4>DEFAULT_DURATION</h4>
<pre>public static final&nbsp;<a href="https://static.javadoc.io/joda-time/joda-time/2.10.10/org/joda/time/Duration.html?is-external=true" title="class or interface in org.joda.time">Duration</a> DEFAULT_DURATION</pre>
<div class="block">The default duration is 10 mins.</div>
</li>
</ul>
</li>
</ul>
<!-- ============ METHOD DETAIL ========== -->
<ul class="blockList">
<li class="blockList"><a name="method.detail">
<!-- -->
</a>
<h3>Method Detail</h3>
<a name="values--">
<!-- -->
</a>
<ul class="blockList">
<li class="blockList">
<h4>values</h4>
<pre>public static&nbsp;&lt;T&gt;&nbsp;<a href="../../../../../org/apache/beam/sdk/transforms/Deduplicate.Values.html" title="class in org.apache.beam.sdk.transforms">Deduplicate.Values</a>&lt;T&gt;&nbsp;values()</pre>
<div class="block">Returns a deduplication transform that deduplicates values for up to 10 mins within the <a href="../../../../../org/apache/beam/sdk/state/TimeDomain.html#PROCESSING_TIME"><code>processing time domain</code></a>.</div>
</li>
</ul>
<a name="keyedValues--">
<!-- -->
</a>
<ul class="blockList">
<li class="blockList">
<h4>keyedValues</h4>
<pre>public static&nbsp;&lt;K,V&gt;&nbsp;<a href="../../../../../org/apache/beam/sdk/transforms/Deduplicate.KeyedValues.html" title="class in org.apache.beam.sdk.transforms">Deduplicate.KeyedValues</a>&lt;K,V&gt;&nbsp;keyedValues()</pre>
<div class="block">Returns a deduplication transform that deduplicates keyed values using the key for up to 10
mins within the <a href="../../../../../org/apache/beam/sdk/state/TimeDomain.html#PROCESSING_TIME"><code>processing time domain</code></a>.</div>
</li>
</ul>
<a name="withRepresentativeValueFn-org.apache.beam.sdk.transforms.SerializableFunction-">
<!-- -->
</a>
<ul class="blockListLast">
<li class="blockList">
<h4>withRepresentativeValueFn</h4>
<pre>public static&nbsp;&lt;T,IdT&gt;&nbsp;<a href="../../../../../org/apache/beam/sdk/transforms/Deduplicate.WithRepresentativeValues.html" title="class in org.apache.beam.sdk.transforms">Deduplicate.WithRepresentativeValues</a>&lt;T,IdT&gt;&nbsp;withRepresentativeValueFn(<a href="../../../../../org/apache/beam/sdk/transforms/SerializableFunction.html" title="interface in org.apache.beam.sdk.transforms">SerializableFunction</a>&lt;T,IdT&gt;&nbsp;representativeValueFn)</pre>
<div class="block">Returns a deduplication transform that deduplicates values using the supplied representative
value for up to 10 mins within the <a href="../../../../../org/apache/beam/sdk/state/TimeDomain.html#PROCESSING_TIME"><code>processing time domain</code></a>.</div>
</li>
</ul>
</li>
</ul>
</li>
</ul>
</div>
</div>
<!-- ========= END OF CLASS DATA ========= -->
<!-- ======= START OF BOTTOM NAVBAR ====== -->
<div class="bottomNav"><a name="navbar.bottom">
<!-- -->
</a>
<div class="skipNav"><a href="#skip.navbar.bottom" title="Skip navigation links">Skip navigation links</a></div>
<a name="navbar.bottom.firstrow">
<!-- -->
</a>
<ul class="navList" title="Navigation">
<li><a href="../../../../../overview-summary.html">Overview</a></li>
<li><a href="package-summary.html">Package</a></li>
<li class="navBarCell1Rev">Class</li>
<li><a href="package-tree.html">Tree</a></li>
<li><a href="../../../../../deprecated-list.html">Deprecated</a></li>
<li><a href="../../../../../index-all.html">Index</a></li>
<li><a href="../../../../../help-doc.html">Help</a></li>
</ul>
</div>
<div class="subNav">
<ul class="navList">
<li><a href="../../../../../org/apache/beam/sdk/transforms/Create.Values.html" title="class in org.apache.beam.sdk.transforms"><span class="typeNameLink">Prev&nbsp;Class</span></a></li>
<li><a href="../../../../../org/apache/beam/sdk/transforms/Deduplicate.KeyedValues.html" title="class in org.apache.beam.sdk.transforms"><span class="typeNameLink">Next&nbsp;Class</span></a></li>
</ul>
<ul class="navList">
<li><a href="../../../../../index.html?org/apache/beam/sdk/transforms/Deduplicate.html" target="_top">Frames</a></li>
<li><a href="Deduplicate.html" target="_top">No&nbsp;Frames</a></li>
</ul>
<ul class="navList" id="allclasses_navbar_bottom">
<li><a href="../../../../../allclasses-noframe.html">All&nbsp;Classes</a></li>
</ul>
<div>
<script type="text/javascript"><!--
// Show the bottom "All Classes" list only when this page is the topmost window;
// hide it when the page is displayed inside another window (e.g. a frame).
allClassesLink = document.getElementById("allclasses_navbar_bottom");
allClassesLink.style.display = (window==top) ? "block" : "none";
//-->
</script>
</div>
<div>
<ul class="subNavList">
<li>Summary:&nbsp;</li>
<li><a href="#nested.class.summary">Nested</a>&nbsp;|&nbsp;</li>
<li><a href="#field.summary">Field</a>&nbsp;|&nbsp;</li>
<li>Constr&nbsp;|&nbsp;</li>
<li><a href="#method.summary">Method</a></li>
</ul>
<ul class="subNavList">
<li>Detail:&nbsp;</li>
<li><a href="#field.detail">Field</a>&nbsp;|&nbsp;</li>
<li>Constr&nbsp;|&nbsp;</li>
<li><a href="#method.detail">Method</a></li>
</ul>
</div>
<a name="skip.navbar.bottom">
<!-- -->
</a></div>
<!-- ======== END OF BOTTOM NAVBAR ======= -->
</body>
</html>