blob: 6193e6c387a58b5d7a6f6788a2ba5e71e5236a83 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.beam.runners.direct;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.beam.runners.local.StructuralKey;
import org.apache.beam.sdk.coders.ByteArrayCoder;
import org.apache.beam.sdk.io.Read.Unbounded;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.cache.CacheBuilder;
import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.cache.CacheLoader;
import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.cache.LoadingCache;
import org.joda.time.Duration;
/**
 * Decides, during the evaluation of an {@link Unbounded} {@link PTransform}, whether a record has
 * already been seen and is therefore a duplicate.
 */
interface UnboundedReadDeduplicator {
  /**
   * Reports whether the record identified by {@code recordId} should be emitted.
   *
   * @param recordId the ID of the record under consideration
   * @return {@code true} if the record should be output, {@code false} if it is a duplicate and
   *     should not be output
   */
  boolean shouldOutput(byte[] recordId);

  /**
   * An {@link UnboundedReadDeduplicator} that accepts every record. For use with sources that do
   * not require deduplication.
   */
  class NeverDeduplicator implements UnboundedReadDeduplicator {
    /** Create a new {@link NeverDeduplicator}. */
    public static UnboundedReadDeduplicator create() {
      return new NeverDeduplicator();
    }

    /** Always returns {@code true}; the {@code recordId} is not inspected. */
    @Override
    public boolean shouldOutput(byte[] recordId) {
      return true;
    }
  }

  /**
   * An {@link UnboundedReadDeduplicator} that remembers record IDs in a cache and returns true only
   * for the first lookup of an ID while its entry is cached.
   *
   * <p>An entry expires 10 minutes after it was last accessed, and the cache is capped at 100,000
   * entries. An ID whose entry is no longer cached is treated as not yet seen.
   */
  class CachedIdDeduplicator implements UnboundedReadDeduplicator {
    private static final ByteArrayCoder RECORD_ID_CODER = ByteArrayCoder.of();

    /** How long an ID is retained after it was last looked up. */
    private static final Duration MAX_RETENTION_SINCE_ACCESS = Duration.standardMinutes(10L);

    /** Upper bound on the number of IDs kept in the cache. */
    private static final long MAX_CACHED_IDS = 100_000L;

    /** Maps each cached record ID to a flag that is true until the ID is first looked up. */
    private final LoadingCache<StructuralKey<byte[]>, AtomicBoolean> ids;

    /** Create a new {@link CachedIdDeduplicator}. */
    public static UnboundedReadDeduplicator create() {
      return new CachedIdDeduplicator();
    }

    private CachedIdDeduplicator() {
      this.ids =
          CacheBuilder.newBuilder()
              .expireAfterAccess(MAX_RETENTION_SINCE_ACCESS.getMillis(), TimeUnit.MILLISECONDS)
              .maximumSize(MAX_CACHED_IDS)
              .build(new UnseenFlagLoader());
    }

    /**
     * Looks up (loading if absent) the flag for {@code recordId} and atomically clears it.
     *
     * @return the flag's value before it was cleared: {@code true} for the first lookup of a
     *     cached ID, {@code false} for later lookups of the same cached entry
     */
    @Override
    public boolean shouldOutput(byte[] recordId) {
      StructuralKey<byte[]> key = StructuralKey.of(recordId, RECORD_ID_CODER);
      AtomicBoolean notYetOutput = ids.getUnchecked(key);
      // getAndSet hands back the previous value, so only the first caller for this entry sees true.
      return notYetOutput.getAndSet(false);
    }

    /** Loads a fresh flag, initially {@code true}, for an ID that has no cached entry. */
    private static class UnseenFlagLoader
        extends CacheLoader<StructuralKey<byte[]>, AtomicBoolean> {
      @Override
      public AtomicBoolean load(StructuralKey<byte[]> key) {
        return new AtomicBoolean(true);
      }
    }
  }
}