blob: a1d507ab4c569d525afbc9651d3248c15af8923e [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.crunch;
import java.util.Collection;
import org.apache.crunch.types.PTableType;
import org.apache.crunch.types.PType;
import org.apache.crunch.types.PTypeFamily;
/**
* A representation of an immutable, distributed collection of elements that is
* the fundamental target of computations in Crunch.
*
* @param <S>
* The type of the elements contained in this collection
*/
public interface PCollection<S> {
/**
* Returns the {@code Pipeline} associated with this {@code PCollection}.
*
* @return the {@code Pipeline} associated with this instance
*/
Pipeline getPipeline();
/**
* Returns a {@code PCollection} instance that acts as the union of this
* {@code PCollection} and the given {@code PCollection}.
*
* @param other
* The {@code PCollection} to combine with this one
* @return a {@code PCollection} representing the union of both inputs
*/
PCollection<S> union(PCollection<S> other);
/**
* Returns a {@code PCollection} instance that acts as the union of this
* {@code PCollection} and the input {@code PCollection}s.
* <p>
* Note: the parameter is a varargs array of a generic type, so the Java
* compiler may report an unchecked generic-array-creation warning at call
* sites.
*
* @param collections
* The {@code PCollection}s to combine with this one
* @return a {@code PCollection} representing the union of all inputs
*/
PCollection<S> union(PCollection<S>... collections);
/**
* Applies the given doFn to the elements of this {@code PCollection} and
* returns a new {@code PCollection} that is the output of this processing.
*
* @param <T>
* The element type of the resulting {@code PCollection}
* @param doFn
* The {@code DoFn} to apply
* @param type
* The {@link PType} of the resulting {@code PCollection}
* @return a new {@code PCollection}
*/
<T> PCollection<T> parallelDo(DoFn<S, T> doFn, PType<T> type);
/**
* Applies the given doFn to the elements of this {@code PCollection} and
* returns a new {@code PCollection} that is the output of this processing.
*
* @param <T>
* The element type of the resulting {@code PCollection}
* @param name
* An identifier for this processing step, useful for debugging
* @param doFn
* The {@code DoFn} to apply
* @param type
* The {@link PType} of the resulting {@code PCollection}
* @return a new {@code PCollection}
*/
<T> PCollection<T> parallelDo(String name, DoFn<S, T> doFn, PType<T> type);
/**
* Applies the given doFn to the elements of this {@code PCollection} and
* returns a new {@code PCollection} that is the output of this processing.
*
* @param <T>
* The element type of the resulting {@code PCollection}
* @param name
* An identifier for this processing step, useful for debugging
* @param doFn
* The {@code DoFn} to apply
* @param type
* The {@link PType} of the resulting {@code PCollection}
* @param options
* Optional information that is needed for certain pipeline operations
* @return a new {@code PCollection}
*/
<T> PCollection<T> parallelDo(String name, DoFn<S, T> doFn, PType<T> type,
ParallelDoOptions options);
/**
* Variant of {@link #parallelDo(DoFn, PType)} that returns a {@code PTable}
* instance instead of a {@code PCollection}.
*
* @param <K>
* The key type of the resulting {@code PTable}
* @param <V>
* The value type of the resulting {@code PTable}
* @param doFn
* The {@code DoFn} to apply
* @param type
* The {@link PTableType} of the resulting {@code PTable}
* @return a new {@code PTable}
*/
<K, V> PTable<K, V> parallelDo(DoFn<S, Pair<K, V>> doFn, PTableType<K, V> type);
/**
* Variant of {@link #parallelDo(String, DoFn, PType)} that returns a
* {@code PTable} instance instead of a {@code PCollection}.
*
* @param <K>
* The key type of the resulting {@code PTable}
* @param <V>
* The value type of the resulting {@code PTable}
* @param name
* An identifier for this processing step
* @param doFn
* The {@code DoFn} to apply
* @param type
* The {@link PTableType} of the resulting {@code PTable}
* @return a new {@code PTable}
*/
<K, V> PTable<K, V> parallelDo(String name, DoFn<S, Pair<K, V>> doFn, PTableType<K, V> type);
/**
* Variant of {@link #parallelDo(String, DoFn, PType, ParallelDoOptions)} that
* returns a {@code PTable} instance instead of a {@code PCollection}.
*
* @param <K>
* The key type of the resulting {@code PTable}
* @param <V>
* The value type of the resulting {@code PTable}
* @param name
* An identifier for this processing step
* @param doFn
* The {@code DoFn} to apply
* @param type
* The {@link PTableType} of the resulting {@code PTable}
* @param options
* Optional information that is needed for certain pipeline operations
* @return a new {@code PTable}
*/
<K, V> PTable<K, V> parallelDo(String name, DoFn<S, Pair<K, V>> doFn, PTableType<K, V> type,
ParallelDoOptions options);
/**
* Write the contents of this {@code PCollection} to the given {@code Target},
* using the storage format specified by the target.
*
* @param target
* The target to write to
* @return a {@code PCollection} with the same element type. NOTE(review):
* presumably this instance, to allow call chaining -- confirm against
* the implementations.
*/
PCollection<S> write(Target target);
/**
* Write the contents of this {@code PCollection} to the given {@code Target},
* using the given {@code Target.WriteMode} to handle existing
* targets.
*
* @param target
* The target
* @param writeMode
* The rule for handling existing outputs at the target location
* @return a {@code PCollection} with the same element type. NOTE(review):
* presumably this instance, to allow call chaining -- confirm against
* the implementations.
*/
PCollection<S> write(Target target, Target.WriteMode writeMode);
/**
* Returns a reference to the data set represented by this PCollection that
* may be used by the client to read the data locally.
*
* @return an {@code Iterable} over the elements of this {@code PCollection}
*/
Iterable<S> materialize();
/**
* Marks this data as cached using the default {@link CachingOptions}. Cached {@code PCollection}s will only
* be processed once, and then their contents will be saved so that downstream code can process them many times.
*
* @return this {@code PCollection} instance
*/
PCollection<S> cache();
/**
* Marks this data as cached using the given {@code CachingOptions}. Cached {@code PCollection}s will only
* be processed once, and then their contents will be saved so that downstream code can process them many times.
*
* @param options the options that control the cache settings for the data
* @return this {@code PCollection} instance
*/
PCollection<S> cache(CachingOptions options);
/**
* Returns the values of this {@code PCollection} as an in-memory {@link Collection}, wrapped in a
* {@code PObject}.
*
* @return A {@code PObject} encapsulating an in-memory {@link Collection} containing the values
* of this {@code PCollection}.
*/
PObject<Collection<S>> asCollection();
/**
* Returns the first element of this {@code PCollection}, wrapped in a {@code PObject}.
*
* @return A {@code PObject} containing the first element of this {@code PCollection}.
*/
PObject<S> first();
/**
* Adds the materialized data in this {@code PCollection} as a dependency to the given
* {@code PipelineCallable} and registers it with the {@code Pipeline} associated with this
* instance.
*
* @param <Output> the type of the value produced by the given {@code PipelineCallable}
* @param label the label to use inside of the PipelineCallable for referencing this PCollection
* @param pipelineCallable the function itself
*
* @return The value of the {@code getOutput} function on the given argument.
*/
<Output> Output sequentialDo(String label, PipelineCallable<Output> pipelineCallable);
/**
* Returns a reference to the data in this instance that can be read from a job running
* on a cluster.
*
* @param materialize If true, materialize this data before returning a reference to it
* @return A {@code ReadableData} reference to the data in this instance
*/
ReadableData<S> asReadable(boolean materialize);
/**
* Returns the {@code PType} of this {@code PCollection}.
*
* @return the {@code PType} of this instance
*/
PType<S> getPType();
/**
* Returns the {@code PTypeFamily} of this {@code PCollection}.
*
* @return the {@code PTypeFamily} of this instance
*/
PTypeFamily getTypeFamily();
/**
* Returns the size of the data represented by this {@code PCollection} in
* bytes.
*
* @return the size of the data, in bytes
*/
long getSize();
/**
* Returns the number of elements represented by this {@code PCollection}.
*
* @return A {@code PObject} containing the number of elements in this {@code PCollection}.
*/
PObject<Long> length();
/**
* Returns a shorthand name for this PCollection.
*
* @return the shorthand name of this instance
*/
String getName();
/**
* Apply the given filter function to this instance and return the resulting
* {@code PCollection}.
*
* @param filterFn
* The {@code FilterFn} to apply
* @return the resulting {@code PCollection}
*/
PCollection<S> filter(FilterFn<S> filterFn);
/**
* Apply the given filter function to this instance and return the resulting
* {@code PCollection}.
*
* @param name
* An identifier for this processing step
* @param filterFn
* The {@code FilterFn} to apply
* @return the resulting {@code PCollection}
*/
PCollection<S> filter(String name, FilterFn<S> filterFn);
/**
* Apply the given map function to each element of this instance in order to
* create a {@code PTable}.
*
* @param <K>
* The key type of the resulting {@code PTable}
* @param extractKeyFn
* The {@code MapFn} to apply
* @param keyType
* The {@link PType} of the keys
* @return the resulting {@code PTable}, with keys of type {@code K} and
* values of type {@code S}
*/
<K> PTable<K, S> by(MapFn<S, K> extractKeyFn, PType<K> keyType);
/**
* Apply the given map function to each element of this instance in order to
* create a {@code PTable}.
*
* @param <K>
* The key type of the resulting {@code PTable}
* @param name
* An identifier for this processing step
* @param extractKeyFn
* The {@code MapFn} to apply
* @param keyType
* The {@link PType} of the keys
* @return the resulting {@code PTable}, with keys of type {@code K} and
* values of type {@code S}
*/
<K> PTable<K, S> by(String name, MapFn<S, K> extractKeyFn, PType<K> keyType);
/**
* Returns a {@code PTable} instance that contains the counts of each unique
* element of this PCollection.
*
* @return a {@code PTable} from each unique element to its count
*/
PTable<S, Long> count();
/**
* Returns a {@code PObject} of the maximum element of this instance.
*
* @return a {@code PObject} containing the maximum element
*/
PObject<S> max();
/**
* Returns a {@code PObject} of the minimum element of this instance.
*
* @return a {@code PObject} containing the minimum element
*/
PObject<S> min();
/**
* Returns a {@code PCollection} that contains the result of aggregating all values in this instance.
*
* @param aggregator
* The {@code Aggregator} to apply to the values of this instance
* @return a {@code PCollection} containing the aggregated result
*/
PCollection<S> aggregate(Aggregator<S> aggregator);
}