// blob: 114c65501bc76ce934a2b3becfa314f6e738b3b1 [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.crunch;
import java.util.Collection;
import java.util.Map;
import org.apache.crunch.types.PTableType;
import org.apache.crunch.types.PType;
/**
 * An immutable, distributed multi-map of keys to values, expressed as a
 * specialization of {@code PCollection} whose elements are
 * {@code Pair<K, V>} instances.
 *
 * @param <K> the key type
 * @param <V> the value type
 */
public interface PTable<K, V> extends PCollection<Pair<K, V>> {

  /**
   * Combines this {@code PTable} with one other {@code PTable}.
   *
   * @param other
   *          The table to union with this one
   * @return a {@code PTable} that acts as the union of both tables
   */
  PTable<K, V> union(PTable<K, V> other);

  /**
   * Combines this {@code PTable} with any number of input {@code PTable}s.
   *
   * @param others
   *          The tables to union with this one
   * @return a {@code PTable} that acts as the union of all the tables
   */
  PTable<K, V> union(PTable<K, V>... others);

  /**
   * Groups the entries of this table by key.
   *
   * @return a {@code PGroupedTable} instance that represents the grouping
   */
  PGroupedTable<K, V> groupByKey();

  /**
   * Groups the entries of this table by key, splitting the data into the
   * requested number of partitions.
   *
   * @param numPartitions
   *          The number of partitions for the data.
   * @return a {@code PGroupedTable} instance that represents this grouping
   */
  PGroupedTable<K, V> groupByKey(int numPartitions);

  /**
   * Groups the entries of this table by key, with the supplied
   * {@code GroupingOptions} controlling how the grouping is executed.
   *
   * @param options
   *          The grouping options to use
   * @return a {@code PGroupedTable} instance that represents the grouping
   */
  PGroupedTable<K, V> groupByKey(GroupingOptions options);

  /**
   * Writes the contents of this {@code PTable} to the given {@code Target}.
   *
   * @param target
   *          The destination to write to
   */
  PTable<K, V> write(Target target);

  /**
   * Writes the contents of this {@code PTable} to the given {@code Target};
   * the given {@code Target.WriteMode} determines how existing targets are
   * handled.
   *
   * @param target
   *          The destination to write to
   * @param writeMode
   *          How to handle a target that already exists
   */
  PTable<K, V> write(Target target, Target.WriteMode writeMode);

  /**
   * Caching operation typed to return a {@code PTable}.
   * <p>
   * NOTE(review): the original declaration carries no documentation; this is
   * presumably the table-typed variant of a caching method declared on
   * {@code PCollection} -- confirm the exact semantics there.
   */
  PTable<K, V> cache();

  /**
   * Caching operation typed to return a {@code PTable}, configured by the
   * given {@code CachingOptions}.
   * <p>
   * NOTE(review): the original declaration carries no documentation; this is
   * presumably the table-typed variant of a caching method declared on
   * {@code PCollection} -- confirm the exact semantics there.
   *
   * @param options
   *          The caching options to use
   */
  PTable<K, V> cache(CachingOptions options);

  /**
   * Exposes the {@code PTableType} of this {@code PTable}.
   */
  PTableType<K, V> getPTableType();

  /**
   * Exposes the {@code PType} of the key.
   */
  PType<K> getKeyType();

  /**
   * Exposes the {@code PType} of the value.
   */
  PType<V> getValueType();

  /**
   * Transforms the values of this table with the given function while
   * leaving the keys as they are.
   *
   * @param mapFn
   *          The function applied to each value
   * @param ptype
   *          The {@code PType} of the mapped values
   * @return a {@code PTable} with the same keys and the mapped values
   */
  <U> PTable<K, U> mapValues(MapFn<V, U> mapFn, PType<U> ptype);

  /**
   * Transforms the values of this table with the given function while
   * leaving the keys as they are.
   *
   * @param name
   *          A name for this operation
   * @param mapFn
   *          The function applied to each value
   * @param ptype
   *          The {@code PType} of the mapped values
   * @return a {@code PTable} with the same keys and the mapped values
   */
  <U> PTable<K, U> mapValues(String name, MapFn<V, U> mapFn, PType<U> ptype);

  /**
   * Transforms the keys of this table with the given function while leaving
   * the values as they are.
   *
   * @param mapFn
   *          The function applied to each key
   * @param ptype
   *          The {@code PType} of the mapped keys
   * @return a {@code PTable} with the mapped keys and the same values
   */
  <K2> PTable<K2, V> mapKeys(MapFn<K, K2> mapFn, PType<K2> ptype);

  /**
   * Transforms the keys of this table with the given function while leaving
   * the values as they are.
   *
   * @param name
   *          A name for this operation
   * @param mapFn
   *          The function applied to each key
   * @param ptype
   *          The {@code PType} of the mapped keys
   * @return a {@code PTable} with the mapped keys and the same values
   */
  <K2> PTable<K2, V> mapKeys(String name, MapFn<K, K2> mapFn, PType<K2> ptype);

  /**
   * Gathers every value that shares a key into a single key-value pair of
   * the returned {@code PTable}.
   */
  PTable<K, Collection<V>> collectValues();

  /**
   * Runs the given filter function over this instance and returns the
   * resulting {@code PTable}.
   *
   * @param filterFn
   *          The {@code FilterFn} to apply
   */
  PTable<K, V> filter(FilterFn<Pair<K, V>> filterFn);

  /**
   * Runs the given filter function over this instance and returns the
   * resulting {@code PTable}.
   *
   * @param name
   *          An identifier for this processing step
   * @param filterFn
   *          The {@code FilterFn} to apply
   */
  PTable<K, V> filter(String name, FilterFn<Pair<K, V>> filterFn);

  /**
   * Selects the pairs of this {@code PTable} that have the largest value
   * field.
   *
   * @param count
   *          The number of pairs to return
   * @return a {@code PTable} made up of the selected pairs
   */
  PTable<K, V> top(int count);

  /**
   * Selects the pairs of this {@code PTable} that have the smallest value
   * field.
   *
   * @param count
   *          The number of pairs to return
   * @return a {@code PTable} made up of the selected pairs
   */
  PTable<K, V> bottom(int count);

  /**
   * Inner-joins this table with the table passed as an argument, matching
   * on their common keys.
   *
   * @param other
   *          The table to join with
   */
  <U> PTable<K, Pair<V, U>> join(PTable<K, U> other);

  /**
   * Co-groups this table with the given table.
   * <p>
   * <b>Note:</b> If the given table contains keys that are not present in
   * this {@code PTable}, an empty collection is set for the relationship.
   *
   * @param other
   *          The table to co-group with
   */
  <U> PTable<K, Pair<Collection<V>, Collection<U>>> cogroup(PTable<K, U> other);

  /**
   * Produces a {@link PCollection} made up of the keys in this
   * {@code PTable}.
   */
  PCollection<K> keys();

  /**
   * Produces a {@link PCollection} made up of the values in this
   * {@code PTable}.
   */
  PCollection<V> values();

  /**
   * Produces a {@code Map<K, V>} made up of the keys and values in this
   * {@code PTable}.
   * <p>
   * <b>Note:</b> Because a {@code PTable} is a multi-map (i.e. it can hold
   * multiple values for a single key), the contents of the returned map may
   * not be exactly the same as this {@code PTable}.
   */
  Map<K, V> materializeToMap();

  /**
   * Produces a {@link PObject} encapsulating a {@link Map} made up of the
   * keys and values in this {@code PTable}.
   * <p>
   * <b>Note:</b> Because a {@code PTable} is a multi-map (i.e. it can hold
   * multiple values for a single key), the contents of the returned map may
   * not be exactly the same as this {@code PTable}.
   * </p>
   *
   * @return The {@code PObject} encapsulating a {@code Map} made up of the
   *         keys and values in this {@code PTable}.
   */
  PObject<Map<K, V>> asMap();
}