flink-core/src/main/java/org/apache/flink/api/common/functions/AggregateFunction.java - flink - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.flink.api.common.functions;

 import org.apache.flink.annotation.PublicEvolving;

 import java.io.Serializable;

 /**
  * The {@code AggregateFunction} is a flexible aggregation function, characterized by the
  * following features:
  *
  * <ul>
  *     <li>The aggregates may use different types for input values, intermediate aggregates,
  *         and result type, to support a wide range of aggregation types.</li>
  *
  *     <li>Support for distributive aggregations: Different intermediate aggregates can be
  *         merged together, to allow for pre-aggregation/final-aggregation optimizations.</li>
  * </ul>
  *
  * <p>The {@code AggregateFunction}'s intermediate aggregate (in-progress aggregation state)
  * is called the <i>accumulator</i>. Values are added to the accumulator, and final aggregates are
  * obtained by finalizing the accumulator state. This supports aggregation functions where the
  * intermediate state needs to be different than the aggregated values and the final result type,
  * such as for example <i>average</i> (which typically keeps a count and sum).
  * Merging intermediate aggregates (partial aggregates) means merging the accumulators.
  *
  * <p>The {@code AggregateFunction} itself is stateless. To allow a single
  * {@code AggregateFunction} instance to maintain multiple aggregates (such as one aggregate
  * per key), the {@code AggregateFunction} creates a new accumulator whenever a new
  * aggregation is started.
  *
  * <p>Aggregate functions must be {@link Serializable} because they are sent around
  * between distributed processes during distributed execution.
  *
  * <h1>Example: Average and Weighted Average</h1>
  *
  * <pre>{@code
  * // the accumulator, which holds the state of the in-flight aggregate
  * public class AverageAccumulator {
  *     long count;
  *     long sum;
  * }
  *
  * // implementation of an aggregation function for an 'average'
  * public class Average implements AggregateFunction<Integer, AverageAccumulator, Double> {
  *
  *     public AverageAccumulator createAccumulator() {
  *         return new AverageAccumulator();
  *     }
  *
  *     public AverageAccumulator merge(AverageAccumulator a, AverageAccumulator b) {
  *         a.count += b.count;
  *         a.sum += b.sum;
  *         return a;
  *     }
  *
  *     // add(...) must return the accumulator; here the input accumulator is updated and reused
  *     public AverageAccumulator add(Integer value, AverageAccumulator acc) {
  *         acc.sum += value;
  *         acc.count++;
  *         return acc;
  *     }
  *
  *     public Double getResult(AverageAccumulator acc) {
  *         return acc.sum / (double) acc.count;
  *     }
  * }
  *
  * // implementation of a weighted average
  * // this reuses the same accumulator type as the aggregate function for 'average'
  * public class WeightedAverage implements AggregateFunction<Datum, AverageAccumulator, Double> {
  *
  *     public AverageAccumulator createAccumulator() {
  *         return new AverageAccumulator();
  *     }
  *
  *     public AverageAccumulator merge(AverageAccumulator a, AverageAccumulator b) {
  *         a.count += b.count;
  *         a.sum += b.sum;
  *         return a;
  *     }
  *
  *     public AverageAccumulator add(Datum value, AverageAccumulator acc) {
  *         acc.count += value.getWeight();
  *         acc.sum += value.getValue();
  *         return acc;
  *     }
  *
  *     public Double getResult(AverageAccumulator acc) {
  *         return acc.sum / (double) acc.count;
  *     }
  * }
  * }</pre>
  *
  * @param <IN>  The type of the values that are aggregated (input values)
  * @param <ACC> The type of the accumulator (intermediate aggregate state).
  * @param <OUT> The type of the aggregated result
  */
 @PublicEvolving
 public interface AggregateFunction<IN, ACC, OUT> extends Function, Serializable {

 	/**
 	 * Creates a new accumulator, starting a new aggregate.
 	 *
 	 * <p>The new accumulator is typically meaningless unless a value is added
 	 * via {@link #add(Object, Object)}.
 	 *
 	 * <p>The accumulator is the state of a running aggregation. When a program has multiple
 	 * aggregates in progress (such as per key and window), the size of the state (per key
 	 * and window) is the size of the accumulator.
 	 *
 	 * @return A new accumulator, corresponding to an empty aggregate.
 	 */
 	ACC createAccumulator();

 	/**
 	 * Adds the given input value to the given accumulator, returning the
 	 * new accumulator value.
 	 *
 	 * <p>For efficiency, the input accumulator may be modified and returned.
 	 *
 	 * @param value The value to add
 	 * @param accumulator The accumulator to add the value to
 	 * @return The accumulator with the value added; this may be the (modified) input
 	 *         accumulator itself
 	 */
 	ACC add(IN value, ACC accumulator);

 	/**
 	 * Gets the result of the aggregation from the accumulator.
 	 *
 	 * @param accumulator The accumulator of the aggregation
 	 * @return The final aggregation result.
 	 */
 	OUT getResult(ACC accumulator);

 	/**
 	 * Merges two accumulators, returning an accumulator with the merged state.
 	 *
 	 * <p>This function may reuse any of the given accumulators as the target for the merge
 	 * and return that. The assumption is that the given accumulators will not be used any
 	 * more after having been passed to this function.
 	 *
 	 * @param a An accumulator to merge
 	 * @param b Another accumulator to merge
 	 *
 	 * @return The accumulator with the merged state
 	 */
 	ACC merge(ACC a, ACC b);
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.flink.api.common.functions;

	import org.apache.flink.annotation.PublicEvolving;

	import java.io.Serializable;

	/**
	* The {@code AggregateFunction} is a flexible aggregation function, characterized by the
	* following features:
	*
	* <ul>
	* <li>The aggregates may use different types for input values, intermediate aggregates,
	* and result type, to support a wide range of aggregation types.</li>
	*
	* <li>Support for distributive aggregations: Different intermediate aggregates can be
	* merged together, to allow for pre-aggregation/final-aggregation optimizations.</li>
	* </ul>
	*
	* <p>The {@code AggregateFunction}'s intermediate aggregate (in-progress aggregation state)
	* is called the <i>accumulator</i>. Values are added to the accumulator, and final aggregates are
	* obtained by finalizing the accumulator state. This supports aggregation functions where the
	* intermediate state needs to be different than the aggregated values and the final result type,
	* such as for example <i>average</i> (which typically keeps a count and sum).
	* Merging intermediate aggregates (partial aggregates) means merging the accumulators.
	*
	* <p>The {@code AggregateFunction} itself is stateless. To allow a single
	* {@code AggregateFunction} instance to maintain multiple aggregates (such as one aggregate
	* per key), the {@code AggregateFunction} creates a new accumulator whenever a new
	* aggregation is started.
	*
	* <p>Aggregate functions must be {@link Serializable} because they are sent around
	* between distributed processes during distributed execution.
	*
	* <h1>Example: Average and Weighted Average</h1>
	*
	* <pre>{@code
	* // the accumulator, which holds the state of the in-flight aggregate
	* public class AverageAccumulator {
	*     long count;
	*     long sum;
	* }
	*
	* // implementation of an aggregation function for an 'average'
	* public class Average implements AggregateFunction<Integer, AverageAccumulator, Double> {
	*
	*     public AverageAccumulator createAccumulator() {
	*         return new AverageAccumulator();
	*     }
	*
	*     public AverageAccumulator merge(AverageAccumulator a, AverageAccumulator b) {
	*         a.count += b.count;
	*         a.sum += b.sum;
	*         return a;
	*     }
	*
	*     // add(...) must return the accumulator; here the input accumulator is updated and reused
	*     public AverageAccumulator add(Integer value, AverageAccumulator acc) {
	*         acc.sum += value;
	*         acc.count++;
	*         return acc;
	*     }
	*
	*     public Double getResult(AverageAccumulator acc) {
	*         return acc.sum / (double) acc.count;
	*     }
	* }
	*
	* // implementation of a weighted average
	* // this reuses the same accumulator type as the aggregate function for 'average'
	* public class WeightedAverage implements AggregateFunction<Datum, AverageAccumulator, Double> {
	*
	*     public AverageAccumulator createAccumulator() {
	*         return new AverageAccumulator();
	*     }
	*
	*     public AverageAccumulator merge(AverageAccumulator a, AverageAccumulator b) {
	*         a.count += b.count;
	*         a.sum += b.sum;
	*         return a;
	*     }
	*
	*     public AverageAccumulator add(Datum value, AverageAccumulator acc) {
	*         acc.count += value.getWeight();
	*         acc.sum += value.getValue();
	*         return acc;
	*     }
	*
	*     public Double getResult(AverageAccumulator acc) {
	*         return acc.sum / (double) acc.count;
	*     }
	* }
	* }</pre>
	*
	* @param <IN> The type of the values that are aggregated (input values)
	* @param <ACC> The type of the accumulator (intermediate aggregate state).
	* @param <OUT> The type of the aggregated result
	*/
	@PublicEvolving
	public interface AggregateFunction<IN, ACC, OUT> extends Function, Serializable {

	/**
	* Creates a new accumulator, starting a new aggregate.
	*
	* <p>The new accumulator is typically meaningless unless a value is added
	* via {@link #add(Object, Object)}.
	*
	* <p>The accumulator is the state of a running aggregation. When a program has multiple
	* aggregates in progress (such as per key and window), the size of the state (per key
	* and window) is the size of the accumulator.
	*
	* @return A new accumulator, corresponding to an empty aggregate.
	*/
	ACC createAccumulator();

	/**
	* Adds the given input value to the given accumulator, returning the
	* new accumulator value.
	*
	* <p>For efficiency, the input accumulator may be modified and returned.
	*
	* @param value The value to add
	* @param accumulator The accumulator to add the value to
	* @return The accumulator with the value added; this may be the (modified) input
	* accumulator itself
	*/
	ACC add(IN value, ACC accumulator);

	/**
	* Gets the result of the aggregation from the accumulator.
	*
	* @param accumulator The accumulator of the aggregation
	* @return The final aggregation result.
	*/
	OUT getResult(ACC accumulator);

	/**
	* Merges two accumulators, returning an accumulator with the merged state.
	*
	* <p>This function may reuse any of the given accumulators as the target for the merge
	* and return that. The assumption is that the given accumulators will not be used any
	* more after having been passed to this function.
	*
	* @param a An accumulator to merge
	* @param b Another accumulator to merge
	*
	* @return The accumulator with the merged state
	*/
	ACC merge(ACC a, ACC b);
	}