| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| |
| package org.apache.flink.optimizer.costs; |
| |
| import java.util.Iterator; |
| |
| import org.apache.flink.optimizer.CompilerException; |
| import org.apache.flink.optimizer.dag.EstimateProvider; |
| import org.apache.flink.optimizer.dag.TempMode; |
| import org.apache.flink.optimizer.plan.Channel; |
| import org.apache.flink.optimizer.plan.PlanNode; |
| |
| /** |
| * Abstract base class for a cost estimator. Defines the cost estimation methods and implements the basic |
| * computation that determines an operator's cost by adding its input shipping costs, input local processing |
| * costs, and driver costs. |
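| * |
| * <p>A minimal usage sketch. Illustrative only: {@code DefaultCostEstimator} is assumed to be the |
| * concrete implementation used by the optimizer, and {@code candidatePlanNodes} is a hypothetical |
| * collection of plan nodes whose inputs and ship strategies have already been set. |
| * <pre>{@code |
| * CostEstimator estimator = new DefaultCostEstimator(); |
| * for (PlanNode node : candidatePlanNodes) { |
| *     estimator.costOperator(node); // computes and sets the node's total costs |
| * } |
| * }</pre> |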
| */ |
| public abstract class CostEstimator { |
| |
| public abstract void addRandomPartitioningCost(EstimateProvider estimates, Costs costs); |
| |
| public abstract void addHashPartitioningCost(EstimateProvider estimates, Costs costs); |
| |
| public abstract void addRangePartitionCost(EstimateProvider estimates, Costs costs); |
| |
| public abstract void addBroadcastCost(EstimateProvider estimates, int replicationFactor, Costs costs); |
| |
| // ------------------------------------------------------------------------ |
| |
| public abstract void addFileInputCost(long fileSizeInBytes, Costs costs); |
| |
| public abstract void addLocalSortCost(EstimateProvider estimates, Costs costs); |
| |
| public abstract void addLocalMergeCost(EstimateProvider estimates1, EstimateProvider estimates2, Costs costs, int costWeight); |
| |
| public abstract void addHybridHashCosts(EstimateProvider buildSide, EstimateProvider probeSide, Costs costs, int costWeight); |
| |
| public abstract void addCachedHybridHashCosts(EstimateProvider buildSide, EstimateProvider probeSide, Costs costs, int costWeight); |
| |
| public abstract void addStreamedNestedLoopsCosts(EstimateProvider outerSide, EstimateProvider innerSide, long bufferSize, Costs costs, int costWeight); |
| |
| public abstract void addBlockNestedLoopsCosts(EstimateProvider outerSide, EstimateProvider innerSide, long blockSize, Costs costs, int costWeight); |
| |
| // ------------------------------------------------------------------------ |
| |
| public abstract void addArtificialDamCost(EstimateProvider estimates, long bufferSize, Costs costs); |
| |
| // ------------------------------------------------------------------------ |
| |
| /** |
| * This method computes the cost of an operator. The cost is composed of the costs for shipping |
| * the inputs, processing the inputs locally, and running the operator itself (the driver). |
| * |
| * <p>It requires that all inputs are set and that each input has a ship strategy other than |
| * <tt>NONE</tt>. |
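| * |
| * <p>Conceptually, the accumulation performed here looks roughly like the following sketch |
| * ({@code shipCosts}, {@code localCosts}, and {@code driverCosts} are illustrative helpers, not |
| * methods of this class): |
| * <pre>{@code |
| * Costs total = new Costs(); |
| * for (Channel c : n.getInputs()) { |
| *     Costs inputCosts = shipCosts(c);                // ship strategy costs of this input |
| *     inputCosts.addCosts(localCosts(c));             // plus local strategy (e.g. sort) costs |
| *     if (c.isOnDynamicPath()) { |
| *         inputCosts.multiplyWith(c.getCostWeight()); // weight inputs on dynamic paths |
| *     } |
| *     total.addCosts(inputCosts); |
| * } |
| * total.addCosts(driverCosts(n));                     // driver costs, weighted the same way |
| * n.setCosts(total); |
| * }</pre> |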
| * |
| * @param n The node to compute the costs for. |
| */ |
| public void costOperator(PlanNode n) { |
| // initialize costs objects with no costs |
| final Costs totalCosts = new Costs(); |
| final long availableMemory = n.getGuaranteedAvailableMemory(); |
| |
| // add the shipping strategy costs |
| for (Channel channel : n.getInputs()) { |
| final Costs costs = new Costs(); |
| |
| // Plans that apply the same strategies, but at different points, are equally expensive. |
| // For example, if a partitioning can be pushed below a Map function, there is often no |
| // difference in plan costs between the pushed-down version and the version that partitions |
| // after the Mapper. However, in those cases we want the expensive strategy to appear |
| // later in the plan, as data reduction often occurs by large factors, while blowup is |
| // rare and typically by smaller fractions. We achieve this by adding a small penalty to |
| // the FORWARD strategy, weighted by the current plan depth (steps to the earliest data |
| // source). That way, later FORWARDs are more expensive than earlier ones. |
| // Note that this only applies to the heuristic costs. |
| |
| switch (channel.getShipStrategy()) { |
| case NONE: |
| throw new CompilerException( |
| "Cannot determine costs: Shipping strategy has not been set for an input."); |
| case FORWARD: |
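| // the heuristic FORWARD penalty described above is currently disabled: |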
| // costs.addHeuristicNetworkCost(channel.getMaxDepth()); |
| break; |
| case PARTITION_RANDOM: |
| addRandomPartitioningCost(channel, costs); |
| break; |
| case PARTITION_HASH: |
| case PARTITION_CUSTOM: |
| addHashPartitioningCost(channel, costs); |
| break; |
| case PARTITION_RANGE: |
| addRangePartitionCost(channel, costs); |
| break; |
| case BROADCAST: |
| addBroadcastCost(channel, channel.getReplicationFactor(), costs); |
| break; |
| case PARTITION_FORCED_REBALANCE: |
| addRandomPartitioningCost(channel, costs); |
| break; |
| default: |
| throw new CompilerException("Unknown shipping strategy for input: " + channel.getShipStrategy()); |
| } |
| |
| switch (channel.getLocalStrategy()) { |
| case NONE: |
| break; |
| case SORT: |
| case COMBININGSORT: |
| addLocalSortCost(channel, costs); |
| break; |
| default: |
| throw new CompilerException("Unsupported local strategy for input: " + channel.getLocalStrategy()); |
| } |
| |
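| // if the channel dams the data flow (temp mode other than NONE), account for the artificial dam |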
| if (channel.getTempMode() != null && channel.getTempMode() != TempMode.NONE) { |
| addArtificialDamCost(channel, 0, costs); |
| } |
| |
| // adjust with the cost weight factor |
| if (channel.isOnDynamicPath()) { |
| costs.multiplyWith(channel.getCostWeight()); |
| } |
| |
| totalCosts.addCosts(costs); |
| } |
| |
| Channel firstInput = null; |
| Channel secondInput = null; |
| Costs driverCosts = new Costs(); |
| int costWeight = 1; |
| |
| // adjust with the cost weight factor |
| if (n.isOnDynamicPath()) { |
| costWeight = n.getCostWeight(); |
| } |
| |
| // get the inputs, if we have some |
| { |
| Iterator<Channel> channels = n.getInputs().iterator(); |
| if (channels.hasNext()) { |
| firstInput = channels.next(); |
| } |
| if (channels.hasNext()) { |
| secondInput = channels.next(); |
| } |
| } |
| |
| // determine the costs of the driver strategy (local processing) |
| switch (n.getDriverStrategy()) { |
| case NONE: |
| case UNARY_NO_OP: |
| case BINARY_NO_OP: |
| case MAP: |
| case MAP_PARTITION: |
| case FLAT_MAP: |
| |
| case ALL_GROUP_REDUCE: |
| case ALL_REDUCE: |
| // these operations do not do any actual grouping, since every element is in the same single group |
| |
| case CO_GROUP: |
| case CO_GROUP_RAW: |
| case SORTED_GROUP_REDUCE: |
| case SORTED_REDUCE: |
| // grouping or co-grouping over sorted streams comes for free |
| |
| case SORTED_GROUP_COMBINE: |
| // partial grouping is always local and main memory resident. we should add a relative cpu cost at some point |
| case ALL_GROUP_COMBINE: |
| |
| case UNION: |
| // pipelined local union comes for free |
| |
| break; |
| case INNER_MERGE: |
| case FULL_OUTER_MERGE: |
| case LEFT_OUTER_MERGE: |
| case RIGHT_OUTER_MERGE: |
| addLocalMergeCost(firstInput, secondInput, driverCosts, costWeight); |
| break; |
| case HYBRIDHASH_BUILD_FIRST: |
| case RIGHT_HYBRIDHASH_BUILD_FIRST: |
| case LEFT_HYBRIDHASH_BUILD_FIRST: |
| case FULL_OUTER_HYBRIDHASH_BUILD_FIRST: |
| addHybridHashCosts(firstInput, secondInput, driverCosts, costWeight); |
| break; |
| case HYBRIDHASH_BUILD_SECOND: |
| case LEFT_HYBRIDHASH_BUILD_SECOND: |
| case RIGHT_HYBRIDHASH_BUILD_SECOND: |
| case FULL_OUTER_HYBRIDHASH_BUILD_SECOND: |
| addHybridHashCosts(secondInput, firstInput, driverCosts, costWeight); |
| break; |
| case HYBRIDHASH_BUILD_FIRST_CACHED: |
| addCachedHybridHashCosts(firstInput, secondInput, driverCosts, costWeight); |
| break; |
| case HYBRIDHASH_BUILD_SECOND_CACHED: |
| addCachedHybridHashCosts(secondInput, firstInput, driverCosts, costWeight); |
| break; |
| case NESTEDLOOP_BLOCKED_OUTER_FIRST: |
| addBlockNestedLoopsCosts(firstInput, secondInput, availableMemory, driverCosts, costWeight); |
| break; |
| case NESTEDLOOP_BLOCKED_OUTER_SECOND: |
| addBlockNestedLoopsCosts(secondInput, firstInput, availableMemory, driverCosts, costWeight); |
| break; |
| case NESTEDLOOP_STREAMED_OUTER_FIRST: |
| addStreamedNestedLoopsCosts(firstInput, secondInput, availableMemory, driverCosts, costWeight); |
| break; |
| case NESTEDLOOP_STREAMED_OUTER_SECOND: |
| addStreamedNestedLoopsCosts(secondInput, firstInput, availableMemory, driverCosts, costWeight); |
| break; |
| default: |
| throw new CompilerException("Unknown driver strategy: " + n.getDriverStrategy().name()); |
| } |
| |
| totalCosts.addCosts(driverCosts); |
| n.setCosts(totalCosts); |
| } |
| } |