/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tajo.engine.planner.global;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.tajo.ExecutionBlockId;
import org.apache.tajo.SessionVars;
import org.apache.tajo.algebra.JoinType;
import org.apache.tajo.catalog.*;
import org.apache.tajo.catalog.partition.PartitionMethodDesc;
import org.apache.tajo.catalog.proto.CatalogProtos;
import org.apache.tajo.common.TajoDataTypes;
import org.apache.tajo.conf.TajoConf;
import org.apache.tajo.engine.planner.global.builder.DistinctGroupbyBuilder;
import org.apache.tajo.engine.planner.global.rewriter.GlobalPlanRewriteEngine;
import org.apache.tajo.engine.planner.global.rewriter.GlobalPlanRewriteRuleProvider;
import org.apache.tajo.engine.query.QueryContext;
import org.apache.tajo.exception.NotImplementedException;
import org.apache.tajo.exception.TajoException;
import org.apache.tajo.exception.TajoInternalError;
import org.apache.tajo.exception.UnsupportedException;
import org.apache.tajo.plan.LogicalPlan;
import org.apache.tajo.plan.Target;
import org.apache.tajo.plan.expr.*;
import org.apache.tajo.plan.logical.*;
import org.apache.tajo.plan.rewrite.rules.ProjectionPushDownRule;
import org.apache.tajo.plan.util.PlannerUtil;
import org.apache.tajo.plan.visitor.BasicLogicalPlanVisitor;
import org.apache.tajo.storage.StorageConstants;
import org.apache.tajo.util.KeyValueSet;
import org.apache.tajo.util.ReflectionUtil;
import org.apache.tajo.util.TUtil;
import org.apache.tajo.worker.TajoWorker;
import java.io.IOException;
import java.util.*;
import static org.apache.tajo.conf.TajoConf.ConfVars;
import static org.apache.tajo.conf.TajoConf.ConfVars.GLOBAL_PLAN_REWRITE_RULE_PROVIDER_CLASS;
import static org.apache.tajo.plan.serder.PlanProto.ShuffleType.*;
/**
* Builds a distributed execution plan (a DAG of execution blocks connected by data channels)
* from a logical plan.
*/
public class GlobalPlanner {
private static final Log LOG = LogFactory.getLog(GlobalPlanner.class);
private final TajoConf conf;
private final String dataFormat;
private final String finalOutputDataFormat;
private final CatalogService catalog;
private final GlobalPlanRewriteEngine rewriteEngine;
@VisibleForTesting
public GlobalPlanner(final TajoConf conf, final CatalogService catalog) throws IOException {
this.conf = conf;
this.catalog = catalog;
this.dataFormat = conf.getVar(ConfVars.SHUFFLE_FILE_FORMAT).toUpperCase();
this.finalOutputDataFormat = conf.getVar(ConfVars.QUERY_OUTPUT_DEFAULT_FILE_FORMAT).toUpperCase();
Class<? extends GlobalPlanRewriteRuleProvider> clazz =
(Class<? extends GlobalPlanRewriteRuleProvider>) conf.getClassVar(GLOBAL_PLAN_REWRITE_RULE_PROVIDER_CLASS);
GlobalPlanRewriteRuleProvider provider = ReflectionUtil.newInstance(clazz, conf);
rewriteEngine = new GlobalPlanRewriteEngine();
rewriteEngine.addRewriteRule(provider.getRules());
}
public GlobalPlanner(final TajoConf conf, final TajoWorker.WorkerContext workerContext) throws IOException {
this(conf, workerContext.getCatalog());
}
public TajoConf getConf() {
return conf;
}
public CatalogService getCatalog() {
return catalog;
}
public String getDataFormat() {
return dataFormat;
}
public static class GlobalPlanContext {
MasterPlan plan;
Map<Integer, ExecutionBlock> execBlockMap = Maps.newHashMap();
public MasterPlan getPlan() {
return plan;
}
public Map<Integer, ExecutionBlock> getExecBlockMap() {
return execBlockMap;
}
}
/**
* Builds a master plan from the given logical plan.
*/
public void build(QueryContext queryContext, MasterPlan masterPlan) throws IOException, TajoException {
DistributedPlannerVisitor planner = new DistributedPlannerVisitor();
GlobalPlanContext globalPlanContext = new GlobalPlanContext();
globalPlanContext.plan = masterPlan;
LOG.info(masterPlan.getLogicalPlan());
// Copy the logical plan in order to preserve the original, since the distributed
// planner can modify its input logical plan.
LogicalNode inputPlan = PlannerUtil.clone(masterPlan.getLogicalPlan(),
masterPlan.getLogicalPlan().getRootBlock().getRoot());
// create a distributed execution plan by visiting each logical node.
// Its output is a graph, where each vertex is an execution block, and each edge is a data channel.
// MasterPlan contains them.
LogicalNode lastNode = planner.visit(globalPlanContext,
masterPlan.getLogicalPlan(), masterPlan.getLogicalPlan().getRootBlock(), inputPlan, new Stack<>());
ExecutionBlock childExecBlock = globalPlanContext.execBlockMap.get(lastNode.getPID());
ExecutionBlock terminalBlock;
// TODO - consider two terminal types: specified output or not
if (childExecBlock.getPlan() != null) {
terminalBlock = masterPlan.createTerminalBlock();
DataChannel finalChannel = new DataChannel(childExecBlock.getId(), terminalBlock.getId());
setFinalOutputChannel(finalChannel, lastNode.getOutSchema());
masterPlan.addConnect(finalChannel);
} else { // if one or more unions are terminal
terminalBlock = childExecBlock;
for (DataChannel outputChannel : masterPlan.getIncomingChannels(terminalBlock.getId())) {
setFinalOutputChannel(outputChannel, lastNode.getOutSchema());
}
}
masterPlan.setTerminal(terminalBlock);
LOG.info("\n\nNon-optimized master plan\n" + masterPlan.toString());
masterPlan = rewriteEngine.rewrite(queryContext, masterPlan);
LOG.info("\n\nOptimized master plan\n" + masterPlan.toString());
}
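/**
* Configures the given channel to deliver the final query output: no shuffle, a single output,
* and the default final output data format.
*/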
private void setFinalOutputChannel(DataChannel outputChannel, Schema outputSchema) {
outputChannel.setShuffleType(NONE_SHUFFLE);
outputChannel.setShuffleOutputNum(1);
outputChannel.setDataFormat(finalOutputDataFormat);
outputChannel.setSchema(outputSchema);
}
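/**
* Creates a {@link ScanNode} that reads the intermediate data delivered through the given channel.
* The channel's schema and data format define the virtual input table of the downstream block.
*/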
public static ScanNode buildInputExecutor(LogicalPlan plan, DataChannel channel) {
Preconditions.checkArgument(channel.getSchema() != null,
"Channel schema (" + channel.getSrcId().getId() + " -> " + channel.getTargetId().getId() +
") is not initialized");
TableMeta meta = new TableMeta(channel.getDataFormat(), new KeyValueSet());
TableDesc desc = new TableDesc(
channel.getSrcId().toString(), channel.getSchema(), meta, StorageConstants.LOCAL_FS_URI);
ScanNode scanNode = plan.createNode(ScanNode.class);
scanNode.init(desc);
return scanNode;
}
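/**
* Creates a hash-shuffle channel from one join child block to the parent join block. For non-cross
* joins, the join keys of the corresponding side become the shuffle keys.
*/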
private DataChannel createDataChannelFromJoin(ExecutionBlock leftBlock, ExecutionBlock rightBlock,
ExecutionBlock parent, JoinNode join, boolean leftTable) {
ExecutionBlock childBlock = leftTable ? leftBlock : rightBlock;
DataChannel channel = new DataChannel(childBlock, parent, HASH_SHUFFLE, 32);
channel.setDataFormat(dataFormat);
if (join.getJoinType() != JoinType.CROSS) {
// Shuffle keys must not include theta-join conditions because Tajo supports only equi-joins.
Column [][] joinColumns = PlannerUtil.joinJoinKeyForEachTable(join.getJoinQual(),
leftBlock.getPlan().getOutSchema(), rightBlock.getPlan().getOutSchema(), false);
if (leftTable) {
channel.setShuffleKeys(joinColumns[0]);
} else {
channel.setShuffleKeys(joinColumns[1]);
}
}
return channel;
}
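/**
* Builds the execution blocks for a symmetric repartition join. When neither child is a union, a
* new block is created and both children are connected to it through hash-shuffle channels; when a
* child is a union, its block is reused as the join block as explained below.
*/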
private ExecutionBlock buildJoinPlan(GlobalPlanContext context, JoinNode joinNode,
ExecutionBlock leftBlock, ExecutionBlock rightBlock) throws TajoException {
MasterPlan masterPlan = context.plan;
ExecutionBlock currentBlock;
LogicalNode leftNode = joinNode.getLeftChild();
LogicalNode rightNode = joinNode.getRightChild();
// symmetric repartition join
boolean leftUnion = leftNode.getType() == NodeType.TABLE_SUBQUERY &&
((TableSubQueryNode)leftNode).getSubQuery().getType() == NodeType.UNION;
boolean rightUnion = rightNode.getType() == NodeType.TABLE_SUBQUERY &&
((TableSubQueryNode)rightNode).getSubQuery().getType() == NodeType.UNION;
if (leftUnion || rightUnion) { // if one of child execution block is union
/*
A join between tableC and the union of tableA and tableB would naively yield the following
physical plan, but the union execution block is unnecessary.
|-eb_0001_000006 (Terminal)
  |-eb_0001_000005 (Join eb_0001_000003, eb_0001_000004)
    |-eb_0001_000004 (Scan TableC)
    |-eb_0001_000003 (Union TableA, TableB)
      |-eb_0001_000002 (Scan TableB)
      |-eb_0001_000001 (Scan TableA)
The above plan can be rewritten into the following plan.
|-eb_0001_000005 (Terminal)
  |-eb_0001_000003 (Join [eb_0001_000001, eb_0001_000002], eb_0001_000004)
    |-eb_0001_000004 (Scan TableC)
    |-eb_0001_000002 (Scan TableB)
    |-eb_0001_000001 (Scan TableA)
eb_0001_000003's left child should be eb_0001_000001 + eb_0001_000002, and its right child should be
eb_0001_000004. eb_0001_000001 acts as the representative of eb_0001_000001 and eb_0001_000002,
so eb_0001_000003's left child is recorded as eb_0001_000001.
*/
Column[][] joinColumns = null;
if (joinNode.getJoinType() != JoinType.CROSS) {
// Shuffle keys must not include theta-join conditions because Tajo supports only equi-joins.
joinColumns = PlannerUtil.joinJoinKeyForEachTable(joinNode.getJoinQual(),
leftNode.getOutSchema(), rightNode.getOutSchema(), false);
}
if (leftUnion && !rightUnion) { // if only left is union
currentBlock = leftBlock;
context.execBlockMap.remove(leftNode.getPID());
Column[] shuffleKeys = (joinColumns != null) ? joinColumns[0] : null;
Column[] otherSideShuffleKeys = (joinColumns != null) ? joinColumns[1] : null;
buildJoinPlanWithUnionChannel(context, joinNode, currentBlock, leftBlock, rightBlock, leftNode,
shuffleKeys, otherSideShuffleKeys, true);
currentBlock.setPlan(joinNode);
} else if (!leftUnion && rightUnion) { // if only right is union
currentBlock = rightBlock;
context.execBlockMap.remove(rightNode.getPID());
Column[] shuffleKeys = (joinColumns != null) ? joinColumns[1] : null;
Column[] otherSideShuffleKeys = (joinColumns != null) ? joinColumns[0] : null;
buildJoinPlanWithUnionChannel(context, joinNode, currentBlock, rightBlock, leftBlock, rightNode,
shuffleKeys, otherSideShuffleKeys, false);
currentBlock.setPlan(joinNode);
} else { // if both are unions
currentBlock = leftBlock;
context.execBlockMap.remove(leftNode.getPID());
context.execBlockMap.remove(rightNode.getPID());
buildJoinPlanWithUnionChannel(context, joinNode, currentBlock, leftBlock, null, leftNode,
(joinColumns != null ? joinColumns[0] : null), null, true);
buildJoinPlanWithUnionChannel(context, joinNode, currentBlock, rightBlock, null, rightNode,
(joinColumns != null ? joinColumns[1] : null), null, false);
currentBlock.setPlan(joinNode);
}
return currentBlock;
} else {
// !leftUnion && !rightUnion
currentBlock = masterPlan.newExecutionBlock();
DataChannel leftChannel = createDataChannelFromJoin(leftBlock, rightBlock, currentBlock, joinNode, true);
DataChannel rightChannel = createDataChannelFromJoin(leftBlock, rightBlock, currentBlock, joinNode, false);
ScanNode leftScan = buildInputExecutor(masterPlan.getLogicalPlan(), leftChannel);
ScanNode rightScan = buildInputExecutor(masterPlan.getLogicalPlan(), rightChannel);
joinNode.setLeftChild(leftScan);
joinNode.setRightChild(rightScan);
currentBlock.setPlan(joinNode);
masterPlan.addConnect(leftChannel);
masterPlan.addConnect(rightChannel);
return currentBlock;
}
}
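/**
* Connects the child blocks of a union side directly to the join block by turning their outgoing
* channels into hash-shuffle channels, and connects the other (non-union) side through a new
* channel when it exists.
*/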
private void buildJoinPlanWithUnionChannel(GlobalPlanContext context, JoinNode joinNode,
ExecutionBlock targetBlock,
ExecutionBlock sourceBlock,
ExecutionBlock otherSideBlock,
LogicalNode childNode,
Column[] shuffleKeys,
Column[] otherSideShuffleKeys,
boolean left) {
MasterPlan masterPlan = context.getPlan();
String subQueryRelationName = ((TableSubQueryNode)childNode).getCanonicalName();
ExecutionBlockId dedicatedScanNodeBlock = null;
for (DataChannel channel : masterPlan.getIncomingChannels(sourceBlock.getId())) {
// If both children are unions and this is the right side, redirect the channel to the target (left) block
if (otherSideBlock == null && !left) {
DataChannel oldChannel = channel;
masterPlan.disconnect(oldChannel.getSrcId(), oldChannel.getTargetId());
channel = new DataChannel(oldChannel.getSrcId(), targetBlock.getId());
}
channel.setSchema(childNode.getOutSchema());
channel.setShuffleType(HASH_SHUFFLE);
channel.setShuffleOutputNum(32);
if (shuffleKeys != null) {
channel.setShuffleKeys(shuffleKeys);
}
ScanNode scanNode = buildInputExecutor(masterPlan.getLogicalPlan(), channel);
scanNode.getOutSchema().setQualifier(subQueryRelationName);
if (dedicatedScanNodeBlock == null) {
dedicatedScanNodeBlock = channel.getSrcId();
if (left) {
joinNode.setLeftChild(scanNode);
} else {
joinNode.setRightChild(scanNode);
}
}
masterPlan.addConnect(channel);
targetBlock.addUnionScan(channel.getSrcId(), dedicatedScanNodeBlock);
}
// create other side channel
if (otherSideBlock != null) {
DataChannel otherSideChannel = new DataChannel(otherSideBlock, targetBlock, HASH_SHUFFLE, 32);
otherSideChannel.setDataFormat(dataFormat);
if (otherSideShuffleKeys != null) {
otherSideChannel.setShuffleKeys(otherSideShuffleKeys);
}
masterPlan.addConnect(otherSideChannel);
ScanNode scan = buildInputExecutor(masterPlan.getLogicalPlan(), otherSideChannel);
if (left) {
joinNode.setRightChild(scan);
} else {
joinNode.setLeftChild(scan);
}
}
}
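// The following helpers look up built-in aggregation function descriptors (sum, count, max, min)
// from the catalog and wrap them in AggregationFunctionCallEval instances.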
private AggregationFunctionCallEval createSumFunction(EvalNode[] args) throws TajoException {
FunctionDesc functionDesc = getCatalog().getFunction("sum", CatalogProtos.FunctionType.AGGREGATION,
TypeConverter.convert(args[0].getValueType()).getDataType());
return new AggregationFunctionCallEval(functionDesc, args);
}
private AggregationFunctionCallEval createCountFunction(EvalNode [] args) throws TajoException {
FunctionDesc functionDesc = getCatalog().getFunction("count", CatalogProtos.FunctionType.AGGREGATION,
TypeConverter.convert(args[0].getValueType()).getDataType());
return new AggregationFunctionCallEval(functionDesc, args);
}
private AggregationFunctionCallEval createCountRowFunction(EvalNode[] args) throws TajoException {
FunctionDesc functionDesc = getCatalog().getFunction("count", CatalogProtos.FunctionType.AGGREGATION,
new TajoDataTypes.DataType[]{});
return new AggregationFunctionCallEval(functionDesc, args);
}
private AggregationFunctionCallEval createMaxFunction(EvalNode [] args) throws TajoException {
FunctionDesc functionDesc = getCatalog().getFunction("max", CatalogProtos.FunctionType.AGGREGATION,
TypeConverter.convert(args[0].getValueType()).getDataType());
return new AggregationFunctionCallEval(functionDesc, args);
}
private AggregationFunctionCallEval createMinFunction(EvalNode [] args) throws TajoException {
FunctionDesc functionDesc = getCatalog().getFunction("min", CatalogProtos.FunctionType.AGGREGATION,
TypeConverter.convert(args[0].getValueType()).getDataType());
return new AggregationFunctionCallEval(functionDesc, args);
}
/**
* It contains the transformed functions and their related data.
* Each non-distinct function is transformed into two functions, one for the first stage and one for the second.
*/
private static class RewrittenFunctions {
AggregationFunctionCallEval [] firstStageEvals;
List<Target> firstStageTargets;
AggregationFunctionCallEval secondStageEvals;
public RewrittenFunctions(int firstStageEvalNum) {
firstStageEvals = new AggregationFunctionCallEval[firstStageEvalNum];
firstStageTargets = new ArrayList<>();
}
}
/**
* Tajo uses three execution blocks for an aggregation operator that includes distinct aggregation
* functions. We call this approach <i><b>three-phase aggregation</b></i>.
*
* In this case, non-distinct set functions (e.g., <code>count(1), sum(col1)</code>) must be
* rewritten into other forms. The following example shows the rewriting of a query which includes
* distinct aggregation functions. Here, the <code>count(*)</code> function is transformed into two
* functions: count(*) in the inner query and sum() in the outer query.
*
* <h2>Original query</h2>
* <pre>
* SELECT
* grp1, grp2, count(*) as total, count(distinct grp3) as distinct_col
* from
* rel1
* group by
* grp1, grp2;
* </pre>
*
* <h2>Rewritten query</h2>
* <pre>
* SELECT grp1, grp2, sum(cnt) as total, count(grp3) as distinct_col from (
*   SELECT
*     grp1, grp2, grp3, count(*) as cnt
*   from
*     rel1
*   group by
*     grp1, grp2, grp3
* ) tmp1
* group by
*   grp1, grp2;
* </pre>
*
*
* The main objective of this method is to transform non-distinct aggregation functions for three-phase aggregation.
*/
private RewrittenFunctions rewriteAggFunctionsForDistinctAggregation(GlobalPlanContext context,
AggregationFunctionCallEval function)
throws TajoException {
LogicalPlan plan = context.plan.getLogicalPlan();
RewrittenFunctions rewritten = null;
if (function.getName().equalsIgnoreCase("count")) {
rewritten = new RewrittenFunctions(1);
if (function.getArgs().length == 0) {
rewritten.firstStageEvals[0] = createCountRowFunction(function.getArgs());
} else {
rewritten.firstStageEvals[0] = createCountFunction(function.getArgs());
}
String referenceName = plan.generateUniqueColumnName(rewritten.firstStageEvals[0]);
FieldEval fieldEval = new FieldEval(referenceName, rewritten.firstStageEvals[0].getValueType());
rewritten.firstStageTargets.add(0, new Target(fieldEval));
rewritten.secondStageEvals = createSumFunction(new EvalNode[]{fieldEval});
} else if (function.getName().equalsIgnoreCase("sum")) {
rewritten = new RewrittenFunctions(1);
rewritten.firstStageEvals[0] = createSumFunction(function.getArgs());
String referenceName = plan.generateUniqueColumnName(rewritten.firstStageEvals[0]);
FieldEval fieldEval = new FieldEval(referenceName, rewritten.firstStageEvals[0].getValueType());
rewritten.firstStageTargets.add(0, new Target(fieldEval));
rewritten.secondStageEvals = createSumFunction(new EvalNode[]{fieldEval});
} else if (function.getName().equals("max")) {
rewritten = new RewrittenFunctions(1);
rewritten.firstStageEvals[0] = createMaxFunction(function.getArgs());
String referenceName = plan.generateUniqueColumnName(rewritten.firstStageEvals[0]);
FieldEval fieldEval = new FieldEval(referenceName, rewritten.firstStageEvals[0].getValueType());
rewritten.firstStageTargets.add(0, new Target(fieldEval));
rewritten.secondStageEvals = createMaxFunction(new EvalNode[]{fieldEval});
} else if (function.getName().equals("min")) {
rewritten = new RewrittenFunctions(1);
rewritten.firstStageEvals[0] = createMinFunction(function.getArgs());
String referenceName = plan.generateUniqueColumnName(rewritten.firstStageEvals[0]);
FieldEval fieldEval = new FieldEval(referenceName, rewritten.firstStageEvals[0].getValueType());
rewritten.firstStageTargets.add(0, new Target(fieldEval));
rewritten.secondStageEvals = createMinFunction(new EvalNode[]{fieldEval});
} else {
throw new UnsupportedException("aggregation function '" + function.getName() + "' mixed with distinct aggregation");
}
return rewritten;
}
/**
* If there is at least one distinct aggregation function, the query works as if it were rewritten as follows:
*
* <h2>Original query</h2>
* <pre>
* SELECT
* grp1, grp2, count(*) as total, count(distinct grp3) as distinct_col
* from
* rel1
* group by
* grp1, grp2;
* </pre>
*
* The query will work as if the query is rewritten into two queries as follows:
*
* <h2>Rewritten query</h2>
* <pre>
* SELECT grp1, grp2, sum(cnt) as total, count(grp3) as distinct_col from (
*   SELECT
*     grp1, grp2, grp3, count(*) as cnt
*   from
*     rel1
*   group by
*     grp1, grp2, grp3
* ) tmp1
* group by
*   grp1, grp2;
* </pre>
*
* In more detail, the first aggregation groups by not only the original grouping fields but also the
* distinct columns, and non-distinct aggregation functions are transformed into appropriate
* partial functions. Then, the second aggregation groups by only the original grouping fields,
* applying the distinct aggregation functions and the transformed non-distinct functions.
*
* As a result, while a non-distinct aggregation requires two execution blocks, a distinct
* aggregation requires three.
*/
private ExecutionBlock buildGroupByIncludingDistinctFunctionsMultiStage(GlobalPlanContext context,
ExecutionBlock latestExecBlock,
GroupbyNode groupbyNode) throws TajoException {
Column [] originalGroupingColumns = groupbyNode.getGroupingColumns();
LinkedHashSet<Column> firstStageGroupingColumns =
Sets.newLinkedHashSet(Arrays.asList(groupbyNode.getGroupingColumns()));
List<AggregationFunctionCallEval> firstStageAggFunctions = Lists.newArrayList();
List<AggregationFunctionCallEval> secondPhaseEvalNodes = Lists.newArrayList();
List<Target> firstPhaseEvalNodeTargets = Lists.newArrayList();
for (AggregationFunctionCallEval aggFunction : groupbyNode.getAggFunctions()) {
if (aggFunction.isDistinct()) {
// add distinct columns to first stage's grouping columns
firstStageGroupingColumns.addAll(EvalTreeUtil.findUniqueColumns(aggFunction));
// keep distinct aggregation functions for the second stage
secondPhaseEvalNodes.add(aggFunction);
} else {
// Rewrite non-distinct aggregation functions
RewrittenFunctions rewritten = rewriteAggFunctionsForDistinctAggregation(context, aggFunction);
firstStageAggFunctions.addAll(Lists.newArrayList(rewritten.firstStageEvals));
firstPhaseEvalNodeTargets.addAll(Lists.newArrayList(rewritten.firstStageTargets));
// keep rewritten non-distinct aggregation functions for the second stage
secondPhaseEvalNodes.add(rewritten.secondStageEvals);
}
}
List<Target> firstStageTargets = new ArrayList<>();
for (Column column : firstStageGroupingColumns) {
firstStageTargets.add(new Target(new FieldEval(column)));
}
for (Target target : firstPhaseEvalNodeTargets) {
firstStageTargets.add(target);
}
// Create the groupby node for the first stage and set all necessary descriptions
GroupbyNode firstStageGroupby = new GroupbyNode(context.plan.getLogicalPlan().newPID());
firstStageGroupby.setGroupingColumns(TUtil.toArray(firstStageGroupingColumns, Column.class));
firstStageGroupby.setAggFunctions(firstStageAggFunctions);
firstStageGroupby.setTargets(firstStageTargets);
firstStageGroupby.setChild(groupbyNode.getChild());
firstStageGroupby.setInSchema(groupbyNode.getInSchema());
// Makes two execution blocks for the first stage
ExecutionBlock firstStage = buildGroupBy(context, latestExecBlock, firstStageGroupby);
// Create the groupby node for the second stage.
GroupbyNode secondPhaseGroupby = new GroupbyNode(context.plan.getLogicalPlan().newPID());
secondPhaseGroupby.setGroupingColumns(originalGroupingColumns);
secondPhaseGroupby.setAggFunctions(secondPhaseEvalNodes);
secondPhaseGroupby.setTargets(groupbyNode.getTargets());
ExecutionBlock secondStage = context.plan.newExecutionBlock();
secondStage.setPlan(secondPhaseGroupby);
SortSpec [] sortSpecs = PlannerUtil.columnsToSortSpecs(firstStageGroupingColumns);
secondStage.getEnforcer().enforceSortAggregation(secondPhaseGroupby.getPID(), sortSpecs);
// Create a data channel between the first and second stages
DataChannel channel;
channel = new DataChannel(firstStage, secondStage, HASH_SHUFFLE, 32);
channel.setShuffleKeys(secondPhaseGroupby.getGroupingColumns().clone());
channel.setSchema(firstStage.getPlan().getOutSchema());
channel.setDataFormat(dataFormat);
// Setting for the second phase's logical plan
ScanNode scanNode = buildInputExecutor(context.plan.getLogicalPlan(), channel);
secondPhaseGroupby.setChild(scanNode);
secondPhaseGroupby.setInSchema(scanNode.getOutSchema());
secondStage.setPlan(secondPhaseGroupby);
context.plan.addConnect(channel);
return secondStage;
}
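/**
* Builds the execution blocks for an aggregation. Group-bys with distinct aggregation functions
* are delegated to {@link DistinctGroupbyBuilder}; otherwise, the group-by is split into
* first-phase and second-phase operators.
*/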
private ExecutionBlock buildGroupBy(GlobalPlanContext context, ExecutionBlock lastBlock,
GroupbyNode groupbyNode) throws TajoException {
MasterPlan masterPlan = context.plan;
ExecutionBlock currentBlock;
if (groupbyNode.isDistinct()) { // if there is at least one distinct aggregation function
boolean multiLevelEnabled = context.getPlan().getContext().getBool(SessionVars.GROUPBY_MULTI_LEVEL_ENABLED);
if (multiLevelEnabled) {
if (PlannerUtil.findTopNode(groupbyNode, NodeType.UNION) == null) {
DistinctGroupbyBuilder builder = new DistinctGroupbyBuilder(this);
return builder.buildMultiLevelPlan(context, lastBlock, groupbyNode);
} else {
DistinctGroupbyBuilder builder = new DistinctGroupbyBuilder(this);
return builder.buildPlan(context, lastBlock, groupbyNode);
}
} else {
DistinctGroupbyBuilder builder = new DistinctGroupbyBuilder(this);
return builder.buildPlan(context, lastBlock, groupbyNode);
}
} else {
GroupbyNode firstPhaseGroupby = createFirstPhaseGroupBy(masterPlan.getLogicalPlan(), groupbyNode);
if (hasUnionChild(firstPhaseGroupby)) {
currentBlock = buildGroupbyAndUnionPlan(masterPlan, lastBlock, firstPhaseGroupby, groupbyNode);
} else {
// general hash-shuffled aggregation
currentBlock = buildTwoPhaseGroupby(masterPlan, lastBlock, firstPhaseGroupby, groupbyNode);
}
}
return currentBlock;
}
public static boolean hasUnionChild(UnaryNode node) {
// there are three cases:
//
// The first case is:
//
// create table [tbname] as select * from ( select ... UNION select ...) T
//
// We can generalize this case as 'a store operator on the top of union'.
// In this case, a store operator determines a shuffle method.
//
// The second case is:
//
// select avg(..) from (select ... UNION select ) T
//
// We can generalize this case as 'a shuffle required operator on the top of union'.
//
// The third case is:
//
// create table [tbname] as select * from ( select ... ) a union all select * from ( select ... ) b
LogicalNode childNode = node.getChild();
if (childNode instanceof UnaryNode) { // first case
UnaryNode child = (UnaryNode) childNode;
if (child.getChild().getType() == NodeType.PROJECTION) {
child = child.getChild();
}
if (child.getChild().getType() == NodeType.TABLE_SUBQUERY) {
TableSubQueryNode tableSubQuery = child.getChild();
return tableSubQuery.getSubQuery().getType() == NodeType.UNION;
}
} else if (childNode.getType() == NodeType.TABLE_SUBQUERY) { // second case
TableSubQueryNode tableSubQuery = node.getChild();
return tableSubQuery.getSubQuery().getType() == NodeType.UNION;
} else if (childNode.getType() == NodeType.UNION) { // third case
return true;
}
return false;
}
private ExecutionBlock buildGroupbyAndUnionPlan(MasterPlan masterPlan, ExecutionBlock lastBlock,
GroupbyNode firstPhaseGroupBy, GroupbyNode secondPhaseGroupBy) throws TajoException {
DataChannel lastDataChannel = null;
// It pushes down the first phase group-by operator into all child blocks.
//
// (second phase)     G       (currentBlock)
//                  / | | \
// (first phase)   G  G G  G  (child blocks)
//
// The blocks are already connected to one another, so we don't need to connect them again.
for (DataChannel dataChannel : masterPlan.getIncomingChannels(lastBlock.getId())) {
if (firstPhaseGroupBy.isEmptyGrouping()) {
dataChannel.setShuffle(HASH_SHUFFLE, firstPhaseGroupBy.getGroupingColumns(), 1);
} else {
dataChannel.setShuffle(HASH_SHUFFLE, firstPhaseGroupBy.getGroupingColumns(), 32);
}
dataChannel.setSchema(firstPhaseGroupBy.getOutSchema());
ExecutionBlock childBlock = masterPlan.getExecBlock(dataChannel.getSrcId());
// Why must firstPhaseGroupby be copied?
//
// The groupby in each execution block can have a different child, which affects
// the groupby's input schema.
GroupbyNode firstPhaseGroupbyCopy = PlannerUtil.clone(masterPlan.getLogicalPlan(), firstPhaseGroupBy);
firstPhaseGroupbyCopy.setChild(childBlock.getPlan());
childBlock.setPlan(firstPhaseGroupbyCopy);
// just keep the last data channel.
lastDataChannel = dataChannel;
}
ScanNode scanNode = buildInputExecutor(masterPlan.getLogicalPlan(), lastDataChannel);
secondPhaseGroupBy.setChild(scanNode);
lastBlock.setPlan(secondPhaseGroupBy);
return lastBlock;
}
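/**
* Creates the standard two-phase aggregation: the first-phase group-by runs in the child block,
* and the second-phase group-by runs in a new block fed by a hash shuffle on the grouping keys.
* For illustration, a simple query such as <code>select grp1, count(*) from rel1 group by grp1</code>
* roughly yields:
* <pre>
* eb_2 (second phase: final count, group by grp1)
*   |- hash shuffle on grp1
* eb_1 (first phase: partial count over rel1, group by grp1)
* </pre>
*/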
private ExecutionBlock buildTwoPhaseGroupby(MasterPlan masterPlan, ExecutionBlock latestBlock,
GroupbyNode firstPhaseGroupby, GroupbyNode secondPhaseGroupby) throws TajoException {
ExecutionBlock childBlock = latestBlock;
childBlock.setPlan(firstPhaseGroupby);
ExecutionBlock currentBlock = masterPlan.newExecutionBlock();
DataChannel channel;
if (firstPhaseGroupby.isEmptyGrouping()) {
channel = new DataChannel(childBlock, currentBlock, HASH_SHUFFLE, 1);
channel.setShuffleKeys(firstPhaseGroupby.getGroupingColumns());
} else {
channel = new DataChannel(childBlock, currentBlock, HASH_SHUFFLE, 32);
channel.setShuffleKeys(firstPhaseGroupby.getGroupingColumns());
}
channel.setSchema(firstPhaseGroupby.getOutSchema());
channel.setDataFormat(dataFormat);
ScanNode scanNode = buildInputExecutor(masterPlan.getLogicalPlan(), channel);
secondPhaseGroupby.setChild(scanNode);
secondPhaseGroupby.setInSchema(scanNode.getOutSchema());
currentBlock.setPlan(secondPhaseGroupby);
masterPlan.addConnect(channel);
return currentBlock;
}
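/**
* Clones the given group-by as the first-phase operator and rewrites the given node in place into
* the second phase: each second-phase function is marked as the last phase and takes the
* corresponding first-phase result field as its sole argument.
*/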
public static GroupbyNode createFirstPhaseGroupBy(LogicalPlan plan, GroupbyNode groupBy) {
Preconditions.checkNotNull(groupBy);
GroupbyNode firstPhaseGroupBy = PlannerUtil.clone(plan, groupBy);
GroupbyNode secondPhaseGroupBy = groupBy;
// Set first phase expressions
if (secondPhaseGroupBy.hasAggFunctions()) {
int evalNum = secondPhaseGroupBy.getAggFunctions().size();
List<AggregationFunctionCallEval> secondPhaseEvals = secondPhaseGroupBy.getAggFunctions();
List<AggregationFunctionCallEval> firstPhaseEvals = new ArrayList<>();
String [] firstPhaseEvalNames = new String[evalNum];
for (int i = 0; i < evalNum; i++) {
try {
firstPhaseEvals.add((AggregationFunctionCallEval) secondPhaseEvals.get(i).clone());
} catch (CloneNotSupportedException e) {
throw new RuntimeException(e);
}
firstPhaseEvals.get(i).setFirstPhase();
firstPhaseEvalNames[i] = plan.generateUniqueColumnName(firstPhaseEvals.get(i));
FieldEval param = new FieldEval(firstPhaseEvalNames[i], firstPhaseEvals.get(i).getValueType());
secondPhaseEvals.get(i).setLastPhase();
secondPhaseEvals.get(i).setArgs(new EvalNode[]{param});
}
secondPhaseGroupBy.setAggFunctions(secondPhaseEvals);
firstPhaseGroupBy.setAggFunctions(firstPhaseEvals);
List<Target> firstPhaseTargets = ProjectionPushDownRule.buildGroupByTarget(firstPhaseGroupBy, null,
firstPhaseEvalNames);
firstPhaseGroupBy.setTargets(firstPhaseTargets);
secondPhaseGroupBy.setInSchema(PlannerUtil.targetToSchema(firstPhaseTargets));
}
return firstPhaseGroupBy;
}
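/**
* Builds a two-phase sort connected by a range shuffle. If the child is a union, the first-phase
* sort is pushed down into each union child block; otherwise, it is appended to the child block
* and a new block performs the final sort over the shuffled input.
*/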
private ExecutionBlock buildSortPlan(GlobalPlanContext context, ExecutionBlock childBlock, SortNode currentNode) throws TajoException {
MasterPlan masterPlan = context.plan;
ExecutionBlock currentBlock;
SortNode firstSortNode = PlannerUtil.clone(context.plan.getLogicalPlan(), currentNode);
if (firstSortNode.getChild().getType() == NodeType.TABLE_SUBQUERY &&
((TableSubQueryNode)firstSortNode.getChild()).getSubQuery().getType() == NodeType.UNION) {
currentBlock = childBlock;
for (DataChannel channel : masterPlan.getIncomingChannels(childBlock.getId())) {
channel.setShuffle(RANGE_SHUFFLE, PlannerUtil.sortSpecsToSchema(currentNode.getSortKeys()).toArray(), 32);
channel.setSchema(firstSortNode.getOutSchema());
ExecutionBlock subBlock = masterPlan.getExecBlock(channel.getSrcId());
SortNode s1 = PlannerUtil.clone(context.plan.getLogicalPlan(), firstSortNode);
s1.setChild(subBlock.getPlan());
subBlock.setPlan(s1);
ScanNode secondScan = buildInputExecutor(masterPlan.getLogicalPlan(), channel);
currentNode.setChild(secondScan);
currentNode.setInSchema(secondScan.getOutSchema());
currentBlock.setPlan(currentNode);
currentBlock.getEnforcer().addSortedInput(secondScan.getTableName(), currentNode.getSortKeys());
}
} else {
LogicalNode childBlockPlan = childBlock.getPlan();
firstSortNode.setChild(childBlockPlan);
// Sort is a non-projectable operator, so its in/out schemas are the same as its child's output schema.
firstSortNode.setInSchema(childBlockPlan.getOutSchema());
firstSortNode.setOutSchema(childBlockPlan.getOutSchema());
childBlock.setPlan(firstSortNode);
currentBlock = masterPlan.newExecutionBlock();
DataChannel channel = new DataChannel(childBlock, currentBlock, RANGE_SHUFFLE, 32);
channel.setShuffleKeys(PlannerUtil.sortSpecsToSchema(currentNode.getSortKeys()).toArray());
channel.setSchema(firstSortNode.getOutSchema());
ScanNode secondScan = buildInputExecutor(masterPlan.getLogicalPlan(), channel);
currentNode.setChild(secondScan);
currentNode.setInSchema(secondScan.getOutSchema());
currentBlock.setPlan(currentNode);
currentBlock.getEnforcer().addSortedInput(secondScan.getTableName(), currentNode.getSortKeys());
masterPlan.addConnect(channel);
}
return currentBlock;
}
/**
* It builds a distributed execution block for CTAS, InsertNode, and StoreTableNode.
*/
private ExecutionBlock buildStorePlan(GlobalPlanContext context,
ExecutionBlock lastBlock,
StoreTableNode currentNode) throws TajoException {
if (currentNode.hasPartition()) { // if the target table is a partitioned table
// Verify supported partition types
PartitionMethodDesc partitionMethod = currentNode.getPartitionMethod();
if (partitionMethod.getPartitionType() != CatalogProtos.PartitionType.COLUMN) {
throw new NotImplementedException("partition type '" + partitionMethod.getPartitionType().name() + "'");
}
if (hasUnionChild(currentNode)) { // if it has union children
return buildShuffleAndStorePlanToPartitionedTableWithUnion(context, currentNode, lastBlock);
} else { // otherwise
return buildShuffleAndStorePlanToPartitionedTable(context, currentNode, lastBlock);
}
} else { // if result table is not a partitioned table, directly store it
return buildNoPartitionedStorePlan(context, currentNode, lastBlock);
}
}
/**
* It makes a plan that stores the results of union plans directly into a non-partitioned table.
*/
private ExecutionBlock buildShuffleAndStorePlanNoPartitionedTableWithUnion(GlobalPlanContext context,
StoreTableNode currentNode,
ExecutionBlock childBlock) throws TajoException {
for (ExecutionBlock grandChildBlock : context.plan.getChilds(childBlock)) {
StoreTableNode copy = PlannerUtil.clone(context.plan.getLogicalPlan(), currentNode);
copy.setChild(grandChildBlock.getPlan());
grandChildBlock.setPlan(copy);
}
return childBlock;
}
/**
* It inserts a shuffle and adds a store plan for a partitioned table,
* and it pushes those plans down into the child unions.
*/
private ExecutionBlock buildShuffleAndStorePlanToPartitionedTableWithUnion(GlobalPlanContext context,
StoreTableNode currentNode,
ExecutionBlock lastBlock) throws TajoException {
MasterPlan masterPlan = context.plan;
DataChannel lastChannel = null;
for (DataChannel channel : masterPlan.getIncomingChannels(lastBlock.getId())) {
ExecutionBlock childBlock = masterPlan.getExecBlock(channel.getSrcId());
setShuffleKeysFromPartitionedTableStore(currentNode, channel);
channel.setSchema(childBlock.getPlan().getOutSchema());
channel.setDataFormat(dataFormat);
lastChannel = channel;
}
ScanNode scanNode = buildInputExecutor(masterPlan.getLogicalPlan(), lastChannel);
currentNode.setChild(scanNode);
currentNode.setInSchema(scanNode.getOutSchema());
lastBlock.setPlan(currentNode);
return lastBlock;
}
/**
* It inserts a shuffle and adds a store plan for a partitioned table.
*/
private ExecutionBlock buildShuffleAndStorePlanToPartitionedTable(GlobalPlanContext context,
StoreTableNode currentNode,
ExecutionBlock lastBlock) throws TajoException {
MasterPlan masterPlan = context.plan;
ExecutionBlock nextBlock = masterPlan.newExecutionBlock();
DataChannel channel = new DataChannel(lastBlock, nextBlock, HASH_SHUFFLE, 32);
setShuffleKeysFromPartitionedTableStore(currentNode, channel);
channel.setSchema(lastBlock.getPlan().getOutSchema());
channel.setDataFormat(dataFormat);
ScanNode scanNode = buildInputExecutor(masterPlan.getLogicalPlan(), channel);
currentNode.setChild(scanNode);
currentNode.setInSchema(scanNode.getOutSchema());
nextBlock.setPlan(currentNode);
masterPlan.addConnect(channel);
return nextBlock;
}
private ExecutionBlock buildNoPartitionedStorePlan(GlobalPlanContext context,
StoreTableNode currentNode,
ExecutionBlock childBlock) throws TajoException {
if (hasUnionChild(currentNode)) { // when the child below is a union
return buildShuffleAndStorePlanNoPartitionedTableWithUnion(context, currentNode, childBlock);
} else {
currentNode.setChild(childBlock.getPlan());
currentNode.setInSchema(childBlock.getPlan().getOutSchema());
childBlock.setPlan(currentNode);
return childBlock;
}
}
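/**
* Derives the shuffle keys of the given channel from the partition columns of the target table.
* For INSERT and CREATE TABLE, the partition columns are mapped into the projected schema and a
* scattered hash shuffle is used; otherwise, a plain hash shuffle on the partition columns is used.
*/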
private void setShuffleKeysFromPartitionedTableStore(StoreTableNode node, DataChannel channel) {
Preconditions.checkState(node.hasTargetTable(), "A target table must be a partitioned table.");
PartitionMethodDesc partitionMethod = node.getPartitionMethod();
if (node.getType() == NodeType.INSERT || node.getType() == NodeType.CREATE_TABLE) {
Schema tableSchema = null, projectedSchema = null;
if (node.getType() == NodeType.INSERT) {
tableSchema = ((InsertNode) node).getTableSchema();
projectedSchema = ((InsertNode) node).getProjectedSchema();
} else {
tableSchema = node.getOutSchema();
projectedSchema = node.getInSchema();
}
channel.setSchema(projectedSchema);
Column[] shuffleKeys = new Column[partitionMethod.getExpressionSchema().size()];
int i = 0, id = 0;
for (Column column : partitionMethod.getExpressionSchema().getRootColumns()) {
if (node.getType() == NodeType.INSERT) {
id = tableSchema.getColumnId(column.getQualifiedName());
} else {
id = tableSchema.getRootColumns().size() + i;
}
shuffleKeys[i++] = projectedSchema.getColumn(id);
}
channel.setShuffleKeys(shuffleKeys);
channel.setShuffleType(SCATTERED_HASH_SHUFFLE);
} else {
channel.setShuffleKeys(partitionMethod.getExpressionSchema().toArray());
channel.setShuffleType(HASH_SHUFFLE);
}
channel.setShuffleOutputNum(32);
}
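/**
* Visits each logical node bottom-up and carves the logical plan into execution blocks.
* The context's execBlockMap tracks the execution block that produces each visited node's output.
*/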
public class DistributedPlannerVisitor extends BasicLogicalPlanVisitor<GlobalPlanContext, LogicalNode> {
@Override
public LogicalNode visitRoot(GlobalPlanContext context, LogicalPlan plan, LogicalPlan.QueryBlock block,
LogicalRootNode node, Stack<LogicalNode> stack) throws TajoException {
return super.visitRoot(context, plan, block, node, stack);
}
@Override
public LogicalNode visitProjection(GlobalPlanContext context, LogicalPlan plan, LogicalPlan.QueryBlock block,
ProjectionNode node, Stack<LogicalNode> stack) throws TajoException {
LogicalNode child = super.visitProjection(context, plan, block, node, stack);
ExecutionBlock execBlock = context.execBlockMap.remove(child.getPID());
if (child.getType() == NodeType.TABLE_SUBQUERY &&
((TableSubQueryNode)child).getSubQuery().getType() == NodeType.UNION) {
MasterPlan masterPlan = context.plan;
for (DataChannel dataChannel : masterPlan.getIncomingChannels(execBlock.getId())) {
dataChannel.setDataFormat(finalOutputDataFormat);
ExecutionBlock subBlock = masterPlan.getExecBlock(dataChannel.getSrcId());
ProjectionNode copy = PlannerUtil.clone(plan, node);
copy.setChild(subBlock.getPlan());
subBlock.setPlan(copy);
}
execBlock.setPlan(null);
} else {
node.setChild(execBlock.getPlan());
node.setInSchema(execBlock.getPlan().getOutSchema());
execBlock.setPlan(node);
}
context.execBlockMap.put(node.getPID(), execBlock);
return node;
}
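/**
* If the child is a sort, the limit is duplicated into both sort stages, and the channel between
* them is narrowed to a single output. Otherwise, a new single-partition block with its own limit
* is appended after the child block.
*/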
@Override
public LogicalNode visitLimit(GlobalPlanContext context, LogicalPlan plan, LogicalPlan.QueryBlock block,
LimitNode node, Stack<LogicalNode> stack) throws TajoException {
LogicalNode child = super.visitLimit(context, plan, block, node, stack);
ExecutionBlock execBlock;
execBlock = context.execBlockMap.remove(child.getPID());
if (child.getType() == NodeType.SORT) {
node.setChild(execBlock.getPlan());
execBlock.setPlan(node);
ExecutionBlock childBlock = context.plan.getChild(execBlock, 0);
LimitNode childLimit = PlannerUtil.clone(context.plan.getLogicalPlan(), node);
childLimit.setChild(childBlock.getPlan());
childBlock.setPlan(childLimit);
DataChannel channel = context.plan.getChannel(childBlock, execBlock);
channel.setShuffleOutputNum(1);
context.execBlockMap.put(node.getPID(), execBlock);
} else {
node.setChild(execBlock.getPlan());
execBlock.setPlan(node);
ExecutionBlock newExecBlock = context.plan.newExecutionBlock();
DataChannel newChannel = new DataChannel(execBlock, newExecBlock, HASH_SHUFFLE, 1);
newChannel.setShuffleKeys(new Column[]{});
newChannel.setSchema(node.getOutSchema());
newChannel.setDataFormat(dataFormat);
ScanNode scanNode = buildInputExecutor(plan, newChannel);
LimitNode parentLimit = PlannerUtil.clone(context.plan.getLogicalPlan(), node);
parentLimit.setChild(scanNode);
newExecBlock.setPlan(parentLimit);
context.plan.addConnect(newChannel);
context.execBlockMap.put(parentLimit.getPID(), newExecBlock);
node = parentLimit;
}
return node;
}
@Override
public LogicalNode visitSort(GlobalPlanContext context, LogicalPlan plan, LogicalPlan.QueryBlock block,
SortNode node, Stack<LogicalNode> stack) throws TajoException {
LogicalNode child = super.visitSort(context, plan, block, node, stack);
ExecutionBlock childBlock = context.execBlockMap.remove(child.getPID());
ExecutionBlock newExecBlock = buildSortPlan(context, childBlock, node);
context.execBlockMap.put(node.getPID(), newExecBlock);
return node;
}
@Override
public LogicalNode visitHaving(GlobalPlanContext context, LogicalPlan plan, LogicalPlan.QueryBlock block,
HavingNode node, Stack<LogicalNode> stack) throws TajoException {
LogicalNode child = super.visitHaving(context, plan, block, node, stack);
// Don't separate execution block. Having is pushed to the second grouping execution block.
ExecutionBlock childBlock = context.execBlockMap.remove(child.getPID());
node.setChild(childBlock.getPlan());
childBlock.setPlan(node);
context.execBlockMap.put(node.getPID(), childBlock);
return node;
}
@Override
public LogicalNode visitWindowAgg(GlobalPlanContext context, LogicalPlan plan, LogicalPlan.QueryBlock block,
WindowAggNode node, Stack<LogicalNode> stack) throws TajoException {
LogicalNode child = super.visitWindowAgg(context, plan, block, node, stack);
ExecutionBlock childBlock = context.execBlockMap.remove(child.getPID());
ExecutionBlock newExecBlock = buildWindowAgg(context, childBlock, node);
context.execBlockMap.put(newExecBlock.getPlan().getPID(), newExecBlock);
return newExecBlock.getPlan();
}
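/**
* Builds the window aggregation block. A window with partition keys uses a range shuffle on those
* keys and pushes a sort on them down into the child block; a window without partition keys is
* gathered into a single partition.
*/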
private ExecutionBlock buildWindowAgg(GlobalPlanContext context, ExecutionBlock lastBlock,
WindowAggNode windowAgg) throws TajoException {
MasterPlan masterPlan = context.plan;
ExecutionBlock childBlock = lastBlock;
ExecutionBlock currentBlock = masterPlan.newExecutionBlock();
DataChannel channel;
if (windowAgg.hasPartitionKeys()) { // if the window function has partition keys
channel = new DataChannel(childBlock, currentBlock, RANGE_SHUFFLE, 32);
channel.setShuffleKeys(windowAgg.getPartitionKeys());
} else {
channel = new DataChannel(childBlock, currentBlock, HASH_SHUFFLE, 1);
channel.setShuffleKeys(null);
}
channel.setSchema(windowAgg.getInSchema());
channel.setDataFormat(dataFormat);
LogicalNode childNode = windowAgg.getChild();
ScanNode scanNode = buildInputExecutor(masterPlan.getLogicalPlan(), channel);
if (windowAgg.hasPartitionKeys()) {
SortNode sortNode = masterPlan.getLogicalPlan().createNode(SortNode.class);
sortNode.setOutSchema(scanNode.getOutSchema());
sortNode.setInSchema(scanNode.getOutSchema());
sortNode.setSortSpecs(PlannerUtil.columnsToSortSpecs(windowAgg.getPartitionKeys()));
sortNode.setChild(childNode);
childBlock.setPlan(sortNode);
windowAgg.setChild(scanNode);
} else {
windowAgg.setInSchema(scanNode.getOutSchema());
windowAgg.setChild(scanNode);
childBlock.setPlan(childNode);
}
currentBlock.setPlan(windowAgg);
context.plan.addConnect(channel);
return currentBlock;
}
@Override
public LogicalNode visitGroupBy(GlobalPlanContext context, LogicalPlan plan, LogicalPlan.QueryBlock block,
GroupbyNode node, Stack<LogicalNode> stack) throws TajoException {
LogicalNode child = super.visitGroupBy(context, plan, block, node, stack);
ExecutionBlock childBlock = context.execBlockMap.remove(child.getPID());
ExecutionBlock newExecBlock = buildGroupBy(context, childBlock, node);
context.execBlockMap.put(newExecBlock.getPlan().getPID(), newExecBlock);
return newExecBlock.getPlan();
}
@Override
public LogicalNode visitFilter(GlobalPlanContext context, LogicalPlan plan, LogicalPlan.QueryBlock block,
SelectionNode node, Stack<LogicalNode> stack) throws TajoException {
LogicalNode child = super.visitFilter(context, plan, block, node, stack);
ExecutionBlock execBlock = context.execBlockMap.remove(child.getPID());
node.setChild(execBlock.getPlan());
node.setInSchema(execBlock.getPlan().getOutSchema());
execBlock.setPlan(node);
context.execBlockMap.put(node.getPID(), execBlock);
return node;
}
@Override
public LogicalNode visitJoin(GlobalPlanContext context, LogicalPlan plan, LogicalPlan.QueryBlock block,
JoinNode node, Stack<LogicalNode> stack) throws TajoException {
LogicalNode leftChild = visit(context, plan, block, node.getLeftChild(), stack);
ExecutionBlock leftChildBlock = context.execBlockMap.get(leftChild.getPID());
LogicalNode rightChild = visit(context, plan, block, node.getRightChild(), stack);
ExecutionBlock rightChildBlock = context.execBlockMap.get(rightChild.getPID());
if (node.getJoinType() == JoinType.LEFT_OUTER) {
leftChildBlock.setPreservedRow();
rightChildBlock.setNullSuppllying();
} else if (node.getJoinType() == JoinType.RIGHT_OUTER) {
leftChildBlock.setNullSuppllying();
rightChildBlock.setPreservedRow();
} else if (node.getJoinType() == JoinType.FULL_OUTER) {
leftChildBlock.setPreservedRow();
leftChildBlock.setNullSuppllying();
rightChildBlock.setPreservedRow();
rightChildBlock.setNullSuppllying();
}
ExecutionBlock newExecBlock = buildJoinPlan(context, node, leftChildBlock, rightChildBlock);
context.execBlockMap.put(node.getPID(), newExecBlock);
return node;
}
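/**
* Flattens nested unions: the child blocks of union children are disconnected from their union
* blocks and reconnected directly to this union's execution block through NONE_SHUFFLE channels.
*/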
@Override
public LogicalNode visitUnion(GlobalPlanContext context, LogicalPlan plan, LogicalPlan.QueryBlock queryBlock,
UnionNode node, Stack<LogicalNode> stack) throws TajoException {
stack.push(node);
LogicalPlan.QueryBlock leftQueryBlock = plan.getBlock(node.getLeftChild());
LogicalNode leftChild = visit(context, plan, leftQueryBlock, leftQueryBlock.getRoot(), stack);
LogicalPlan.QueryBlock rightQueryBlock = plan.getBlock(node.getRightChild());
LogicalNode rightChild = visit(context, plan, rightQueryBlock, rightQueryBlock.getRoot(), stack);
stack.pop();
MasterPlan masterPlan = context.getPlan();
List<ExecutionBlock> unionBlocks = Lists.newArrayList();
List<ExecutionBlock> queryBlockBlocks = Lists.newArrayList();
ExecutionBlock leftBlock = context.execBlockMap.remove(leftChild.getPID());
ExecutionBlock rightBlock = context.execBlockMap.remove(rightChild.getPID());
// For these union types, unnecessary nodes between the parent and child nodes of the query tree must be eliminated.
boolean leftUnion = (leftChild.getType() == NodeType.UNION) ||
((leftChild.getType() == NodeType.TABLE_SUBQUERY) &&
(((TableSubQueryNode)leftChild).getSubQuery().getType() == NodeType.UNION));
boolean rightUnion = (rightChild.getType() == NodeType.UNION) ||
(rightChild.getType() == NodeType.TABLE_SUBQUERY) &&
(((TableSubQueryNode)rightChild).getSubQuery().getType() == NodeType.UNION);
if (leftUnion) {
unionBlocks.add(leftBlock);
} else {
queryBlockBlocks.add(leftBlock);
}
if (rightUnion) {
unionBlocks.add(rightBlock);
} else {
queryBlockBlocks.add(rightBlock);
}
ExecutionBlock execBlock;
if (unionBlocks.size() == 0) {
execBlock = context.plan.newExecutionBlock();
} else {
execBlock = unionBlocks.get(0);
}
for (ExecutionBlock childBlocks : unionBlocks) {
for (ExecutionBlock grandChildBlock : masterPlan.getChilds(childBlocks)) {
masterPlan.disconnect(grandChildBlock, childBlocks);
queryBlockBlocks.add(grandChildBlock);
}
}
for (ExecutionBlock childBlocks : queryBlockBlocks) {
DataChannel channel = new DataChannel(childBlocks, execBlock, NONE_SHUFFLE, 1);
channel.setDataFormat(dataFormat);
masterPlan.addConnect(channel);
}
context.execBlockMap.put(node.getPID(), execBlock);
return node;
}
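/**
* Keeps the given unary node in its child's execution block instead of creating a new block.
*/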
private LogicalNode handleUnaryNode(GlobalPlanContext context, LogicalNode child, LogicalNode node)
throws TajoException {
ExecutionBlock execBlock = context.execBlockMap.remove(child.getPID());
execBlock.setPlan(node);
context.execBlockMap.put(node.getPID(), execBlock);
return node;
}
@Override
public LogicalNode visitExcept(GlobalPlanContext context, LogicalPlan plan, LogicalPlan.QueryBlock queryBlock,
ExceptNode node, Stack<LogicalNode> stack) throws TajoException {
LogicalNode child = super.visitExcept(context, plan, queryBlock, node, stack);
return handleUnaryNode(context, child, node);
}
@Override
public LogicalNode visitIntersect(GlobalPlanContext context, LogicalPlan plan, LogicalPlan.QueryBlock queryBlock,
IntersectNode node, Stack<LogicalNode> stack) throws TajoException {
LogicalNode child = super.visitIntersect(context, plan, queryBlock, node, stack);
return handleUnaryNode(context, child, node);
}
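/**
* When the subquery is a union, the table subquery node is cloned into each union child block, and
* the target column references of every clone are remapped against the leftmost clone whose input
* schema covers all of its target columns.
*/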
@Override
public LogicalNode visitTableSubQuery(GlobalPlanContext context, LogicalPlan plan,
LogicalPlan.QueryBlock queryBlock,
TableSubQueryNode node, Stack<LogicalNode> stack) throws TajoException {
LogicalNode child = super.visitTableSubQuery(context, plan, queryBlock, node, stack);
node.setSubQuery(child);
ExecutionBlock currentBlock = context.execBlockMap.remove(child.getPID());
if (child.getType() == NodeType.UNION) {
List<TableSubQueryNode> addedTableSubQueries = new ArrayList<>();
TableSubQueryNode leftMostSubQueryNode = null;
for (ExecutionBlock childBlock : context.plan.getChilds(currentBlock.getId())) {
TableSubQueryNode copy = PlannerUtil.clone(plan, node);
copy.setSubQuery(childBlock.getPlan());
childBlock.setPlan(copy);
addedTableSubQueries.add(copy);
// Find a SubQueryNode whose input schema contains all of its output columns and the columns referenced by its targets
if (copy.getInSchema().containsAll(copy.getOutSchema().getRootColumns())) {
for (Target eachTarget : copy.getTargets()) {
Set<Column> columns = EvalTreeUtil.findUniqueColumns(eachTarget.getEvalTree());
if (copy.getInSchema().containsAll(columns)) {
leftMostSubQueryNode = copy;
break;
}
}
}
}
if (leftMostSubQueryNode != null) {
// replace target column name
List<Target> targets = leftMostSubQueryNode.getTargets();
int[] targetMappings = new int[targets.size()];
for (int i = 0; i < targets.size(); i++) {
if (targets.get(i).getEvalTree().getType() != EvalType.FIELD) {
throw new TajoInternalError("Target of a UnionNode's subquery should be FieldEval.");
}
int index = leftMostSubQueryNode.getInSchema().getColumnId(targets.get(i).getNamedColumn().getQualifiedName());
if (index < 0) {
// If a target has an alias, getNamedColumn() returns only the alias
Set<Column> columns = EvalTreeUtil.findUniqueColumns(targets.get(i).getEvalTree());
Column column = columns.iterator().next();
index = leftMostSubQueryNode.getInSchema().getColumnId(column.getQualifiedName());
}
if (index < 0) {
throw new TajoInternalError("Can't find matched Target in UnionNode's input schema: " + targets.get(i)
+ "->" + leftMostSubQueryNode.getInSchema());
}
targetMappings[i] = index;
}
for (TableSubQueryNode eachNode: addedTableSubQueries) {
if (eachNode.getPID() == leftMostSubQueryNode.getPID()) {
continue;
}
List<Target> eachNodeTargets = eachNode.getTargets();
if (eachNodeTargets.size() != targetMappings.length) {
throw new TajoInternalError("Union query can't have different number of target columns.");
}
for (int i = 0; i < eachNodeTargets.size(); i++) {
Column inColumn = eachNode.getInSchema().getColumn(targetMappings[i]);
Target t = eachNodeTargets.get(i);
t.setAlias(t.getNamedColumn().getQualifiedName());
EvalNode evalNode = eachNodeTargets.get(i).getEvalTree();
if (evalNode.getType() != EvalType.FIELD) {
throw new TajoInternalError("Target of a UnionNode's subquery should be FieldEval.");
}
FieldEval fieldEval = (FieldEval) evalNode;
EvalTreeUtil.changeColumnRef(fieldEval,
fieldEval.getColumnRef().getQualifiedName(), inColumn.getQualifiedName());
}
}
} else {
LOG.warn("Can't find left most SubQuery in the UnionNode.");
}
} else {
currentBlock.setPlan(node);
}
context.execBlockMap.put(node.getPID(), currentBlock);
return node;
}
@Override
public LogicalNode visitScan(GlobalPlanContext context, LogicalPlan plan, LogicalPlan.QueryBlock queryBlock,
ScanNode node, Stack<LogicalNode> stack) throws TajoException {
ExecutionBlock newExecBlock = context.plan.newExecutionBlock();
newExecBlock.setPlan(node);
context.execBlockMap.put(node.getPID(), newExecBlock);
return node;
}
@Override
public LogicalNode visitIndexScan(GlobalPlanContext context, LogicalPlan plan, LogicalPlan.QueryBlock block,
IndexScanNode node, Stack<LogicalNode> stack) throws TajoException {
ExecutionBlock newBlock = context.plan.newExecutionBlock();
newBlock.setPlan(node);
context.execBlockMap.put(node.getPID(), newBlock);
return node;
}
@Override
public LogicalNode visitPartitionedTableScan(GlobalPlanContext context, LogicalPlan plan,
LogicalPlan.QueryBlock block, PartitionedTableScanNode node,
Stack<LogicalNode> stack)throws TajoException {
ExecutionBlock newExecBlock = context.plan.newExecutionBlock();
newExecBlock.setPlan(node);
context.execBlockMap.put(node.getPID(), newExecBlock);
return node;
}
@Override
public LogicalNode visitStoreTable(GlobalPlanContext context, LogicalPlan plan, LogicalPlan.QueryBlock queryBlock,
StoreTableNode node, Stack<LogicalNode> stack) throws TajoException {
LogicalNode child = super.visitStoreTable(context, plan, queryBlock, node, stack);
ExecutionBlock childBlock = context.execBlockMap.remove(child.getPID());
ExecutionBlock newExecBlock = buildStorePlan(context, childBlock, node);
context.execBlockMap.put(node.getPID(), newExecBlock);
return node;
}
@Override
public LogicalNode visitCreateTable(GlobalPlanContext context, LogicalPlan plan, LogicalPlan.QueryBlock queryBlock,
CreateTableNode node, Stack<LogicalNode> stack) throws TajoException {
LogicalNode child = super.visitStoreTable(context, plan, queryBlock, node, stack);
ExecutionBlock childBlock = context.execBlockMap.remove(child.getPID());
ExecutionBlock newExecBlock = buildStorePlan(context, childBlock, node);
context.execBlockMap.put(node.getPID(), newExecBlock);
return node;
}
@Override
public LogicalNode visitInsert(GlobalPlanContext context, LogicalPlan plan, LogicalPlan.QueryBlock queryBlock,
InsertNode node, Stack<LogicalNode> stack)
throws TajoException {
LogicalNode child = super.visitInsert(context, plan, queryBlock, node, stack);
ExecutionBlock childBlock = context.execBlockMap.remove(child.getPID());
ExecutionBlock newExecBlock = buildStorePlan(context, childBlock, node);
context.execBlockMap.put(node.getPID(), newExecBlock);
return node;
}
@Override
public LogicalNode visitCreateIndex(GlobalPlanContext context, LogicalPlan plan, LogicalPlan.QueryBlock queryBlock,
CreateIndexNode node, Stack<LogicalNode> stack) throws TajoException {
LogicalNode child = super.visitCreateIndex(context, plan, queryBlock, node, stack);
// Don't separate execution block. CreateIndex is pushed to the first execution block.
ExecutionBlock childBlock = context.execBlockMap.remove(child.getPID());
node.setChild(childBlock.getPlan());
childBlock.setPlan(node);
context.execBlockMap.put(node.getPID(), childBlock);
return node;
}
}
}