// exec/java-exec/src/main/java/org/apache/drill/exec/planner/physical/MetadataAggPrule.java - drill - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.drill.exec.planner.physical;

 import org.apache.calcite.plan.RelOptRuleCall;
 import org.apache.calcite.plan.RelTraitSet;
 import org.apache.calcite.rel.RelCollation;
 import org.apache.calcite.rel.RelCollationImpl;
 import org.apache.calcite.rel.RelCollations;
 import org.apache.calcite.rel.RelFieldCollation;
 import org.apache.calcite.rel.RelNode;
 import org.apache.drill.common.expression.FieldReference;
 import org.apache.drill.common.expression.SchemaPath;
 import org.apache.drill.common.logical.data.NamedExpression;
 import org.apache.drill.exec.planner.logical.DrillRel;
 import org.apache.drill.exec.planner.logical.MetadataAggRel;
 import org.apache.drill.exec.planner.logical.RelOptHelper;
 import org.apache.drill.exec.planner.physical.AggPrelBase.OperatorPhase;
 import org.apache.drill.exec.planner.physical.DrillDistributionTrait.NamedDistributionField;
 import org.apache.drill.exec.store.parquet.FilterEvaluatorUtils.FieldReferenceFinder;
 import com.google.common.collect.ImmutableList;

 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
 import java.util.Set;
 import java.util.stream.Collectors;

 public class MetadataAggPrule extends Prule {
   public static final MetadataAggPrule INSTANCE = new MetadataAggPrule();

   public MetadataAggPrule() {
     super(RelOptHelper.any(MetadataAggRel.class, DrillRel.DRILL_LOGICAL),
         "MetadataAggPrule");
   }

   @Override
   public void onMatch(RelOptRuleCall call) {
     MetadataAggRel aggregate = call.rel(0);
     RelNode input = aggregate.getInput();

     int groupByExprsSize = aggregate.getContext().groupByExpressions().size();

     List<RelFieldCollation> collations = new ArrayList<>();
     List<String> names = new ArrayList<>();
     for (int i = 0; i < groupByExprsSize; i++) {
       collations.add(new RelFieldCollation(i + 1));
       SchemaPath fieldPath = getArgumentReference(aggregate.getContext().groupByExpressions().get(i));
       names.add(fieldPath.getRootSegmentPath());
     }

     RelCollation collation = new NamedRelCollation(collations, names);

     RelTraitSet traits;

     if (aggregate.getContext().groupByExpressions().isEmpty()) {
       DrillDistributionTrait singleDist = DrillDistributionTrait.SINGLETON;
       RelTraitSet singleDistTrait = call.getPlanner().emptyTraitSet().plus(Prel.DRILL_PHYSICAL).plus(singleDist);

       createTransformRequest(call, aggregate, input, singleDistTrait);
     } else {
       // hash distribute on all grouping keys
       DrillDistributionTrait distOnAllKeys =
           new DrillDistributionTrait(DrillDistributionTrait.DistributionType.HASH_DISTRIBUTED,
               ImmutableList.copyOf(getDistributionFields(aggregate.getContext().groupByExpressions())));

       PlannerSettings settings = PrelUtil.getPlannerSettings(call.getPlanner());
       boolean smallInput =
           input.estimateRowCount(input.getCluster().getMetadataQuery()) < settings.getSliceTarget();

       // force 2-phase aggregation for bottom aggregate call
       // to produce sort locally before aggregation is produced for large inputs
       if (aggregate.getContext().createNewAggregations() && !smallInput) {
         traits = call.getPlanner().emptyTraitSet().plus(Prel.DRILL_PHYSICAL);
         RelNode convertedInput = convert(input, traits);

         new TwoPhaseMetadataAggSubsetTransformer(call, collation, distOnAllKeys)
             .go(aggregate, convertedInput);
       } else {
         // TODO: DRILL-7433 - replace DrillDistributionTrait.SINGLETON with distOnAllKeys when parallelization for MetadataHandler is implemented
         traits = call.getPlanner().emptyTraitSet().plus(Prel.DRILL_PHYSICAL).plus(collation).plus(DrillDistributionTrait.SINGLETON);
         createTransformRequest(call, aggregate, input, traits);
       }
     }
   }

   private void createTransformRequest(RelOptRuleCall call, MetadataAggRel aggregate,
       RelNode input, RelTraitSet traits) {

     RelNode convertedInput = convert(input, PrelUtil.fixTraits(call, traits));

     MetadataStreamAggPrel newAgg = new MetadataStreamAggPrel(
         aggregate.getCluster(),
         traits,
         convertedInput,
         aggregate.getContext(),
         OperatorPhase.PHASE_1of1);

     call.transformTo(newAgg);
   }

   /**
    * Returns list with named distribution fields which correspond to specified expressions arguments.
    *
    * @param namedExpressions expressions list
    * @return list of {@link NamedDistributionField} instances
    */
   private static List<NamedDistributionField> getDistributionFields(List<NamedExpression> namedExpressions) {
     List<NamedDistributionField> distributionFields = new ArrayList<>();
     int groupByExprsSize = namedExpressions.size();

     for (int index = 0; index < groupByExprsSize; index++) {
       SchemaPath fieldPath = getArgumentReference(namedExpressions.get(index));
       NamedDistributionField field =
           new NamedDistributionField(index + 1, fieldPath.getRootSegmentPath());
       distributionFields.add(field);
     }

     return distributionFields;
   }

   /**
    * Returns {@link FieldReference} instance which corresponds to the argument of specified {@code namedExpression}.
    *
    * @param namedExpression expression
    * @return {@link FieldReference} instance
    */
   private static FieldReference getArgumentReference(NamedExpression namedExpression) {
     Set<SchemaPath> arguments = namedExpression.getExpr().accept(FieldReferenceFinder.INSTANCE, null);
     assert arguments.size() == 1 : "Group by expression contains more than one argument";
     return new FieldReference(arguments.iterator().next());
   }

   /**
    * Implementation of {@link RelCollationImpl} with field name.
    * Stores {@link RelFieldCollation} list and corresponding field names to be used in sort operators.
    * Field name is required for the case of dynamic schema discovering
    * when field is not present in rel data type at planning time.
    */
   public static class NamedRelCollation extends RelCollationImpl {
     private final List<String> names;

     protected NamedRelCollation(List<RelFieldCollation> fieldCollations, List<String> names) {
       super(com.google.common.collect.ImmutableList.copyOf(fieldCollations));
       this.names = Collections.unmodifiableList(names);
     }

     public String getName(int collationIndex) {
       return names.get(collationIndex - 1);
     }
   }

   /**
    * {@link SubsetTransformer} for creating two-phase metadata aggregation.
    */
   private static class TwoPhaseMetadataAggSubsetTransformer
       extends SubsetTransformer<MetadataAggRel, RuntimeException> {

     private final RelCollation collation;
     private final DrillDistributionTrait distributionTrait;

     public TwoPhaseMetadataAggSubsetTransformer(RelOptRuleCall call,
         RelCollation collation, DrillDistributionTrait distributionTrait) {
       super(call);
       this.collation = collation;
       this.distributionTrait = distributionTrait;
     }

     @Override
     public RelNode convertChild(MetadataAggRel aggregate, RelNode child) {
       DrillDistributionTrait toDist = child.getTraitSet().getTrait(DrillDistributionTraitDef.INSTANCE);
       RelTraitSet traits = newTraitSet(Prel.DRILL_PHYSICAL, RelCollations.EMPTY, toDist);
       RelNode newInput = convert(child, traits);

       // maps group by expressions to themselves to be able to produce the second aggregation
       List<NamedExpression> identityExpressions = aggregate.getContext().groupByExpressions().stream()
           .map(namedExpression -> new NamedExpression(namedExpression.getExpr(), getArgumentReference(namedExpression)))
           .collect(Collectors.toList());

       // use hash aggregation for the first stage to avoid sorting raw data
       MetadataHashAggPrel phase1Agg = new MetadataHashAggPrel(
           aggregate.getCluster(),
           traits,
           newInput,
           aggregate.getContext().toBuilder().groupByExpressions(identityExpressions).build(),
           OperatorPhase.PHASE_1of2);

       traits = newTraitSet(Prel.DRILL_PHYSICAL, collation, toDist).plus(distributionTrait);
       SortPrel sort = new SortPrel(
           aggregate.getCluster(),
           traits,
           phase1Agg,
           (RelCollation) traits.getTrait(collation.getTraitDef()));

       int numEndPoints = PrelUtil.getSettings(phase1Agg.getCluster()).numEndPoints();

       HashToMergeExchangePrel exch =
           new HashToMergeExchangePrel(phase1Agg.getCluster(),
               traits,
               sort,
               ImmutableList.copyOf(getDistributionFields(aggregate.getContext().groupByExpressions())),
               collation,
               numEndPoints);

       return new MetadataStreamAggPrel(
           aggregate.getCluster(),
           newTraitSet(Prel.DRILL_PHYSICAL, collation, DrillDistributionTrait.SINGLETON),
           exch,
           aggregate.getContext(),
           OperatorPhase.PHASE_2of2);
     }
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.drill.exec.planner.physical;

	import org.apache.calcite.plan.RelOptRuleCall;
	import org.apache.calcite.plan.RelTraitSet;
	import org.apache.calcite.rel.RelCollation;
	import org.apache.calcite.rel.RelCollationImpl;
	import org.apache.calcite.rel.RelCollations;
	import org.apache.calcite.rel.RelFieldCollation;
	import org.apache.calcite.rel.RelNode;
	import org.apache.drill.common.expression.FieldReference;
	import org.apache.drill.common.expression.SchemaPath;
	import org.apache.drill.common.logical.data.NamedExpression;
	import org.apache.drill.exec.planner.logical.DrillRel;
	import org.apache.drill.exec.planner.logical.MetadataAggRel;
	import org.apache.drill.exec.planner.logical.RelOptHelper;
	import org.apache.drill.exec.planner.physical.AggPrelBase.OperatorPhase;
	import org.apache.drill.exec.planner.physical.DrillDistributionTrait.NamedDistributionField;
	import org.apache.drill.exec.store.parquet.FilterEvaluatorUtils.FieldReferenceFinder;
	import com.google.common.collect.ImmutableList;

	import java.util.ArrayList;
	import java.util.Collections;
	import java.util.List;
	import java.util.Set;
	import java.util.stream.Collectors;

	/**
	 * Physical planning rule which converts a logical {@link MetadataAggRel} into physical metadata aggregate operators.
	 * <p>
	 * {@link #onMatch(RelOptRuleCall)} produces one of the following plans:
	 * <ul>
	 *   <li>no group by expressions: single-phase {@link MetadataStreamAggPrel}
	 *   over the input with singleton distribution;</li>
	 *   <li>group by expressions are present, the context requests creating new aggregations and the estimated
	 *   input row count is not less than the slice target: two-phase aggregation built by
	 *   {@code TwoPhaseMetadataAggSubsetTransformer};</li>
	 *   <li>otherwise: single-phase {@link MetadataStreamAggPrel} over the input with collation
	 *   on group by fields and singleton distribution.</li>
	 * </ul>
	 */
	public class MetadataAggPrule extends Prule {
	  public static final MetadataAggPrule INSTANCE = new MetadataAggPrule();

	  /**
	   * Creates the rule which matches any {@link MetadataAggRel} with {@link DrillRel#DRILL_LOGICAL} trait.
	   */
	  public MetadataAggPrule() {
	    super(RelOptHelper.any(MetadataAggRel.class, DrillRel.DRILL_LOGICAL),
	        "MetadataAggPrule");
	  }

	  /**
	   * Registers physical equivalent for the matched {@link MetadataAggRel}.
	   *
	   * @param call rule call whose first operand is the {@link MetadataAggRel} to convert
	   */
	  @Override
	  public void onMatch(RelOptRuleCall call) {
	    MetadataAggRel aggregate = call.rel(0);
	    RelNode input = aggregate.getInput();
	    List<NamedExpression> groupByExpressions = aggregate.getContext().groupByExpressions();
	    int groupByExprsSize = groupByExpressions.size();

	    // Collation field index is the position of group by expression shifted by one;
	    // NamedRelCollation.getName() applies the reverse shift to look up the field name.
	    // NOTE(review): presumably the shift skips a leading column of the input - confirm.
	    List<RelFieldCollation> collations = new ArrayList<>(groupByExprsSize);
	    List<String> names = new ArrayList<>(groupByExprsSize);
	    for (int i = 0; i < groupByExprsSize; i++) {
	      collations.add(new RelFieldCollation(i + 1));
	      SchemaPath fieldPath = getArgumentReference(groupByExpressions.get(i));
	      names.add(fieldPath.getRootSegmentPath());
	    }

	    RelCollation collation = new NamedRelCollation(collations, names);

	    if (groupByExpressions.isEmpty()) {
	      // nothing to group by: aggregate everything in a single fragment
	      RelTraitSet singleDistTrait = call.getPlanner().emptyTraitSet()
	          .plus(Prel.DRILL_PHYSICAL)
	          .plus(DrillDistributionTrait.SINGLETON);

	      createTransformRequest(call, aggregate, input, singleDistTrait);
	      return;
	    }

	    // hash distribute on all grouping keys
	    DrillDistributionTrait distOnAllKeys =
	        new DrillDistributionTrait(DrillDistributionTrait.DistributionType.HASH_DISTRIBUTED,
	            ImmutableList.copyOf(getDistributionFields(groupByExpressions)));

	    PlannerSettings settings = PrelUtil.getPlannerSettings(call.getPlanner());
	    boolean smallInput =
	        input.estimateRowCount(input.getCluster().getMetadataQuery()) < settings.getSliceTarget();

	    // force 2-phase aggregation for bottom aggregate call
	    // to produce sort locally before aggregation is produced for large inputs
	    if (aggregate.getContext().createNewAggregations() && !smallInput) {
	      RelTraitSet physicalTraits = call.getPlanner().emptyTraitSet().plus(Prel.DRILL_PHYSICAL);
	      RelNode convertedInput = convert(input, physicalTraits);

	      new TwoPhaseMetadataAggSubsetTransformer(call, collation, distOnAllKeys)
	          .go(aggregate, convertedInput);
	    } else {
	      // TODO: DRILL-7433 - replace DrillDistributionTrait.SINGLETON with distOnAllKeys when parallelization for MetadataHandler is implemented
	      RelTraitSet sortedSingletonTraits = call.getPlanner().emptyTraitSet()
	          .plus(Prel.DRILL_PHYSICAL)
	          .plus(collation)
	          .plus(DrillDistributionTrait.SINGLETON);
	      createTransformRequest(call, aggregate, input, sortedSingletonTraits);
	    }
	  }

	  /**
	   * Converts the input to the specified traits and registers a single-phase
	   * {@link MetadataStreamAggPrel} on top of it as the transformation result.
	   *
	   * @param call      rule call to register the transformation in
	   * @param aggregate logical metadata aggregate which is being converted
	   * @param input     input of the logical aggregate
	   * @param traits    traits of the resulting aggregate; the input is converted to these traits
	   *                  after they are passed through {@code PrelUtil.fixTraits}
	   */
	  private void createTransformRequest(RelOptRuleCall call, MetadataAggRel aggregate,
	      RelNode input, RelTraitSet traits) {

	    RelNode convertedInput = convert(input, PrelUtil.fixTraits(call, traits));

	    MetadataStreamAggPrel newAgg = new MetadataStreamAggPrel(
	        aggregate.getCluster(),
	        traits,
	        convertedInput,
	        aggregate.getContext(),
	        OperatorPhase.PHASE_1of1);

	    call.transformTo(newAgg);
	  }

	  /**
	   * Returns list with named distribution fields which correspond to specified expressions arguments.
	   * Field id of every distribution field is the position of its expression in the list plus one.
	   *
	   * @param namedExpressions expressions list
	   * @return list of {@link NamedDistributionField} instances
	   */
	  private static List<NamedDistributionField> getDistributionFields(List<NamedExpression> namedExpressions) {
	    int groupByExprsSize = namedExpressions.size();
	    List<NamedDistributionField> distributionFields = new ArrayList<>(groupByExprsSize);

	    for (int index = 0; index < groupByExprsSize; index++) {
	      SchemaPath fieldPath = getArgumentReference(namedExpressions.get(index));
	      NamedDistributionField field =
	          new NamedDistributionField(index + 1, fieldPath.getRootSegmentPath());
	      distributionFields.add(field);
	    }

	    return distributionFields;
	  }

	  /**
	   * Returns {@link FieldReference} instance which corresponds to the argument of specified {@code namedExpression}.
	   * The expression is expected to reference exactly one field; this is checked with an assertion only,
	   * so when assertions are disabled the first found field is used.
	   *
	   * @param namedExpression expression
	   * @return {@link FieldReference} instance
	   */
	  private static FieldReference getArgumentReference(NamedExpression namedExpression) {
	    Set<SchemaPath> arguments = namedExpression.getExpr().accept(FieldReferenceFinder.INSTANCE, null);
	    // the check fails for zero arguments as well, so the message reports the actual count
	    assert arguments.size() == 1
	        : "Group by expression must contain exactly one argument, but found " + arguments.size();
	    return new FieldReference(arguments.iterator().next());
	  }

	  /**
	   * Implementation of {@link RelCollationImpl} with field name.
	   * Stores {@link RelFieldCollation} list and corresponding field names to be used in sort operators.
	   * Field name is required for the case of dynamic schema discovering
	   * when field is not present in rel data type at planning time.
	   */
	  public static class NamedRelCollation extends RelCollationImpl {
	    private final List<String> names;

	    /**
	     * @param fieldCollations field collations
	     * @param names           field names; the name at position {@code i} is returned
	     *                        by {@link #getName(int)} for index {@code i + 1}
	     */
	    protected NamedRelCollation(List<RelFieldCollation> fieldCollations, List<String> names) {
	      super(com.google.common.collect.ImmutableList.copyOf(fieldCollations));
	      // copy the list so that later changes of the caller's list do not leak into this trait
	      this.names = Collections.unmodifiableList(new ArrayList<>(names));
	    }

	    /**
	     * Returns field name for the specified collation field index.
	     *
	     * @param collationIndex one-based index, i.e. position of the name in the names list plus one
	     * @return field name
	     */
	    public String getName(int collationIndex) {
	      return names.get(collationIndex - 1);
	    }
	  }

	  /**
	   * {@link SubsetTransformer} for creating two-phase metadata aggregation.
	   * The produced plan is: {@link MetadataHashAggPrel} (phase 1 of 2), {@link SortPrel},
	   * {@link HashToMergeExchangePrel}, {@link MetadataStreamAggPrel} (phase 2 of 2).
	   */
	  private static class TwoPhaseMetadataAggSubsetTransformer
	      extends SubsetTransformer<MetadataAggRel, RuntimeException> {

	    private final RelCollation collation;
	    private final DrillDistributionTrait distributionTrait;

	    /**
	     * @param call              rule call the transformer works for
	     * @param collation         collation on group by fields used for the sort, the exchange
	     *                          and the second phase aggregate
	     * @param distributionTrait distribution trait added to the traits of the sort and the exchange
	     */
	    public TwoPhaseMetadataAggSubsetTransformer(RelOptRuleCall call,
	        RelCollation collation, DrillDistributionTrait distributionTrait) {
	      super(call);
	      this.collation = collation;
	      this.distributionTrait = distributionTrait;
	    }

	    /**
	     * Builds two-phase aggregation on top of the specified child.
	     *
	     * @param aggregate logical metadata aggregate which is being converted
	     * @param child     input to build the aggregation on
	     * @return second phase {@link MetadataStreamAggPrel} with singleton distribution
	     */
	    @Override
	    public RelNode convertChild(MetadataAggRel aggregate, RelNode child) {
	      List<NamedExpression> groupByExpressions = aggregate.getContext().groupByExpressions();

	      // first phase keeps distribution of the child and does not require any ordering
	      DrillDistributionTrait toDist = child.getTraitSet().getTrait(DrillDistributionTraitDef.INSTANCE);
	      RelTraitSet phase1Traits = newTraitSet(Prel.DRILL_PHYSICAL, RelCollations.EMPTY, toDist);
	      RelNode newInput = convert(child, phase1Traits);

	      // maps group by expressions to themselves to be able to produce the second aggregation
	      List<NamedExpression> identityExpressions = groupByExpressions.stream()
	          .map(namedExpression -> new NamedExpression(namedExpression.getExpr(), getArgumentReference(namedExpression)))
	          .collect(Collectors.toList());

	      // use hash aggregation for the first stage to avoid sorting raw data
	      MetadataHashAggPrel phase1Agg = new MetadataHashAggPrel(
	          aggregate.getCluster(),
	          phase1Traits,
	          newInput,
	          aggregate.getContext().toBuilder().groupByExpressions(identityExpressions).build(),
	          OperatorPhase.PHASE_1of2);

	      // sort output of the first phase locally; the collation instance is taken back from the trait set
	      RelTraitSet sortTraits = newTraitSet(Prel.DRILL_PHYSICAL, collation, toDist).plus(distributionTrait);
	      SortPrel sort = new SortPrel(
	          aggregate.getCluster(),
	          sortTraits,
	          phase1Agg,
	          (RelCollation) sortTraits.getTrait(collation.getTraitDef()));

	      int numEndPoints = PrelUtil.getSettings(phase1Agg.getCluster()).numEndPoints();

	      HashToMergeExchangePrel exch =
	          new HashToMergeExchangePrel(phase1Agg.getCluster(),
	              sortTraits,
	              sort,
	              ImmutableList.copyOf(getDistributionFields(groupByExpressions)),
	              collation,
	              numEndPoints);

	      return new MetadataStreamAggPrel(
	          aggregate.getCluster(),
	          newTraitSet(Prel.DRILL_PHYSICAL, collation, DrillDistributionTrait.SINGLETON),
	          exch,
	          aggregate.getContext(),
	          OperatorPhase.PHASE_2of2);
	    }
	  }
	}