processing/src/main/java/org/apache/druid/segment/virtual/ExpressionPlanner.java - druid - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 package org.apache.druid.segment.virtual;

 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.Iterables;
 import com.google.common.collect.Sets;
 import org.apache.druid.math.expr.Expr;
 import org.apache.druid.math.expr.ExprType;
 import org.apache.druid.math.expr.Parser;
 import org.apache.druid.segment.ColumnInspector;
 import org.apache.druid.segment.column.ColumnCapabilities;
 import org.apache.druid.segment.column.ValueType;

 import java.util.EnumSet;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
 import java.util.stream.Collectors;

 public class ExpressionPlanner
 {
   private ExpressionPlanner()
   {
     // Holder for static planning logic only; never instantiated.
   }

   /**
    * Analyzes an expression against the columns it reads and summarizes the findings as an {@link ExpressionPlan}.
    *
    * Because there is no well defined table schema across segments, the planner is lenient: it works out which kinds
    * of selectors can be built on top of the expression, whether multi-valued inputs need to be implicitly mapped,
    * whether the expression yields multi-valued output, and whether it can be vectorized. Callers examine the
    * returned plan to decide how to construct a selector.
    *
    * @param inspector  source of {@link ColumnCapabilities} for the columns the expression references
    * @param expression expression to analyze; it is validated with {@link Parser#validateExpr} before planning
    * @return the plan holding the collected traits, output type (if computed), single input type (if any), the
    *         columns with unknown or incomplete capabilities, and the columns which need implicit mapping
    */
   public static ExpressionPlan plan(ColumnInspector inspector, Expr expression)
   {
     final Expr.BindingAnalysis analysis = expression.analyzeInputs();
     Parser.validateExpr(expression, analysis);

     final EnumSet<ExpressionPlan.Trait> traits = EnumSet.noneOf(ExpressionPlan.Trait.class);
     final Set<String> missingCapabilities = new HashSet<>();
     final Set<String> possiblyMultiValued = new HashSet<>();
     List<String> toApply = ImmutableList.of();
     ValueType singleInputType = null;
     ExprType outputType = null;

     final Set<String> columns = analysis.getRequiredBindings();

     // first pass: traits which unlock optimized selectors (constant, or wrapping exactly one column)
     if (columns.isEmpty()) {
       traits.add(ExpressionPlan.Trait.CONSTANT);
     } else if (columns.size() == 1) {
       final String onlyColumn = Iterables.getOnlyElement(columns);
       singleInputType = detectSingleInputTraits(inspector.getColumnCapabilities(onlyColumn), analysis, traits);
     }

     // second pass: unless the expression was recognized as single input scalar, it may need automatic
     // transformation to map across multi-valued inputs (or row by row detection in the worst case)
     if (ExpressionPlan.none(traits, ExpressionPlan.Trait.SINGLE_INPUT_SCALAR)) {
       final Set<String> knownMultiValued = new HashSet<>();
       for (String name : analysis.getRequiredBindings()) {
         final ColumnCapabilities columnCapabilities = inspector.getColumnCapabilities(name);
         if (columnCapabilities == null) {
           missingCapabilities.add(name);
           continue;
         }
         if (columnCapabilities.hasMultipleValues().isTrue()) {
           knownMultiValued.add(name);
           continue;
         }
         final boolean stringThatMightBeMultiValued =
             columnCapabilities.getType().equals(ValueType.STRING)
             && columnCapabilities.hasMultipleValues().isMaybeTrue()
             && !analysis.getArrayBindings().contains(name);
         if (stringThatMightBeMultiValued) {
           possiblyMultiValued.add(name);
         }
       }

       // definitely multi-valued inputs which the expression does not consume as arrays must be implicitly mapped
       toApply = columns.stream()
                        .filter(knownMultiValued::contains)
                        .filter(name -> !analysis.getArrayBindings().contains(name))
                        .collect(Collectors.toList());

       // the expression uses at least one of its inputs as an array
       if (analysis.hasInputArrays()) {
         traits.add(ExpressionPlan.Trait.NON_SCALAR_INPUTS);
       }

       // at least one input has no capabilities at all
       if (!missingCapabilities.isEmpty()) {
         traits.add(ExpressionPlan.Trait.UNKNOWN_INPUTS);
       }

       // at least one string input might turn out to be multi-valued
       if (!possiblyMultiValued.isEmpty()) {
         traits.add(ExpressionPlan.Trait.INCOMPLETE_INPUTS);
       }

       // at least one input requires the implicit mapping transformation
       if (!toApply.isEmpty()) {
         traits.add(ExpressionPlan.Trait.NEEDS_APPLIED);
       }
     }

     // the output type is only inferred when there are no unknown or incomplete inputs
     final boolean inputsAreTrusted = ExpressionPlan.none(
         traits,
         ExpressionPlan.Trait.UNKNOWN_INPUTS,
         ExpressionPlan.Trait.INCOMPLETE_INPUTS
     );
     if (inputsAreTrusted) {
       outputType = expression.getOutputType(inspector);
     }

     // output is multi-valued if the analysis says so, or if the inferred output type is an array
     if (analysis.isOutputArray() || ExprType.isArray(outputType)) {
       traits.add(ExpressionPlan.Trait.NON_SCALAR_OUTPUT);

       // explicit array output disqualifies the single input optimizations; they only allow implicit mapping
       traits.remove(ExpressionPlan.Trait.SINGLE_INPUT_SCALAR);
       traits.remove(ExpressionPlan.Trait.SINGLE_INPUT_MAPPABLE);
     }

     // Vectorized expressions support none of: incomplete inputs, implicit mapping, multi-valued inputs or outputs.
     // Unknown inputs are not supported either, but are not checked here, since missing capabilities currently
     // indicate a non-existent column rather than an unknown schema. If that ever changes, so must this check.
     final boolean vectorCompatibleTraits = ExpressionPlan.none(
         traits,
         ExpressionPlan.Trait.INCOMPLETE_INPUTS,
         ExpressionPlan.Trait.NEEDS_APPLIED,
         ExpressionPlan.Trait.NON_SCALAR_INPUTS,
         ExpressionPlan.Trait.NON_SCALAR_OUTPUT
     );

     if (vectorCompatibleTraits && expression.canVectorize(inspector)) {
       // the output type may have been skipped above because of unknown inputs; for a vector expression that is
       // fine, as it only means the column does not exist, so compute it now
       outputType = expression.getOutputType(inspector);
       traits.add(ExpressionPlan.Trait.VECTORIZABLE);
     }

     return new ExpressionPlan(
         inspector,
         expression,
         analysis,
         traits,
         outputType,
         singleInputType,
         Sets.union(missingCapabilities, possiblyMultiValued),
         toApply
     );
   }

   /**
    * Adds the traits which allow a selector wrapping a single underlying column to be optimized (through caching
    * results, and via deferred execution when building dimension selectors).
    *
    * SINGLE_INPUT_SCALAR is added when the input is definitely single valued, with the additional requirement for
    * strings that the column is dictionary encoded. SINGLE_INPUT_MAPPABLE is added for a dictionary encoded string
    * input whose multi-valued-ness is not unknown. Neither is added when capabilities are missing, or when the
    * expression treats its input as an array or produces an array.
    *
    * @param capabilities capabilities of the only input column, or null if there are none
    * @param analysis     binding analysis of the expression
    * @param traits       trait set to add to
    * @return the type of the input column if at least one trait was added, otherwise null
    */
   private static ValueType detectSingleInputTraits(
       ColumnCapabilities capabilities,
       Expr.BindingAnalysis analysis,
       EnumSet<ExpressionPlan.Trait> traits
   )
   {
     if (capabilities == null || analysis.hasInputArrays() || analysis.isOutputArray()) {
       return null;
     }

     boolean scalar = capabilities.hasMultipleValues().isFalse();
     boolean mappable = false;
     if (capabilities.getType() == ValueType.STRING) {
       // non-short-circuit on purpose: the dictionary encoding check always runs for strings
       scalar = scalar & capabilities.isDictionaryEncoded().isTrue();
       mappable = capabilities.isDictionaryEncoded().isTrue()
                  && !capabilities.hasMultipleValues().isUnknown();
     }

     if (!scalar && !mappable) {
       return null;
     }

     final ValueType inputType = capabilities.getType();
     if (scalar) {
       traits.add(ExpressionPlan.Trait.SINGLE_INPUT_SCALAR);
     }
     if (mappable) {
       traits.add(ExpressionPlan.Trait.SINGLE_INPUT_MAPPABLE);
     }
     return inputType;
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	package org.apache.druid.segment.virtual;

	import com.google.common.collect.ImmutableList;
	import com.google.common.collect.Iterables;
	import com.google.common.collect.Sets;
	import org.apache.druid.math.expr.Expr;
	import org.apache.druid.math.expr.ExprType;
	import org.apache.druid.math.expr.Parser;
	import org.apache.druid.segment.ColumnInspector;
	import org.apache.druid.segment.column.ColumnCapabilities;
	import org.apache.druid.segment.column.ValueType;

	import java.util.EnumSet;
	import java.util.HashSet;
	import java.util.List;
	import java.util.Set;
	import java.util.stream.Collectors;

	public class ExpressionPlanner
	{
	private ExpressionPlanner()
	{
	// No instantiation.
	}

	/**
	* Druid tries to be chill to expressions to make up for not having a well defined table schema across segments. This
	* method performs some analysis to determine what sort of selectors can be constructed on top of an expression,
	* whether or not the expression will need implicitly mapped across multi-valued inputs, if the expression produces
	* multi-valued outputs, is vectorizable, and everything else interesting when making a selector.
	*
	* Results are stored in a {@link ExpressionPlan}, which can be examined to do whatever is necessary to make things
	* function properly.
	*/
	public static ExpressionPlan plan(ColumnInspector inspector, Expr expression)
	{
	final Expr.BindingAnalysis analysis = expression.analyzeInputs();
	Parser.validateExpr(expression, analysis);

	EnumSet<ExpressionPlan.Trait> traits = EnumSet.noneOf(ExpressionPlan.Trait.class);
	Set<String> noCapabilities = new HashSet<>();
	Set<String> maybeMultiValued = new HashSet<>();
	List<String> needsApplied = ImmutableList.of();
	ValueType singleInputType = null;
	ExprType outputType = null;

	final Set<String> columns = analysis.getRequiredBindings();

	// check and set traits which allow optimized selectors to be created
	if (columns.isEmpty()) {
	traits.add(ExpressionPlan.Trait.CONSTANT);
	} else if (columns.size() == 1) {
	final String column = Iterables.getOnlyElement(columns);
	final ColumnCapabilities capabilities = inspector.getColumnCapabilities(column);

	// These flags allow for selectors that wrap a single underlying column to be optimized, through caching results
	// and via allowing deferred execution in the case of building dimension selectors.
	// SINGLE_INPUT_SCALAR
	// is set if an input is single valued, and the output is definitely single valued, with an additional requirement
	// for strings that the column is dictionary encoded.
	// SINGLE_INPUT_MAPPABLE
	// is set when a single input string column, which can be multi-valued, but if so, it must be implicitly mappable
	// (i.e. the expression is not treating its input as an array and not wanting to output an array)
	if (capabilities != null && !analysis.hasInputArrays() && !analysis.isOutputArray()) {
	boolean isSingleInputMappable = false;
	boolean isSingleInputScalar = capabilities.hasMultipleValues().isFalse();
	if (capabilities.getType() == ValueType.STRING) {
	isSingleInputScalar &= capabilities.isDictionaryEncoded().isTrue();
	isSingleInputMappable = capabilities.isDictionaryEncoded().isTrue() &&
	!capabilities.hasMultipleValues().isUnknown();
	}

	// if satisfied, set single input output type and flags
	if (isSingleInputScalar \|\| isSingleInputMappable) {
	singleInputType = capabilities.getType();
	if (isSingleInputScalar) {
	traits.add(ExpressionPlan.Trait.SINGLE_INPUT_SCALAR);
	}
	if (isSingleInputMappable) {
	traits.add(ExpressionPlan.Trait.SINGLE_INPUT_MAPPABLE);
	}
	}
	}
	}

	// if we didn't eliminate this expression as a single input scalar or mappable expression, it might need
	// automatic transformation to map across multi-valued inputs (or row by row detection in the worst case)
	if (ExpressionPlan.none(traits, ExpressionPlan.Trait.SINGLE_INPUT_SCALAR)) {
	final Set<String> definitelyMultiValued = new HashSet<>();
	for (String column : analysis.getRequiredBindings()) {
	final ColumnCapabilities capabilities = inspector.getColumnCapabilities(column);
	if (capabilities != null) {
	if (capabilities.hasMultipleValues().isTrue()) {
	definitelyMultiValued.add(column);
	} else if (capabilities.getType().equals(ValueType.STRING) &&
	capabilities.hasMultipleValues().isMaybeTrue() &&
	!analysis.getArrayBindings().contains(column)
	) {
	maybeMultiValued.add(column);
	}
	} else {
	noCapabilities.add(column);
	}
	}

	// find any inputs which will need implicitly mapped across multi-valued rows
	needsApplied =
	columns.stream()
	.filter(c -> definitelyMultiValued.contains(c) && !analysis.getArrayBindings().contains(c))
	.collect(Collectors.toList());

	// if any multi-value inputs, set flag for non-scalar inputs
	if (analysis.hasInputArrays()) {
	traits.add(ExpressionPlan.Trait.NON_SCALAR_INPUTS);
	}

	if (!noCapabilities.isEmpty()) {
	traits.add(ExpressionPlan.Trait.UNKNOWN_INPUTS);
	}

	if (!maybeMultiValued.isEmpty()) {
	traits.add(ExpressionPlan.Trait.INCOMPLETE_INPUTS);
	}

	// if expression needs transformed, lets do it
	if (!needsApplied.isEmpty()) {
	traits.add(ExpressionPlan.Trait.NEEDS_APPLIED);
	}
	}

	// only set output type if we are pretty confident about input types
	final boolean shouldComputeOutput = ExpressionPlan.none(
	traits,
	ExpressionPlan.Trait.UNKNOWN_INPUTS,
	ExpressionPlan.Trait.INCOMPLETE_INPUTS
	);

	if (shouldComputeOutput) {
	outputType = expression.getOutputType(inspector);
	}

	// if analysis predicts output, or inferred output type is array, output will be multi-valued
	if (analysis.isOutputArray() \|\| ExprType.isArray(outputType)) {
	traits.add(ExpressionPlan.Trait.NON_SCALAR_OUTPUT);

	// single input mappable may not produce array output explicitly, only through implicit mapping
	traits.remove(ExpressionPlan.Trait.SINGLE_INPUT_SCALAR);
	traits.remove(ExpressionPlan.Trait.SINGLE_INPUT_MAPPABLE);
	}

	// vectorized expressions do not support incomplete, multi-valued inputs or outputs, or implicit mapping
	// they also do not support unknown inputs, but they also do not currently have to deal with them, as missing
	// capabilites is indicative of a non-existent column instead of an unknown schema. If this ever changes,
	// this check should also change
	boolean supportsVector = ExpressionPlan.none(
	traits,
	ExpressionPlan.Trait.INCOMPLETE_INPUTS,
	ExpressionPlan.Trait.NEEDS_APPLIED,
	ExpressionPlan.Trait.NON_SCALAR_INPUTS,
	ExpressionPlan.Trait.NON_SCALAR_OUTPUT
	);

	if (supportsVector && expression.canVectorize(inspector)) {
	// make sure to compute the output type for a vector expression though, because we might have skipped it earlier
	// due to unknown inputs, but that's ok here since it just means it doesnt exist
	outputType = expression.getOutputType(inspector);
	traits.add(ExpressionPlan.Trait.VECTORIZABLE);
	}

	return new ExpressionPlan(
	inspector,
	expression,
	analysis,
	traits,
	outputType,
	singleInputType,
	Sets.union(noCapabilities, maybeMultiValued),
	needsApplied
	);
	}
	}