blob: d9079da6afde981b03680b2b7a73c723ccdc8b38 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.phoenix.hive.ql.index;
import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
import org.apache.hadoop.hive.ql.lib.Dispatcher;
import org.apache.hadoop.hive.ql.lib.GraphWalker;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.lib.Rule;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils;
import org.apache.hadoop.hive.ql.plan.ExprNodeFieldDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBaseCompare;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBetween;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFIn;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNot;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNotNull;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNull;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToBinary;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToChar;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToDate;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToDecimal;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToUnixTimeStamp;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToUtcTimestamp;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToVarchar;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.phoenix.hive.util.TypeInfoUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Stack;
/**
 * Clone of org.apache.hadoop.hive.ql.index.IndexPredicateAnalyzer with a
 * modified analyzePredicate method, used to decide which predicates can be
 * pushed down to Phoenix.
 */
public class IndexPredicateAnalyzer {
// Logger for trace-level diagnostics during predicate analysis.
private static final Logger LOG = LoggerFactory.getLogger(IndexPredicateAnalyzer.class);
// Names of comparison UDFs registered via addComparisonOp(String). This set is
// also installed, by reference, as the default allowed set in allowColumnName.
private final Set<String> udfNames;
// Allowed UDF names per column; a value may alias udfNames (the shared default),
// which addComparisonOp(String, String...) detects by identity and replaces.
private final Map<String, Set<String>> columnToUDFs;
// Optional validator for struct-field references (see isValidField); may be null.
private FieldValidator fieldValidator;
// When false, comparisons that drill into struct fields are rejected in analyzeExpr.
private boolean acceptsFields;
/**
 * Creates an analyzer with no registered comparison operators and no column
 * restrictions; callers register operators via {@link #addComparisonOp(String)}.
 */
public IndexPredicateAnalyzer() {
    this.udfNames = new HashSet<>();
    this.columnToUDFs = new HashMap<>();
}
/** Sets the validator used to accept or reject struct-field references. */
public void setFieldValidator(FieldValidator validator) {
    this.fieldValidator = validator;
}
/**
 * Registers a comparison operator that an index search can satisfy. Until at
 * least one operator is registered, analyzePredicate finds no indexable
 * conditions.
 *
 * @param udfName operator name, as returned by {@link GenericUDFBridge#getUdfName}
 *                for simple UDFs or {@code udf.getClass().getName()} for generic UDFs
 */
public void addComparisonOp(String udfName) {
    this.udfNames.add(udfName);
}
/**
 * Clears the set of column names allowed in comparisons. (Initially, every
 * column name is allowed.)
 */
public void clearAllowedColumnNames() {
    this.columnToUDFs.clear();
}
/**
 * Adds a column name to the set of allowed columns, granting it the shared
 * default operator set (the same object as {@code udfNames}, by reference).
 *
 * @param columnName name of the column to allow
 */
public void allowColumnName(String columnName) {
    // Intentionally shares udfNames; addComparisonOp(column, ...) replaces it
    // with a private copy when column-specific operators are added.
    this.columnToUDFs.put(columnName, this.udfNames);
}
/**
 * Registers allowed comparison operators for one specific column.
 *
 * @param columnName column the operators apply to
 * @param udfs operator names to allow for that column
 */
public void addComparisonOp(String columnName, String... udfs) {
    Set<String> current = columnToUDFs.get(columnName);
    if (current == null || current == udfNames) {
        // No column-specific set yet (or it still aliases the shared default):
        // install a private copy so the default set is not mutated.
        columnToUDFs.put(columnName, new HashSet<>(Arrays.asList(udfs)));
    } else {
        current.addAll(Arrays.asList(udfs));
    }
}
/**
 * Walks the predicate tree and gathers pushable comparisons into
 * {@code searchConditions}.
 *
 * @param predicate the predicate to analyze
 * @param searchConditions output list receiving the translated conditions
 * @return the residual predicate that could not be translated, or null when
 *         the entire predicate was pushed down
 */
public ExprNodeDesc analyzePredicate(ExprNodeDesc predicate,
        final List<IndexSearchCondition> searchConditions) {
    NodeProcessor processor = new NodeProcessor() {
        @Override
        public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
                Object... nodeOutputs) throws SemanticException {
            // Only pure conjunctions are pushable: every ancestor between the
            // root and this node must be an AND; otherwise (OR, CASE, ...)
            // keep the node as residual.
            for (Node ancestor : stack) {
                if (nd == ancestor) {
                    break;
                }
                if (!FunctionRegistry.isOpAnd((ExprNodeDesc) ancestor)) {
                    return nd;
                }
            }
            return analyzeExpr((ExprNodeGenericFuncDesc) nd, searchConditions, nodeOutputs);
        }
    };
    // No special rules: the default processor handles every node.
    Map<Rule, NodeProcessor> rules = new LinkedHashMap<>();
    Dispatcher dispatcher = new DefaultRuleDispatcher(processor, rules, null);
    GraphWalker walker = new DefaultGraphWalker(dispatcher);
    ArrayList<Node> roots = new ArrayList<>();
    roots.add(predicate);
    HashMap<Node, Object> outputs = new HashMap<>();
    try {
        walker.startWalking(roots, outputs);
    } catch (SemanticException e) {
        throw new RuntimeException(e);
    }
    return (ExprNodeDesc) outputs.get(predicate);
}
// Unwraps a column reference hidden behind a simple single-argument cast
// (UDFBridge or To* conversions). Returns the inner column expression when the
// pattern matches; otherwise returns expr unchanged.
private ExprNodeDesc getColumnExpr(ExprNodeDesc expr) {
    if (expr instanceof ExprNodeColumnDesc) {
        return expr;
    }
    if (!(expr instanceof ExprNodeGenericFuncDesc)) {
        return expr;
    }
    ExprNodeGenericFuncDesc func = (ExprNodeGenericFuncDesc) expr;
    GenericUDF udf = func.getGenericUDF();
    boolean isSimpleCast = udf instanceof GenericUDFBridge
            || udf instanceof GenericUDFToBinary
            || udf instanceof GenericUDFToChar
            || udf instanceof GenericUDFToVarchar
            || udf instanceof GenericUDFToDecimal
            || udf instanceof GenericUDFToDate
            || udf instanceof GenericUDFToUnixTimeStamp
            || udf instanceof GenericUDFToUtcTimestamp;
    if (isSimpleCast && func.getChildren().size() == 1
            && func.getChildren().get(0) instanceof ExprNodeColumnDesc) {
        // Peel off the cast and return the bare column reference.
        return expr.getChildren().get(0);
    }
    return expr;
}
/**
 * Converts a (NOT) BETWEEN predicate into an index search condition.
 * nodeOutputs layout: [0] the NOT flag constant, [1] the column (or rowKey
 * field), [2..] the lower/upper bound operands.
 */
private void processingBetweenOperator(ExprNodeGenericFuncDesc expr,
        List<IndexSearchCondition> searchConditions, Object... nodeOutputs) {
    final boolean isNot = (Boolean) ((ExprNodeConstantDesc) nodeOutputs[0]).getValue();
    ExprNodeDesc column = (ExprNodeDesc) nodeOutputs[1];
    String[] fields = null;
    if (column instanceof ExprNodeFieldDesc) {
        // Comparison against a field inside the rowKey struct.
        ExprNodeFieldDesc fieldDesc = (ExprNodeFieldDesc) column;
        fields = ExprNodeDescUtils.extractFields(fieldDesc);
        ExprNodeDesc[] pair = ExprNodeDescUtils.extractComparePair(
                (ExprNodeDesc) nodeOutputs[1], (ExprNodeDesc) nodeOutputs[2]);
        column = pair[0];
    }
    addSearchConditionIfPossible(expr, searchConditions, fields, isNot, column,
            Arrays.copyOfRange(nodeOutputs, 2, nodeOutputs.length));
}
/**
 * Appends an IndexSearchCondition for the given column/operands when the
 * column reference resolves to a plain column and every operand is a constant
 * convertible to the column's type; otherwise does nothing.
 */
private void addSearchConditionIfPossible(ExprNodeGenericFuncDesc expr,
        List<IndexSearchCondition> searchConditions, String[] fields, boolean isNot,
        ExprNodeDesc columnNodeDesc, Object[] nodeOutputs) {
    // Peel simple casts; bail out unless a bare column reference remains.
    ExprNodeDesc unwrapped = getColumnExpr(columnNodeDesc);
    if (!(unwrapped instanceof ExprNodeColumnDesc)) {
        return;
    }
    ExprNodeColumnDesc column = (ExprNodeColumnDesc) unwrapped;
    String udfName = expr.getGenericUDF().getUdfName();
    ExprNodeConstantDesc[] constants = null;
    if (nodeOutputs != null) {
        constants = extractConstants(column, nodeOutputs);
        if (constants == null) {
            // At least one operand is not a usable constant.
            return;
        }
    }
    searchConditions.add(
            new IndexSearchCondition(column, udfName, constants, expr, fields, isNot));
}
// True when the constant's type can be implicitly converted to the column's
// type (direction: from(constant) -> to(columnDesc)).
private boolean isAcceptableConstants(ExprNodeDesc columnDesc, ExprNodeDesc constant) {
    return TypeInfoUtils.implicitConvertible(
            constant.getTypeInfo(), columnDesc.getTypeInfo());
}
/**
 * Extracts one constant per node output, paired against the given column.
 *
 * @return the constants in order, or null when any output is not a constant
 *         implicitly convertible to the column's type
 */
private ExprNodeConstantDesc[] extractConstants(ExprNodeColumnDesc columnDesc, Object... nodeOutputs) {
    final int count = nodeOutputs.length;
    ExprNodeConstantDesc[] result = new ExprNodeConstantDesc[count];
    for (int idx = 0; idx < count; idx++) {
        ExprNodeDesc[] pair = ExprNodeDescUtils.extractComparePair(
                columnDesc, (ExprNodeDesc) nodeOutputs[idx]);
        if (pair == null || !isAcceptableConstants(columnDesc, pair[1])) {
            return null;
        }
        result[idx] = (ExprNodeConstantDesc) pair[1];
    }
    return result;
}
/**
 * Converts an IN (or NOT IN) predicate into an index search condition.
 * nodeOutputs layout: [0] the column (or rowKey field), [1..] the IN values.
 *
 * @param isNot true when the IN expression appears under a NOT operator
 */
private void processingInOperator(ExprNodeGenericFuncDesc expr,
        List<IndexSearchCondition> searchConditions, boolean isNot, Object... nodeOutputs) {
    if (LOG.isTraceEnabled()) {
        LOG.trace("Processing In Operator. nodeOutputs : " + new ArrayList<>(Arrays.asList(nodeOutputs)));
    }
    ExprNodeDesc column = (ExprNodeDesc) nodeOutputs[0];
    String[] fields = null;
    if (column instanceof ExprNodeFieldDesc) {
        // rowKey field
        ExprNodeFieldDesc fieldDesc = (ExprNodeFieldDesc) column;
        fields = ExprNodeDescUtils.extractFields(fieldDesc);
        ExprNodeDesc[] extracted = ExprNodeDescUtils.extractComparePair(
                (ExprNodeDesc) nodeOutputs[0], (ExprNodeDesc) nodeOutputs[1]);
        if (extracted == null) { // adding for tez
            return;
        }
        if (LOG.isTraceEnabled()) {
            LOG.trace("nodeOutputs[0] : " + nodeOutputs[0] + ", nodeOutputs[1] : " +
                    nodeOutputs[1] + " => " + new ArrayList<>(Arrays.asList(extracted)));
        }
        column = extracted[0];
    }
    addSearchConditionIfPossible(expr, searchConditions, fields, isNot, column,
            Arrays.copyOfRange(nodeOutputs, 1, nodeOutputs.length));
}
/**
 * Converts an IS NULL predicate into an index search condition.
 *
 * Fix: guard against {@code extractComparePair} returning null before
 * dereferencing {@code extracted[0]} — the sibling processingInOperator
 * already has this guard (added "for tez"), but this method did not, leaving
 * a potential NullPointerException on rowKey-field predicates.
 */
private void processingNullOperator(ExprNodeGenericFuncDesc expr,
        List<IndexSearchCondition> searchConditions, Object... nodeOutputs) {
    ExprNodeDesc columnDesc = (ExprNodeDesc) nodeOutputs[0];
    String[] fields = null;
    if (columnDesc instanceof ExprNodeFieldDesc) {
        // rowKey field
        ExprNodeFieldDesc fieldDesc = (ExprNodeFieldDesc) columnDesc;
        fields = ExprNodeDescUtils.extractFields(fieldDesc);
        ExprNodeDesc[] extracted = ExprNodeDescUtils.extractComparePair(columnDesc,
                new ExprNodeConstantDesc());
        if (extracted == null) { // consistent with processingInOperator (tez)
            return;
        }
        columnDesc = extracted[0];
    }
    addSearchConditionIfPossible(expr, searchConditions, fields, false, columnDesc, null);
}
/**
 * Converts an IS NOT NULL predicate into an index search condition (the
 * {@code isNot} flag on the resulting condition is set to true).
 *
 * Fix: guard against {@code extractComparePair} returning null before
 * dereferencing {@code extracted[0]}, matching the guard processingInOperator
 * applies; previously a null result caused a NullPointerException.
 */
private void processingNotNullOperator(ExprNodeGenericFuncDesc expr,
        List<IndexSearchCondition> searchConditions, Object... nodeOutputs) {
    ExprNodeDesc columnDesc = (ExprNodeDesc) nodeOutputs[0];
    String[] fields = null;
    if (columnDesc instanceof ExprNodeFieldDesc) {
        // rowKey field
        ExprNodeFieldDesc fieldDesc = (ExprNodeFieldDesc) columnDesc;
        fields = ExprNodeDescUtils.extractFields(fieldDesc);
        ExprNodeDesc[] extracted = ExprNodeDescUtils.extractComparePair(columnDesc,
                new ExprNodeConstantDesc());
        if (extracted == null) { // consistent with processingInOperator (tez)
            return;
        }
        columnDesc = extracted[0];
    }
    addSearchConditionIfPossible(expr, searchConditions, fields, true, columnDesc, null);
}
/**
 * Analyzes a single expression node (invoked bottom-up by the graph walker).
 * Pushable comparisons are appended to searchConditions; the return value is
 * the residual expression the caller must still evaluate, or null when the
 * node was fully pushed down.
 *
 * @param expr the expression node being analyzed
 * @param searchConditions receives any index search conditions found
 * @param nodeOutputs results already produced for expr's children
 * @return residual predicate, or null if fully pushed down
 * @throws SemanticException propagated from the walker framework
 */
private ExprNodeDesc analyzeExpr(ExprNodeGenericFuncDesc expr, List<IndexSearchCondition>
searchConditions, Object... nodeOutputs) throws SemanticException {
// AND node: rebuild the conjunction from whichever children remain residual.
if (FunctionRegistry.isOpAnd(expr)) {
List<ExprNodeDesc> residuals = new ArrayList<>();
// GenericUDFOPAnd can expect more than 2 arguments after HIVE-11398
for (Object nodeOutput : nodeOutputs) {
// The null value of nodeOutput means the predicate is pushed down to Phoenix. So
// we don't need to add it to the residual predicate list
if (nodeOutput != null) {
residuals.add((ExprNodeDesc) nodeOutput);
}
}
if (residuals.size() == 0) {
//All children were pushed down
return null;
}
if (residuals.size() == 1) {
//A single child remains
return residuals.get(0);
}
return new ExprNodeGenericFuncDesc(TypeInfoFactory.booleanTypeInfo, FunctionRegistry
.getGenericUDFForAnd(), residuals);
}
GenericUDF genericUDF = expr.getGenericUDF();
// Non-comparison operators are dispatched to the specialized handlers below;
// each handler records a search condition (when possible) but the original
// expr is always kept as residual so Hive re-evaluates it.
if (!(genericUDF instanceof GenericUDFBaseCompare)) {
// 2015-10-22 Added by JeongMin Ju : Processing Between/In Operator
if (genericUDF instanceof GenericUDFBetween) {
// In case of not between, The value of first element of nodeOutputs is true.
// otherwise false.
processingBetweenOperator(expr, searchConditions, nodeOutputs);
return expr;
} else if (genericUDF instanceof GenericUDFIn) {
// In case of not in operator, in operator exist as child of not operator.
processingInOperator(expr, searchConditions, false, nodeOutputs);
return expr;
} else if (genericUDF instanceof GenericUDFOPNot &&
((ExprNodeGenericFuncDesc) expr.getChildren().get(0)).getGenericUDF()
instanceof GenericUDFIn) {
// In case of not in operator, in operator exist as child of not operator.
processingInOperator((ExprNodeGenericFuncDesc) expr.getChildren().get(0),
searchConditions, true, ((ExprNodeGenericFuncDesc) nodeOutputs[0])
.getChildren().toArray());
return expr;
} else if (genericUDF instanceof GenericUDFOPNull) {
processingNullOperator(expr, searchConditions, nodeOutputs);
return expr;
} else if (genericUDF instanceof GenericUDFOPNotNull) {
processingNotNullOperator(expr, searchConditions, nodeOutputs);
return expr;
} else {
// Unsupported operator: leave entirely as residual.
return expr;
}
}
// Binary comparison: children are expected to be a column and a constant.
ExprNodeDesc expr1 = (ExprNodeDesc) nodeOutputs[0];
ExprNodeDesc expr2 = (ExprNodeDesc) nodeOutputs[1];
// We may need to peel off the GenericUDFBridge that is added by CBO or
// user
if (expr1.getTypeInfo().equals(expr2.getTypeInfo())) {
expr1 = getColumnExpr(expr1);
expr2 = getColumnExpr(expr2);
}
ExprNodeDesc[] extracted = ExprNodeDescUtils.extractComparePair(expr1, expr2);
// extracted carries a 3rd element (the field desc) when the comparison drills
// into a struct field; reject that case unless acceptsFields is enabled.
if (extracted == null || (extracted.length > 2 && !acceptsFields)) {
return expr;
}
ExprNodeColumnDesc columnDesc;
ExprNodeConstantDesc constantDesc;
if (extracted[0] instanceof ExprNodeConstantDesc) {
// Constant on the left: flip the operator so the column comes first.
genericUDF = genericUDF.flip();
columnDesc = (ExprNodeColumnDesc) extracted[1];
constantDesc = (ExprNodeConstantDesc) extracted[0];
} else {
columnDesc = (ExprNodeColumnDesc) extracted[0];
constantDesc = (ExprNodeConstantDesc) extracted[1];
}
// Push down only when this column allows this comparison operator.
Set<String> allowed = columnToUDFs.get(columnDesc.getColumn());
if (allowed == null) {
return expr;
}
String udfName = genericUDF.getUdfName();
if (!allowed.contains(genericUDF.getUdfName())) {
return expr;
}
String[] fields = null;
if (extracted.length > 2) {
ExprNodeFieldDesc fieldDesc = (ExprNodeFieldDesc) extracted[2];
if (!isValidField(fieldDesc)) {
return expr;
}
fields = ExprNodeDescUtils.extractFields(fieldDesc);
}
// We also need to update the expr so that the index query can be
// generated.
// Note that, hive does not support UDFToDouble etc in the query text.
List<ExprNodeDesc> list = new ArrayList<ExprNodeDesc>();
list.add(expr1);
list.add(expr2);
expr = new ExprNodeGenericFuncDesc(expr.getTypeInfo(), expr.getGenericUDF(), list);
searchConditions.add(new IndexSearchCondition(columnDesc, udfName, constantDesc, expr,
fields));
// we converted the expression to a search condition, so
// remove it from the residual predicate
// NOTE(review): struct-field comparisons are kept as residual so Hive
// re-checks them; plain column comparisons are fully pushed down (null).
return fields == null ? null : expr;
}
/** A field reference is valid when no validator is set, or the validator accepts it. */
private boolean isValidField(ExprNodeFieldDesc field) {
    if (fieldValidator == null) {
        return true;
    }
    return fieldValidator.validate(field);
}
/**
 * Translates search conditions back to ExprNodeDesc form, folding them into a
 * left-deep AND conjunction.
 *
 * @param searchConditions conditions (typically produced by analyzePredicate)
 * @return the combined expression, or null when the list is empty
 */
public ExprNodeGenericFuncDesc translateSearchConditions(
        List<IndexSearchCondition> searchConditions) {
    ExprNodeGenericFuncDesc result = null;
    for (IndexSearchCondition condition : searchConditions) {
        ExprNodeGenericFuncDesc comparison = condition.getComparisonExpr();
        if (result == null) {
            result = comparison;
        } else {
            List<ExprNodeDesc> operands = new ArrayList<>();
            operands.add(result);
            operands.add(comparison);
            result = new ExprNodeGenericFuncDesc(TypeInfoFactory.booleanTypeInfo,
                    FunctionRegistry.getGenericUDFForAnd(), operands);
        }
    }
    return result;
}
/** Controls whether comparisons against struct fields may be pushed down. */
public void setAcceptsFields(boolean acceptsFields) {
    this.acceptsFields = acceptsFields;
}
public static interface FieldValidator {
boolean validate(ExprNodeFieldDesc exprNodeDesc);
}
/**
 * Builds an analyzer pre-registered with the comparison operators that can be
 * pushed down to Phoenix.
 *
 * Fix: the original registered GenericUDFIn twice; the duplicate call is
 * removed (it was harmless only because the operator names live in a Set).
 *
 * @param equalOnly when true, register only the equality operator
 * @return the configured analyzer
 */
public static IndexPredicateAnalyzer createAnalyzer(boolean equalOnly) {
    IndexPredicateAnalyzer analyzer = new IndexPredicateAnalyzer();
    analyzer.addComparisonOp("org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual");
    if (equalOnly) {
        return analyzer;
    }
    analyzer.addComparisonOp("org.apache.hadoop.hive.ql.udf.generic" +
            ".GenericUDFOPEqualOrGreaterThan");
    analyzer.addComparisonOp("org.apache.hadoop.hive.ql.udf.generic" +
            ".GenericUDFOPEqualOrLessThan");
    analyzer.addComparisonOp("org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPLessThan");
    analyzer.addComparisonOp("org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPGreaterThan");
    // !=
    analyzer.addComparisonOp("org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNotEqual");
    // (NOT) BETWEEN
    analyzer.addComparisonOp("org.apache.hadoop.hive.ql.udf.generic.GenericUDFBetween");
    // (NOT) IN
    analyzer.addComparisonOp("org.apache.hadoop.hive.ql.udf.generic.GenericUDFIn");
    // IS NULL
    analyzer.addComparisonOp("org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNull");
    // IS NOT NULL
    analyzer.addComparisonOp("org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNotNull");
    return analyzer;
}
}