| /** |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| package org.apache.lens.cube.parse; |
| |
| import static org.apache.lens.cube.error.LensCubeErrorCode.SYNTAX_ERROR; |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.List; |
| |
| import org.apache.lens.server.api.error.LensException; |
| import org.apache.lens.server.api.metrics.MethodMetricsContext; |
| import org.apache.lens.server.api.metrics.MethodMetricsFactory; |
| |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.hadoop.hive.conf.HiveConf; |
| import org.apache.hadoop.hive.ql.Context; |
| import org.apache.hadoop.hive.ql.parse.*; |
| |
| import lombok.extern.slf4j.Slf4j; |
| |
| /** |
| * Rewrites given cube query into simple storage table HQL. |
| */ |
| @Slf4j |
| public class CubeQueryRewriter { |
| private final Configuration conf; |
  private final List<ContextRewriter> rewriters = new ArrayList<>();
| private final HiveConf hconf; |
| private Context qlCtx = null; |
| private boolean lightFactFirst; |
| |
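  /**
   * @param conf  query configuration; drives rewrite options such as
   *              {@link CubeQueryConfUtil#LIGHTEST_FACT_FIRST}
   * @param hconf Hive configuration, used for semantic analysis of the query
   */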
| public CubeQueryRewriter(Configuration conf, HiveConf hconf) { |
| this.conf = conf; |
| this.hconf = hconf; |
| try { |
| qlCtx = new Context(conf); |
| } catch (IOException e) { |
      // IOException while creating the Context is ignorable; qlCtx simply remains null
| } |
| lightFactFirst = |
| conf.getBoolean(CubeQueryConfUtil.LIGHTEST_FACT_FIRST, CubeQueryConfUtil.DEFAULT_LIGHTEST_FACT_FIRST); |
| setupRewriters(); |
| } |
| |
| /* |
| * Here is the rewriter flow. |
| * |
| * Expression resolver: replaces the queried expression columns with their |
| * expression ASTs in the query AST. |
| * |
| * ColumnResolver : ColumnResolver finds all the columns in the query AST |
| * |
| * AliasReplacer: - Finds queried column to table alias. - Finds queried dim |
| * attributes and queried measures. - Does queried field validation wrt |
| * derived cubes, if all fields of queried cube cannot be queried together. - |
| * Replaces all the columns in all expressions with tablealias.column |
| * |
| * DenormalizationResolver: Phase 1: Finds all the queried column references |
| * if any. |
| * |
| * CandidateTableResolver: Phase 1: - Prune candidate fact tables if queried |
| * dim attributes are not present. - Also Figures out if queried column is not |
| * part of candidate table, but it is a denormalized field which can reached |
| * through a reference - Finds all the candidate fact sets containing queried |
| * measures. Prunes facts which do not contain any of the queried measures. |
| * |
| * JoinResolver : - Finds all the join chains for between tables queried. |
| * |
| * TimerangeResolver : - Finds all timeranges in the query and does validation |
| * wrt the queried field's life and the range queried |
| * |
| * CandidateTableResolver: Phase 2: - Prunes candidates tables if required |
| * join columns are not part of candidate tables - Required source |
| * columns(join columns) for reaching a denormalized field, are not part of |
| * candidate tables - Required denormalized fields are not part of refered |
| * tables, there by all the candidates which are using denormalized fields. |
| * |
| * AggregateResolver : - If non default aggregate or no aggregate queried for |
| * a measure, it prunes all aggregate facts from candidates. - Replaces |
| * measures with default aggregates. if enabled |
| * |
| * GroupbyResolver : - Promotes select to groupby and groupby to select, if |
| * enabled |
| * |
| * StorageTableResolver : - Resolves storages and partitions for all candidate |
| * tables. Prunes candidates if not storages are available. |
| * |
| * Whenever a candidate fact is pruned (because of no storages, no default |
| * aggregate and etc), the sets containing the fact are also pruned. |
| * |
| * LeastPartitionResolver and LightestFactResolver work on candidate fact sets |
| * and considers sets with least number of partitions required and lightest |
| * facts respectively. |
| * |
| * If LightestFact first flag is enabled, LightestFactResolver is applied |
| * before StorageTableResolver. |
| * |
| * MaxCoveringFactResolver runs just after all candidate facts' partitions |
| * are resolved. It then sees how much time range each fact set is able to cover |
| * and finds the maximum coverable range. It then prunes all fact sets that |
| * are covering less than that range. If fail on partial is true, then by the |
| * time this resolver runs, all the candidate fact sets cover full range. |
| * So there this resolver is a no-op. Same thing when fail on partial is false |
| * and no fact set has any data. This is most useful when facts actually have |
| * partial data. There it'll ensure the facts that are covering the maximum |
| * time range will be picked. |
| * |
| * |
| * Once all rewriters are done, finally picks up one of the available |
| * candidate sets to answer the query, after all the resolvers are done. Once |
| * the final candidate fact set is picked, if number of elements in the fact |
| * set is one, the query written as with cube query ASTs. If the number of |
| * fact sets is more, then query rewritten with MultifactHQLContext which |
| * writes a join query with each fact query. A fact query contains only the |
| * fields queried from that fact. The ASTs corresponding to the fact are AST |
| * copied from original query and the expressions missing from this fact |
| * removed. |
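   *
   * As an illustration only (hypothetical schema: assume msr1 is stored only
   * in fact1, msr2 only in fact2, and both facts carry dim1), a query such as
   *
   *   select dim1, sum(msr1), sum(msr2) from sample_cube where <time range>
   *
   * would be rewritten as a join of per-fact queries, roughly:
   *
   *   SELECT f1.dim1, f1.msr1, f2.msr2
   *   FROM (SELECT dim1, sum(msr1) msr1 FROM fact1_storage WHERE ... GROUP BY dim1) f1
   *   JOIN (SELECT dim1, sum(msr2) msr2 FROM fact2_storage WHERE ... GROUP BY dim1) f2
   *   ON f1.dim1 = f2.dim1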
| */ |
| private void setupRewriters() { |
| // Resolve columns - the column alias and table alias |
| rewriters.add(new ColumnResolver()); |
| // Rewrite base trees (groupby, having, orderby, limit) using aliases |
| rewriters.add(new AliasReplacer()); |
| ExpressionResolver exprResolver = new ExpressionResolver(); |
| DenormalizationResolver denormResolver = new DenormalizationResolver(); |
| CandidateTableResolver candidateTblResolver = new CandidateTableResolver(); |
| StorageTableResolver storageTableResolver = new StorageTableResolver(conf); |
| |
| // Phase 1 of exprResolver: Resolve expressions |
| rewriters.add(exprResolver); |
    // Phase 1 of denormResolver: resolve de-normalized columns
| rewriters.add(denormResolver); |
| // Resolve time ranges |
| rewriters.add(new TimerangeResolver()); |
| // Phase 1 of candidateTblResolver: Resolve candidate storages and dimension tables for columns queried |
| rewriters.add(candidateTblResolver); |
| // Resolve aggregations and generate base select tree |
| rewriters.add(new AggregateResolver()); |
| rewriters.add(new GroupbyResolver(conf)); |
    // Validate queryability of the fields (relevant when derived cubes are set up)
| rewriters.add(new FieldValidator()); |
| // Resolve joins and generate base join tree |
| rewriters.add(new JoinResolver()); |
| // Do col life validation for the time range(s) queried |
| rewriters.add(new ColumnLifetimeChecker()); |
| // Phase 1 of storageTableResolver: Validate and prune candidate storages |
| rewriters.add(storageTableResolver); |
| // Phase 2 of candidateTblResolver: Resolve candidate storages and dimension tables for columns included |
| // in join and denorm resolvers |
| rewriters.add(candidateTblResolver); |
| // Find Union and Join combinations over Storage Candidates that can answer the queried time range(s) and all |
| // queried measures |
| rewriters.add(new CandidateCoveringSetsResolver()); |
| |
    // If the lightest fact first option is enabled for this driver (via lens.cube.query.pick.lightest.fact.first = true),
    // run LightestFactResolver and keep only the lightest combination(s) generated by CandidateCoveringSetsResolver
| if (lightFactFirst) { |
| // Prune candidate tables for which denorm column references do not exist |
| rewriters.add(denormResolver); |
      // Phase 2 of exprResolver: prune candidate facts without any valid expressions
| rewriters.add(exprResolver); |
| // Pick the least cost combination(s) (and prune others) out of a set of combinations produced |
| // by CandidateCoveringSetsResolver |
| rewriters.add(new LightestFactResolver()); |
| } |
| |
| // Phase 2 of storageTableResolver: resolve storage table partitions. |
| rewriters.add(storageTableResolver); |
    // In case partial data is allowed (via lens.cube.query.fail.if.data.partial = false) and there are many
    // combinations with partial data, pick the one that covers the maximum part of the time range(s) queried
| rewriters.add(new MaxCoveringFactResolver(conf)); |
| // Phase 3 of storageTableResolver: resolve dimension tables and partitions. |
| rewriters.add(storageTableResolver); |
| // Prune candidate tables for which denorm column references do not exist |
| //TODO union: phase 2 of denormResolver needs to be moved before CoveringSetResolver.. check if this makes sense |
| rewriters.add(denormResolver); |
| // Phase 2 of exprResolver : Prune candidate facts without any valid expressions |
| rewriters.add(exprResolver); |
| |
| if (!lightFactFirst) { |
| // Pick the least cost combination(s) (and prune others) out of a set of combinations produced |
| // by CandidateCoveringSetsResolver |
| rewriters.add(new LightestFactResolver()); |
| } |
    // If two combinations have the same least weight/cost, the one requiring the fewest time partitions is
    // picked; the rest of the combinations are pruned
| rewriters.add(new LeastPartitionResolver()); |
| rewriters.add(new LightestDimensionResolver()); |
| } |
| |
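  /**
   * Rewrites an already parsed query AST: the AST is semantically analyzed with
   * {@link CubeSemanticAnalyzer} and the resulting context is passed through all
   * the configured rewriters.
   *
   * @param astnode the parsed query AST
   * @return the rewritten query context
   * @throws LensException if analysis or any rewriter fails
   */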
| public CubeQueryContext rewrite(ASTNode astnode) throws LensException { |
| CubeSemanticAnalyzer analyzer; |
| try { |
| analyzer = new CubeSemanticAnalyzer(conf, hconf); |
| analyzer.analyze(astnode, qlCtx); |
| } catch (SemanticException e) { |
| throw new LensException(SYNTAX_ERROR.getLensErrorInfo(), e, e.getMessage()); |
| } |
| CubeQueryContext ctx = new CubeQueryContext(astnode, analyzer.getCubeQB(), conf, hconf); |
| rewrite(rewriters, ctx); |
| return ctx; |
| } |
| |
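  /**
   * Parses the given cube query string and rewrites it through the rewriter chain.
   * Line breaks in the command are normalized before parsing.
   *
   * <p>A minimal usage sketch (the cube, column and time-range names below are
   * hypothetical; the generated HQL depends on the metastore schema):
   *
   * <pre>{@code
   * CubeQueryRewriter rewriter = new CubeQueryRewriter(conf, hconf);
   * CubeQueryContext ctx = rewriter.rewrite(
   *     "select sum(msr1) from sample_cube where time_range_in(dt, '2024-01-01', '2024-01-02')");
   * String hql = ctx.toHQL(); // final storage-table HQL
   * rewriter.clear();
   * }</pre>
   *
   * @param command the cube query string
   * @return the rewritten query context
   * @throws LensException if the command cannot be parsed or rewritten
   */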
  public CubeQueryContext rewrite(String command) throws LensException {
    if (command != null) {
      // Replace line breaks with spaces so that tokens on adjacent lines are not concatenated
      command = command.replace("\n", " ");
    }
| ASTNode tree; |
| try { |
| ParseDriver pd = new ParseDriver(); |
| tree = pd.parse(command, qlCtx, false); |
| tree = ParseUtils.findRootNonNullToken(tree); |
| } catch (ParseException e) { |
| throw new LensException(SYNTAX_ERROR.getLensErrorInfo(), e, e.getMessage()); |
| } |
| return rewrite(tree); |
| } |
| |
| private static final String ITER_STR = "-ITER-"; |
| |
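  /**
   * Runs each rewriter on the context in order, publishing a method gauge around
   * every invocation.
   */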
| private void rewrite(List<ContextRewriter> rewriters, CubeQueryContext ctx) throws LensException { |
| int i = 0; |
| for (ContextRewriter rewriter : rewriters) { |
| /* |
| * Adding iteration number as part of gauge name since some rewriters are have more than one phase, and having |
| * iter number gives the idea which iteration the rewriter was run |
| */ |
| MethodMetricsContext mgauge = MethodMetricsFactory.createMethodGauge(ctx.getConf(), true, |
| rewriter.getClass().getCanonicalName() + ITER_STR + i); |
| |
| rewriter.rewriteContext(ctx); |
| mgauge.markSuccess(); |
| i++; |
| } |
| } |
| |
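  /**
   * @return the Hive query {@link Context} used for parsing and analysis
   */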
| public Context getQLContext() { |
| return qlCtx; |
| } |
| |
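  /**
   * Clears the underlying Hive query context and releases its resources; cleanup
   * failures are logged and ignored.
   */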
| public void clear() { |
| try { |
| if (qlCtx != null) { |
| qlCtx.clear(); |
| } |
| } catch (IOException e) { |
| log.info("Ignoring exception in clearing qlCtx:", e); |
| // ignoring exception in clear |
| } |
| } |
| } |