| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| /*------------------------------------------------------------------------- |
| * |
| * costsize.c |
| * Routines to compute (and set) relation sizes and path costs |
| * |
| * Path costs are measured in arbitrary units established by these basic |
| * parameters: |
| * |
| * seq_page_cost Cost of a sequential page fetch |
| * random_page_cost Cost of a non-sequential page fetch |
| * cpu_tuple_cost Cost of typical CPU time to process a tuple |
| * cpu_index_tuple_cost Cost of typical CPU time to process an index tuple |
| * cpu_operator_cost Cost of CPU time to execute an operator or function |
| * |
| * We expect that the kernel will typically do some amount of read-ahead |
| * optimization; this in conjunction with seek costs means that seq_page_cost |
| * is normally considerably less than random_page_cost. (However, if the |
| * database is fully cached in RAM, it is reasonable to set them equal.) |
| * |
| * We also use a rough estimate "effective_cache_size" of the number of |
| * disk pages in Postgres + OS-level disk cache. (We can't simply use |
| * NBuffers for this purpose because that would ignore the effects of |
| * the kernel's disk cache.) |
| * |
| * Obviously, taking constants for these values is an oversimplification, |
| * but it's tough enough to get any useful estimates even at this level of |
| * detail. Note that all of these parameters are user-settable, in case |
| * the default values are drastically off for a particular platform. |
| * |
| * We compute two separate costs for each path: |
| * total_cost: total estimated cost to fetch all tuples |
| * startup_cost: cost that is expended before first tuple is fetched |
| * In some scenarios, such as when there is a LIMIT or we are implementing |
| * an EXISTS(...) sub-select, it is not necessary to fetch all tuples of the |
| * path's result. A caller can estimate the cost of fetching a partial |
| * result by interpolating between startup_cost and total_cost. In detail: |
| * actual_cost = startup_cost + |
| * (total_cost - startup_cost) * tuples_to_fetch / path->parent->rows; |
| * Note that a base relation's rows count (and, by extension, plan_rows for |
 * plan nodes below the LIMIT node) is set without regard to any LIMIT, so
| * that this equation works properly. (Also, these routines guarantee not to |
| * set the rows count to zero, so there will be no zero divide.) The LIMIT is |
| * applied as a top-level plan node. |
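 *
 * For example, fetching the first 10 tuples of a path with startup_cost = 5,
 * total_cost = 105, and parent->rows = 1000 is estimated to cost
 *		5 + (105 - 5) * 10/1000 = 6.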
| * |
| * For largely historical reasons, most of the routines in this module use |
| * the passed result Path only to store their startup_cost and total_cost |
| * results into. All the input data they need is passed as separate |
| * parameters, even though much of it could be extracted from the Path. |
| * An exception is made for the cost_XXXjoin() routines, which expect all |
| * the non-cost fields of the passed XXXPath to be filled in. |
| * |
| * |
| * Portions Copyright (c) 2005-2008, Greenplum inc |
| * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * IDENTIFICATION |
| * $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.169.2.2 2007/01/08 16:09:31 tgl Exp $ |
| * |
| *------------------------------------------------------------------------- |
| */ |
| |
| #include "postgres.h" |
| |
| #include <math.h> |
| |
| #include "executor/hashjoin.h" |
| #include "miscadmin.h" |
| #include "optimizer/clauses.h" |
| #include "optimizer/cost.h" |
| #include "optimizer/pathnode.h" |
| #include "parser/parsetree.h" |
| #include "utils/lsyscache.h" |
| #include "utils/selfuncs.h" |
| #include "utils/tuplesort.h" |
| |
| #include "cdb/cdbpath.h" /* cdbpath_rows() */ |
| |
/* 0.693147180559945 is ln(2), so LOG2(x) computes the base-2 logarithm */
#define LOG2(x) (log(x) / 0.693147180559945)
| |
| /* |
 * Some Paths return fewer than the nominal number of rows of their parent
 * relations; join costing must use the adjusted count to get the correct
 * input row estimates:
| */ |
| #define PATH_ROWS(root, path) (cdbpath_rows((root), (path))) |
| |
| |
| double seq_page_cost = DEFAULT_SEQ_PAGE_COST; |
| double random_page_cost = DEFAULT_RANDOM_PAGE_COST; |
| double cpu_tuple_cost = DEFAULT_CPU_TUPLE_COST; |
| double cpu_index_tuple_cost = DEFAULT_CPU_INDEX_TUPLE_COST; |
| double cpu_operator_cost = DEFAULT_CPU_OPERATOR_COST; |
| |
| int effective_cache_size = DEFAULT_EFFECTIVE_CACHE_SIZE; |
| |
| Cost disable_cost = 1.0e9; |
| |
| /* CDB: The enable_xxx globals have been moved to allpaths.c */ |
| |
| typedef struct |
| { |
| PlannerInfo *root; |
| QualCost total; |
| } cost_qual_eval_context; |
| |
| static bool cost_qual_eval_walker(Node *node, cost_qual_eval_context *context); |
| static Selectivity approx_selectivity(PlannerInfo *root, List *quals, |
| JoinType jointype); |
| static Selectivity join_in_selectivity(JoinPath *path, PlannerInfo *root); |
| static double relation_byte_size(double tuples, int width); |
| static double page_size(double tuples, int width); |
| static Selectivity adjust_selectivity_for_nulltest(Selectivity selec, |
| Selectivity pselec, |
| List *pushed_quals, |
| JoinType jointype, |
| RelOptInfo *left, |
| RelOptInfo *right); |
| |
| static void hash_table_size(double ntuples, int tupwidth, double memory_bytes, |
| int *numbuckets, |
| int *numbatches); |
| |
| /* CDB: The clamp_row_est() function definition has been moved to cost.h */ |
| |
| /* |
| * cost_seqscan |
| * Determines and returns the cost of scanning a relation sequentially. |
| */ |
| void |
| cost_seqscan(Path *path, PlannerInfo *root, |
| RelOptInfo *baserel) |
| { |
| Cost startup_cost = 0; |
| Cost run_cost = 0; |
| Cost cpu_per_tuple; |
| |
| /* Should only be applied to base relations */ |
| Assert(baserel->relid > 0); |
| Assert(baserel->rtekind == RTE_RELATION); |
| |
| /* |
| * disk costs |
| */ |
| run_cost += seq_page_cost * baserel->pages; |
| |
| /* CPU costs */ |
| startup_cost += baserel->baserestrictcost.startup; |
| cpu_per_tuple = cpu_tuple_cost + baserel->baserestrictcost.per_tuple; |
| run_cost += cpu_per_tuple * baserel->tuples; |
| |
| path->startup_cost = startup_cost; |
| path->total_cost = startup_cost + run_cost; |
| } |
| |
| /* |
| * cost_appendonlyscan |
| * Determines and returns the cost of scanning a relation sequentially. |
| */ |
| void |
| cost_appendonlyscan(AppendOnlyPath *path, PlannerInfo *root, |
| RelOptInfo *baserel) |
| { |
| Cost startup_cost = 0; |
| Cost run_cost = 0; |
| Cost cpu_per_tuple; |
| |
| /* Should only be applied to base relations */ |
| Assert(baserel->relid > 0); |
| Assert(baserel->rtekind == RTE_RELATION); |
| |
| /* |
| * disk costs |
| */ |
| run_cost += seq_page_cost * baserel->pages; |
| |
| /* CPU costs */ |
| startup_cost += baserel->baserestrictcost.startup; |
| cpu_per_tuple = cpu_tuple_cost + baserel->baserestrictcost.per_tuple; |
| run_cost += cpu_per_tuple * baserel->tuples; |
| |
| path->path.startup_cost = startup_cost; |
| path->path.total_cost = startup_cost + run_cost; |
| } |
| |
| /* |
| * cost_parquetscan |
| * Determines and returns the cost of scanning a relation sequentially. |
| */ |
| void |
| cost_parquetscan(ParquetPath *path, PlannerInfo *root, |
| RelOptInfo *baserel) |
| { |
| Cost startup_cost = 0; |
| Cost run_cost = 0; |
| Cost cpu_per_tuple; |
| |
| /* Should only be applied to base relations */ |
| Assert(baserel->relid > 0); |
| Assert(baserel->rtekind == RTE_RELATION); |
| |
| /* |
| * disk costs |
| */ |
| run_cost += seq_page_cost * baserel->pages; |
| |
| /* CPU costs */ |
| startup_cost += baserel->baserestrictcost.startup; |
| cpu_per_tuple = cpu_tuple_cost + baserel->baserestrictcost.per_tuple; |
| run_cost += cpu_per_tuple * baserel->tuples; |
| |
| path->path.startup_cost = startup_cost; |
| path->path.total_cost = startup_cost + run_cost; |
| } |
| |
| /* |
| * cost_externalscan |
| * Determines and returns the cost of scanning an external relation. |
| * |
 * Right now this estimate is not very meaningful, but we'll probably
 * want to produce better estimates in the future.
| */ |
| void |
| cost_externalscan(ExternalPath *path, PlannerInfo *root, |
| RelOptInfo *baserel) |
| { |
| Cost startup_cost = 0; |
| Cost run_cost = 0; |
| Cost cpu_per_tuple; |
| |
| /* Should only be applied to external relations */ |
| Assert(baserel->relid > 0); |
| Assert(baserel->rtekind == RTE_RELATION); |
| |
| /* |
| * disk costs |
| */ |
| run_cost += seq_page_cost * baserel->pages; |
| |
| /* CPU costs */ |
| startup_cost += baserel->baserestrictcost.startup; |
| cpu_per_tuple = cpu_tuple_cost + baserel->baserestrictcost.per_tuple; |
| run_cost += cpu_per_tuple * baserel->tuples; |
| |
| path->path.startup_cost = startup_cost; |
| path->path.total_cost = startup_cost + run_cost; |
| } |
| |
| /* |
| * cost_index |
| * Determines and returns the cost of scanning a relation using an index. |
| * |
| * 'index' is the index to be used |
| * 'indexQuals' is the list of applicable qual clauses (implicit AND semantics) |
| * 'outer_rel' is the outer relation when we are considering using the index |
| * scan as the inside of a nestloop join (hence, some of the indexQuals |
| * are join clauses, and we should expect repeated scans of the index); |
| * NULL for a plain index scan |
| * |
| * cost_index() takes an IndexPath not just a Path, because it sets a few |
| * additional fields of the IndexPath besides startup_cost and total_cost. |
| * These fields are needed if the IndexPath is used in a BitmapIndexScan. |
| * |
| * NOTE: 'indexQuals' must contain only clauses usable as index restrictions. |
| * Any additional quals evaluated as qpquals may reduce the number of returned |
| * tuples, but they won't reduce the number of tuples we have to fetch from |
| * the table, so they don't reduce the scan cost. |
| * |
| * NOTE: as of 8.0, indexQuals is a list of RestrictInfo nodes, where formerly |
| * it was a list of bare clause expressions. |
| */ |
| void |
| cost_index(IndexPath *path, PlannerInfo *root, |
| IndexOptInfo *index, |
| List *indexQuals, |
| RelOptInfo *outer_rel) |
| { |
| RelOptInfo *baserel = index->rel; |
| Cost startup_cost = 0; |
| Cost run_cost = 0; |
| Cost indexStartupCost; |
| Cost indexTotalCost; |
| Selectivity indexSelectivity; |
| double indexCorrelation, |
| csquared; |
| Cost min_IO_cost, |
| max_IO_cost; |
| Cost cpu_per_tuple; |
| double tuples_fetched; |
| double pages_fetched; |
| |
| /* Should only be applied to base relations */ |
| Assert(IsA(baserel, RelOptInfo) && |
| IsA(index, IndexOptInfo)); |
| Assert(baserel->relid > 0); |
| Assert(baserel->rtekind == RTE_RELATION); |
| |
| /* |
| * Call index-access-method-specific code to estimate the processing cost |
| * for scanning the index, as well as the selectivity of the index (ie, |
| * the fraction of main-table tuples we will have to retrieve) and its |
| * correlation to the main-table tuple order. |
| */ |
| index->num_leading_eq = 0; |
| OidFunctionCall8(index->amcostestimate, |
| PointerGetDatum(root), |
| PointerGetDatum(index), |
| PointerGetDatum(indexQuals), |
| PointerGetDatum(outer_rel), |
| PointerGetDatum(&indexStartupCost), |
| PointerGetDatum(&indexTotalCost), |
| PointerGetDatum(&indexSelectivity), |
| PointerGetDatum(&indexCorrelation)); |
| |
| /* |
| * CDB: Note whether all of the key columns are matched by equality |
| * predicates. |
| * |
| * The index->num_leading_eq field is kludgily used as an implicit result |
| * parameter from the amcostestimate proc, to avoid changing its externally |
| * exposed interface. Transfer to IndexPath, then zap to discourage misuse. |
| */ |
| path->num_leading_eq = index->num_leading_eq; |
| index->num_leading_eq = 0; |
| |
| /* |
	 * Clamp index correlation to 99% or less, so that we always account for
	 * at least a little random_page_cost in our calculation.  Otherwise,
	 * perfectly correlated indexes look too cheap.
| */ |
| if (indexCorrelation >= 0.99) |
| indexCorrelation = 0.99; |
| else if (indexCorrelation <= -0.99) |
| indexCorrelation = -0.99; |
| |
| /* |
| * Save amcostestimate's results for possible use in bitmap scan planning. |
| * We don't bother to save indexStartupCost or indexCorrelation, because a |
| * bitmap scan doesn't care about either. |
| */ |
| path->indextotalcost = indexTotalCost; |
| path->indexselectivity = indexSelectivity; |
| |
| /* all costs for touching index itself included here */ |
| startup_cost += indexStartupCost; |
| run_cost += indexTotalCost - indexStartupCost; |
| |
| /* estimate number of main-table tuples fetched */ |
| tuples_fetched = clamp_row_est(indexSelectivity * baserel->tuples); |
| |
| /*---------- |
| * Estimate number of main-table pages fetched, and compute I/O cost. |
| * |
| * When the index ordering is uncorrelated with the table ordering, |
| * we use an approximation proposed by Mackert and Lohman (see |
| * index_pages_fetched() for details) to compute the number of pages |
| * fetched, and then charge random_page_cost per page fetched. |
| * |
| * When the index ordering is exactly correlated with the table ordering |
| * (just after a CLUSTER, for example), the number of pages fetched should |
| * be exactly selectivity * table_size. What's more, all but the first |
| * will be sequential fetches, not the random fetches that occur in the |
| * uncorrelated case. So if the number of pages is more than 1, we |
| * ought to charge |
| * random_page_cost + (pages_fetched - 1) * seq_page_cost |
| * For partially-correlated indexes, we ought to charge somewhere between |
| * these two estimates. We currently interpolate linearly between the |
| * estimates based on the correlation squared (XXX is that appropriate?). |
| *---------- |
| */ |
| if (outer_rel != NULL && outer_rel->rows > 1) |
| { |
| /* |
| * For repeated indexscans, the appropriate estimate for the |
| * uncorrelated case is to scale up the number of tuples fetched in |
| * the Mackert and Lohman formula by the number of scans, so that we |
| * estimate the number of pages fetched by all the scans; then |
| * pro-rate the costs for one scan. In this case we assume all the |
| * fetches are random accesses. |
| */ |
| double num_scans = outer_rel->rows; |
| |
| pages_fetched = index_pages_fetched(tuples_fetched * num_scans, |
| baserel->pages, |
| (double) index->pages, |
| root); |
| |
| max_IO_cost = (pages_fetched * random_page_cost) / num_scans; |
| |
| /* |
| * In the perfectly correlated case, the number of pages touched |
| * by each scan is selectivity * table_size, and we can use the |
| * Mackert and Lohman formula at the page level to estimate how |
| * much work is saved by caching across scans. We still assume |
| * all the fetches are random, though, which is an overestimate |
| * that's hard to correct for without double-counting the cache |
| * effects. (But in most cases where such a plan is actually |
| * interesting, only one page would get fetched per scan anyway, |
| * so it shouldn't matter much.) |
| */ |
| pages_fetched = ceil(indexSelectivity * (double) baserel->pages); |
| |
| pages_fetched = index_pages_fetched(pages_fetched * num_scans, |
| baserel->pages, |
| (double) index->pages, |
| root); |
| |
| min_IO_cost = (pages_fetched * random_page_cost) / num_scans; |
| } |
| else |
| { |
| /* |
| * Normal case: apply the Mackert and Lohman formula, and then |
| * interpolate between that and the correlation-derived result. |
| */ |
| pages_fetched = index_pages_fetched(tuples_fetched, |
| baserel->pages, |
| (double) index->pages, |
| root); |
| |
| /* max_IO_cost is for the perfectly uncorrelated case (csquared=0) */ |
| max_IO_cost = pages_fetched * random_page_cost; |
| |
| /* min_IO_cost is for the perfectly correlated case (csquared=1) */ |
| pages_fetched = ceil(indexSelectivity * (double) baserel->pages); |
| min_IO_cost = random_page_cost; |
| if (pages_fetched > 1) |
| min_IO_cost += (pages_fetched - 1) * seq_page_cost; |
| } |
| |
| /* |
| * Now interpolate based on estimated index order correlation to get |
| * total disk I/O cost for main table accesses. |
| */ |
| csquared = indexCorrelation * indexCorrelation; |
| |
| run_cost += max_IO_cost + csquared * (min_IO_cost - max_IO_cost); |
| |
| /* |
| * Estimate CPU costs per tuple. |
| * |
| * Normally the indexquals will be removed from the list of restriction |
| * clauses that we have to evaluate as qpquals, so we should subtract |
| * their costs from baserestrictcost. But if we are doing a join then |
| * some of the indexquals are join clauses and shouldn't be subtracted. |
| * Rather than work out exactly how much to subtract, we don't subtract |
| * anything. |
| */ |
| startup_cost += baserel->baserestrictcost.startup; |
| cpu_per_tuple = cpu_tuple_cost + baserel->baserestrictcost.per_tuple; |
| |
| if (outer_rel == NULL) |
| { |
| QualCost index_qual_cost; |
| |
| cost_qual_eval(&index_qual_cost, indexQuals, root); |
| /* any startup cost still has to be paid ... */ |
| cpu_per_tuple -= index_qual_cost.per_tuple; |
| } |
| |
| run_cost += cpu_per_tuple * tuples_fetched; |
| |
| path->path.startup_cost = startup_cost; |
| path->path.total_cost = startup_cost + run_cost; |
| } |
| |
| /* |
| * index_pages_fetched |
| * Estimate the number of pages actually fetched after accounting for |
| * cache effects. |
| * |
| * We use an approximation proposed by Mackert and Lohman, "Index Scans |
| * Using a Finite LRU Buffer: A Validated I/O Model", ACM Transactions |
| * on Database Systems, Vol. 14, No. 3, September 1989, Pages 401-424. |
| * The Mackert and Lohman approximation is that the number of pages |
| * fetched is |
| * PF = |
| * min(2TNs/(2T+Ns), T) when T <= b |
| * 2TNs/(2T+Ns) when T > b and Ns <= 2Tb/(2T-b) |
| * b + (Ns - 2Tb/(2T-b))*(T-b)/T when T > b and Ns > 2Tb/(2T-b) |
| * where |
| * T = # pages in table |
| * N = # tuples in table |
| * s = selectivity = fraction of table to be scanned |
| * b = # buffer pages available (we include kernel space here) |
| * |
| * We assume that effective_cache_size is the total number of buffer pages |
| * available for the whole query, and pro-rate that space across all the |
| * tables in the query and the index currently under consideration. (This |
| * ignores space needed for other indexes used by the query, but since we |
| * don't know which indexes will get used, we can't estimate that very well; |
| * and in any case counting all the tables may well be an overestimate, since |
| * depending on the join plan not all the tables may be scanned concurrently.) |
| * |
| * The product Ns is the number of tuples fetched; we pass in that |
| * product rather than calculating it here. "pages" is the number of pages |
| * in the object under consideration (either an index or a table). |
| * "index_pages" is the amount to add to the total table space, which was |
| * computed for us by query_planner. |
| * |
| * Caller is expected to have ensured that tuples_fetched is greater than zero |
| * and rounded to integer (see clamp_row_est). The result will likewise be |
| * greater than zero and integral. |
| */ |
| double |
| index_pages_fetched(double tuples_fetched, BlockNumber pages, |
| double index_pages, PlannerInfo *root) |
| { |
| double pages_fetched; |
| double total_pages; |
| double T, |
| b; |
| |
| /* T is # pages in table, but don't allow it to be zero */ |
| T = (pages > 1) ? (double) pages : 1.0; |
| |
| /* Compute number of pages assumed to be competing for cache space */ |
| total_pages = root->total_table_pages + index_pages; |
| total_pages = Max(total_pages, 1.0); |
| Assert(T <= total_pages); |
| |
| /* b is pro-rated share of effective_cache_size */ |
	b = (double) effective_cache_size * T / total_pages;
| |
| /* force it positive and integral */ |
| if (b <= 1.0) |
| b = 1.0; |
| else |
| b = ceil(b); |
| |
| /* This part is the Mackert and Lohman formula */ |
| if (T <= b) |
| { |
| pages_fetched = |
| (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched); |
| if (pages_fetched >= T) |
| pages_fetched = T; |
| else |
| pages_fetched = ceil(pages_fetched); |
| } |
| else |
| { |
| double lim; |
| |
| lim = (2.0 * T * b) / (2.0 * T - b); |
| if (tuples_fetched <= lim) |
| { |
| pages_fetched = |
| (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched); |
| } |
| else |
| { |
| pages_fetched = |
| b + (tuples_fetched - lim) * (T - b) / T; |
| } |
| pages_fetched = ceil(pages_fetched); |
| } |
| return pages_fetched; |
| } |
| |
| /* |
| * get_indexpath_pages |
| * Determine the total size of the indexes used in a bitmap index path. |
| * |
| * Note: if the same index is used more than once in a bitmap tree, we will |
| * count it multiple times, which perhaps is the wrong thing ... but it's |
| * not completely clear, and detecting duplicates is difficult, so ignore it |
| * for now. |
| */ |
| static double |
| get_indexpath_pages(Path *bitmapqual) |
| { |
| double result = 0; |
| ListCell *l; |
| |
| if (IsA(bitmapqual, BitmapAndPath)) |
| { |
| BitmapAndPath *apath = (BitmapAndPath *) bitmapqual; |
| |
| foreach(l, apath->bitmapquals) |
| { |
| result += get_indexpath_pages((Path *) lfirst(l)); |
| } |
| } |
| else if (IsA(bitmapqual, BitmapOrPath)) |
| { |
| BitmapOrPath *opath = (BitmapOrPath *) bitmapqual; |
| |
| foreach(l, opath->bitmapquals) |
| { |
| result += get_indexpath_pages((Path *) lfirst(l)); |
| } |
| } |
| else if (IsA(bitmapqual, IndexPath)) |
| { |
| IndexPath *ipath = (IndexPath *) bitmapqual; |
| |
| result = (double) ipath->indexinfo->pages; |
| } |
| else |
| elog(ERROR, "unrecognized node type: %d", nodeTag(bitmapqual)); |
| |
| return result; |
| } |
| |
| /* |
| * cost_bitmap_heap_scan |
| * Determines and returns the cost of scanning a relation using a bitmap |
| * index-then-heap plan. |
| * |
| * 'baserel' is the relation to be scanned |
| * 'bitmapqual' is a tree of IndexPaths, BitmapAndPaths, and BitmapOrPaths |
| * 'outer_rel' is the outer relation when we are considering using the bitmap |
| * scan as the inside of a nestloop join (hence, some of the indexQuals |
| * are join clauses, and we should expect repeated scans of the table); |
| * NULL for a plain bitmap scan |
| * |
| * Note: if this is a join inner path, the component IndexPaths in bitmapqual |
| * should have been costed accordingly. |
| */ |
| void |
| cost_bitmap_heap_scan(Path *path, PlannerInfo *root, RelOptInfo *baserel, |
| Path *bitmapqual, RelOptInfo *outer_rel) |
| { |
| Cost startup_cost = 0; |
| Cost run_cost = 0; |
| Cost indexTotalCost; |
| Selectivity indexSelectivity; |
| Cost cpu_per_tuple; |
| Cost cost_per_page; |
| double tuples_fetched; |
| double pages_fetched; |
| double T; |
| |
| /* Should only be applied to base relations */ |
| Assert(IsA(baserel, RelOptInfo)); |
| Assert(baserel->relid > 0); |
| Assert(baserel->rtekind == RTE_RELATION); |
| |
| /* |
| * Fetch total cost of obtaining the bitmap, as well as its total |
| * selectivity. |
| */ |
| cost_bitmap_tree_node(bitmapqual, &indexTotalCost, &indexSelectivity); |
| |
| startup_cost += indexTotalCost; |
| |
| /* |
| * Estimate number of main-table pages fetched. |
| */ |
| tuples_fetched = clamp_row_est(indexSelectivity * baserel->tuples); |
| |
| T = (baserel->pages > 1) ? (double) baserel->pages : 1.0; |
| |
| if (outer_rel != NULL && outer_rel->rows > 1) |
| { |
| /* |
| * For repeated bitmap scans, scale up the number of tuples fetched in |
| * the Mackert and Lohman formula by the number of scans, so that we |
| * estimate the number of pages fetched by all the scans. Then |
| * pro-rate for one scan. |
| */ |
| double num_scans = outer_rel->rows; |
| |
| pages_fetched = index_pages_fetched(tuples_fetched * num_scans, |
| baserel->pages, |
| get_indexpath_pages(bitmapqual), |
| root); |
| pages_fetched /= num_scans; |
| } |
| else |
| { |
| /* |
| * For a single scan, the number of heap pages that need to be fetched |
| * is the same as the Mackert and Lohman formula for the case T <= b |
| * (ie, no re-reads needed). |
| */ |
| pages_fetched = (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched); |
| } |
| if (pages_fetched >= T) |
| pages_fetched = T; |
| else |
| pages_fetched = ceil(pages_fetched); |
| |
| /* |
| * For small numbers of pages we should charge random_page_cost apiece, |
| * while if nearly all the table's pages are being read, it's more |
| * appropriate to charge seq_page_cost apiece. The effect is nonlinear, |
| * too. For lack of a better idea, interpolate like this to determine the |
| * cost per page. |
| */ |
| if (pages_fetched >= 2.0) |
| cost_per_page = random_page_cost - |
| (random_page_cost - seq_page_cost) * sqrt(pages_fetched / T); |
| else |
| cost_per_page = random_page_cost; |
| |
| run_cost += pages_fetched * cost_per_page; |
| |
| /* |
| * Estimate CPU costs per tuple. |
| * |
| * Often the indexquals don't need to be rechecked at each tuple ... but |
| * not always, especially not if there are enough tuples involved that the |
| * bitmaps become lossy. For the moment, just assume they will be |
| * rechecked always. |
| */ |
| startup_cost += baserel->baserestrictcost.startup; |
| cpu_per_tuple = cpu_tuple_cost + baserel->baserestrictcost.per_tuple; |
| |
| run_cost += cpu_per_tuple * tuples_fetched; |
| |
| path->startup_cost = startup_cost; |
| path->total_cost = startup_cost + run_cost; |
| } |
| |
| /* |
| * cost_bitmap_appendonly_scan |
| * |
| * NOTE: This is a copy of cost_bitmap_heap_scan. |
| */ |
| void |
| cost_bitmap_appendonly_scan(Path *path, PlannerInfo *root, RelOptInfo *baserel, |
| Path *bitmapqual, RelOptInfo *outer_rel) |
| { |
| Cost startup_cost = 0; |
| Cost run_cost = 0; |
| Cost indexTotalCost; |
| Selectivity indexSelectivity; |
| Cost cpu_per_tuple; |
| Cost cost_per_page; |
| double tuples_fetched; |
| double pages_fetched; |
| double T; |
| |
| /* Should only be applied to base relations */ |
| Assert(IsA(baserel, RelOptInfo)); |
| Assert(baserel->relid > 0); |
| Assert(baserel->rtekind == RTE_RELATION); |
| |
| /* |
| * Fetch total cost of obtaining the bitmap, as well as its total |
| * selectivity. |
| */ |
| cost_bitmap_tree_node(bitmapqual, &indexTotalCost, &indexSelectivity); |
| |
| startup_cost += indexTotalCost; |
| |
| /* |
| * Estimate number of main-table pages fetched. |
| */ |
| tuples_fetched = clamp_row_est(indexSelectivity * baserel->tuples); |
| |
| T = (baserel->pages > 1) ? (double) baserel->pages : 1.0; |
| |
| if (outer_rel != NULL && outer_rel->rows > 1) |
| { |
| /* |
| * For repeated bitmap scans, scale up the number of tuples fetched in |
| * the Mackert and Lohman formula by the number of scans, so that we |
| * estimate the number of pages fetched by all the scans. Then |
| * pro-rate for one scan. |
| */ |
| double num_scans = outer_rel->rows; |
| |
| pages_fetched = index_pages_fetched(tuples_fetched * num_scans, |
| baserel->pages, |
| get_indexpath_pages(bitmapqual), |
| root); |
| pages_fetched /= num_scans; |
| } |
| else |
| { |
| /* |
| * For a single scan, the number of heap pages that need to be fetched |
| * is the same as the Mackert and Lohman formula for the case T <= b |
| * (ie, no re-reads needed). |
| */ |
| pages_fetched = (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched); |
| } |
| if (pages_fetched >= T) |
| pages_fetched = T; |
| else |
| pages_fetched = ceil(pages_fetched); |
| |
| /* |
| * For small numbers of pages we should charge random_page_cost apiece, |
| * while if nearly all the table's pages are being read, it's more |
| * appropriate to charge seq_page_cost apiece. The effect is nonlinear, |
| * too. For lack of a better idea, interpolate like this to determine the |
| * cost per page. |
| */ |
| if (pages_fetched >= 2.0) |
| cost_per_page = random_page_cost - |
| (random_page_cost - seq_page_cost) * sqrt(pages_fetched / T); |
| else |
| cost_per_page = random_page_cost; |
| |
| run_cost += pages_fetched * cost_per_page; |
| |
| /* |
| * Estimate CPU costs per tuple. |
| * |
| * Often the indexquals don't need to be rechecked at each tuple ... but |
| * not always, especially not if there are enough tuples involved that the |
| * bitmaps become lossy. For the moment, just assume they will be |
| * rechecked always. |
| */ |
| startup_cost += baserel->baserestrictcost.startup; |
| cpu_per_tuple = cpu_tuple_cost + baserel->baserestrictcost.per_tuple; |
| |
| run_cost += cpu_per_tuple * tuples_fetched; |
| |
| path->startup_cost = startup_cost; |
| path->total_cost = startup_cost + run_cost; |
| } |
| |
| /* |
| * cost_bitmap_tree_node |
| * Extract cost and selectivity from a bitmap tree node (index/and/or) |
| */ |
| void |
| cost_bitmap_tree_node(Path *path, Cost *cost, Selectivity *selec) |
| { |
| if (IsA(path, IndexPath)) |
| { |
| *cost = ((IndexPath *) path)->indextotalcost; |
| *selec = ((IndexPath *) path)->indexselectivity; |
| /* |
| * Charge a small amount per retrieved tuple to reflect the costs of |
| * manipulating the bitmap. This is mostly to make sure that a bitmap |
| * scan doesn't look to be the same cost as an indexscan to retrieve |
| * a single tuple. |
| */ |
| *cost += 0.1 * cpu_operator_cost * ((IndexPath *) path)->rows; |
| } |
| else if (IsA(path, BitmapAndPath)) |
| { |
| *cost = path->total_cost; |
| *selec = ((BitmapAndPath *) path)->bitmapselectivity; |
| } |
| else if (IsA(path, BitmapOrPath)) |
| { |
| *cost = path->total_cost; |
| *selec = ((BitmapOrPath *) path)->bitmapselectivity; |
| } |
| else |
| { |
| elog(ERROR, "unrecognized node type: %d", nodeTag(path)); |
| *cost = *selec = 0; /* keep compiler quiet */ |
| } |
| } |
| |
| /* |
| * cost_bitmap_and_node |
| * Estimate the cost of a BitmapAnd node |
| * |
| * Note that this considers only the costs of index scanning and bitmap |
| * creation, not the eventual heap access. In that sense the object isn't |
| * truly a Path, but it has enough path-like properties (costs in particular) |
| * to warrant treating it as one. |
| */ |
| void |
| cost_bitmap_and_node(BitmapAndPath *path, PlannerInfo *root) |
| { |
| Cost totalCost; |
| Selectivity selec; |
| ListCell *l; |
| |
| /* |
| * We estimate AND selectivity on the assumption that the inputs are |
| * independent. This is probably often wrong, but we don't have the info |
| * to do better. |
| * |
| * The runtime cost of the BitmapAnd itself is estimated at 100x |
	 * cpu_operator_cost for each tbm_intersect needed.  This is probably too
	 * small, and certainly too simplistic.
| */ |
| totalCost = 0.0; |
| selec = 1.0; |
| foreach(l, path->bitmapquals) |
| { |
| Path *subpath = (Path *) lfirst(l); |
| Cost subCost; |
| Selectivity subselec; |
| |
| cost_bitmap_tree_node(subpath, &subCost, &subselec); |
| |
| selec *= subselec; |
| |
| totalCost += subCost; |
| if (l != list_head(path->bitmapquals)) |
| totalCost += 100.0 * cpu_operator_cost; |
| } |
| path->bitmapselectivity = selec; |
| path->path.startup_cost = totalCost; |
| path->path.total_cost = totalCost; |
| } |
| |
| /* |
| * cost_bitmap_or_node |
| * Estimate the cost of a BitmapOr node |
| * |
| * See comments for cost_bitmap_and_node. |
| */ |
| void |
| cost_bitmap_or_node(BitmapOrPath *path, PlannerInfo *root) |
| { |
| Cost totalCost; |
| Selectivity selec; |
| ListCell *l; |
| |
| /* |
| * We estimate OR selectivity on the assumption that the inputs are |
| * non-overlapping, since that's often the case in "x IN (list)" type |
| * situations. Of course, we clamp to 1.0 at the end. |
| * |
| * The runtime cost of the BitmapOr itself is estimated at 100x |
	 * cpu_operator_cost for each tbm_union needed.  This is probably too
	 * small, and certainly too simplistic.  Note that the tbm_unions are
	 * optimized out when the inputs are BitmapIndexScans.
| */ |
| totalCost = 0.0; |
| selec = 0.0; |
| foreach(l, path->bitmapquals) |
| { |
| Path *subpath = (Path *) lfirst(l); |
| Cost subCost; |
| Selectivity subselec; |
| |
| cost_bitmap_tree_node(subpath, &subCost, &subselec); |
| |
| selec += subselec; |
| |
| totalCost += subCost; |
| if (l != list_head(path->bitmapquals) && |
| !IsA(subpath, IndexPath)) |
| totalCost += 100.0 * cpu_operator_cost; |
| } |
| path->bitmapselectivity = Min(selec, 1.0); |
| path->path.startup_cost = totalCost; |
| path->path.total_cost = totalCost; |
| } |
| |
| /* |
| * cost_tidscan |
| * Determines and returns the cost of scanning a relation using TIDs. |
| */ |
| void |
| cost_tidscan(Path *path, PlannerInfo *root, |
| RelOptInfo *baserel, List *tidquals) |
| { |
| Cost startup_cost = 0; |
| Cost run_cost = 0; |
| Cost cpu_per_tuple; |
| int ntuples; |
| ListCell *l; |
| |
| /* Should only be applied to base relations */ |
| Assert(baserel->relid > 0); |
| Assert(baserel->rtekind == RTE_RELATION); |
| |
| /* Count how many tuples we expect to retrieve */ |
| ntuples = 0; |
| foreach(l, tidquals) |
| { |
| if (IsA(lfirst(l), ScalarArrayOpExpr)) |
| { |
| /* Each element of the array yields 1 tuple */ |
| ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) lfirst(l); |
| Node *arraynode = (Node *) lsecond(saop->args); |
| |
| ntuples += estimate_array_length(arraynode); |
| } |
| else |
| { |
| /* It's just CTID = something, count 1 tuple */ |
| ntuples++; |
| } |
| } |
| |
| /* disk costs --- assume each tuple on a different page */ |
| run_cost += random_page_cost * ntuples; |
| |
| /* CPU costs */ |
| startup_cost += baserel->baserestrictcost.startup; |
| cpu_per_tuple = cpu_tuple_cost + baserel->baserestrictcost.per_tuple; |
| run_cost += cpu_per_tuple * ntuples; |
| |
| path->startup_cost = startup_cost; |
| path->total_cost = startup_cost + run_cost; |
| } |
| |
| /* |
| * cost_subqueryscan |
| * Determines and returns the cost of scanning a subquery RTE. |
| */ |
| void |
| cost_subqueryscan(Path *path, RelOptInfo *baserel) |
| { |
| Cost startup_cost; |
| Cost run_cost; |
| Cost cpu_per_tuple; |
| |
| /* Should only be applied to base relations that are subqueries */ |
| Assert(baserel->relid > 0); |
| Assert(baserel->rtekind == RTE_SUBQUERY); |
| |
| /* |
| * Cost of path is cost of evaluating the subplan, plus cost of evaluating |
| * any restriction clauses that will be attached to the SubqueryScan node, |
| * plus cpu_tuple_cost to account for selection and projection overhead. |
| */ |
| path->startup_cost = baserel->subplan->startup_cost; |
| path->total_cost = baserel->subplan->total_cost; |
| |
| startup_cost = baserel->baserestrictcost.startup; |
| cpu_per_tuple = cpu_tuple_cost + baserel->baserestrictcost.per_tuple; |
| run_cost = cpu_per_tuple * baserel->tuples; |
| |
| path->startup_cost += startup_cost; |
| path->total_cost += startup_cost + run_cost; |
| } |
| |
| /* |
| * cost_functionscan |
| * Determines and returns the cost of scanning a function RTE. |
| */ |
| void |
| cost_functionscan(Path *path, PlannerInfo *root, RelOptInfo *baserel) |
| { |
| Cost startup_cost = 0; |
| Cost run_cost = 0; |
| Cost cpu_per_tuple; |
| |
| /* Should only be applied to base relations that are functions */ |
| Assert(baserel->relid > 0); |
| Assert(baserel->rtekind == RTE_FUNCTION); |
| |
| /* |
| * For now, estimate function's cost at one operator eval per function |
| * call. Someday we should revive the function cost estimate columns in |
| * pg_proc... |
| */ |
| cpu_per_tuple = cpu_operator_cost; |
| |
| /* Add scanning CPU costs */ |
| startup_cost += baserel->baserestrictcost.startup; |
| cpu_per_tuple += cpu_tuple_cost + baserel->baserestrictcost.per_tuple; |
| run_cost += cpu_per_tuple * baserel->tuples; |
| |
| path->startup_cost = startup_cost; |
| path->total_cost = startup_cost + run_cost; |
| } |
| |
| /* |
| * cost_tablefunction |
| * Determines and returns the cost of scanning a table function RTE. |
| */ |
| void |
| cost_tablefunction(Path *path, PlannerInfo *root, RelOptInfo *baserel) |
| { |
| Cost startup_cost; |
| Cost run_cost; |
| Cost cpu_per_tuple; |
| |
| /* Should only be applied to base relations that are functions */ |
| Assert(baserel->relid > 0); |
| Assert(baserel->rtekind == RTE_TABLEFUNCTION); |
| |
| /* Initialize cost of the subquery input */ |
| path->startup_cost = baserel->subplan->startup_cost; |
| path->total_cost = baserel->subplan->total_cost; |
| |
| /* |
| * For now, estimate function's cost at one operator eval per function |
| * call. Someday we should revive the function cost estimate columns in |
| * pg_proc... (see cost_functionscan above) |
| */ |
| cpu_per_tuple = cpu_operator_cost; |
| |
| /* Calculate additional cost of the table function node */ |
| cpu_per_tuple += cpu_tuple_cost + baserel->baserestrictcost.per_tuple; |
| startup_cost = baserel->baserestrictcost.startup; |
| run_cost = cpu_per_tuple * baserel->tuples; |
| |
| /* Add in the additional cost */ |
| path->startup_cost += startup_cost; |
| path->total_cost += startup_cost + run_cost; |
| } |
| |
| /* |
| * cost_valuesscan |
| * Determines and returns the cost of scanning a VALUES RTE. |
| */ |
| void |
| cost_valuesscan(Path *path, PlannerInfo *root, RelOptInfo *baserel) |
| { |
| Cost startup_cost = 0; |
| Cost run_cost = 0; |
| Cost cpu_per_tuple; |
| |
| /* Should only be applied to base relations that are values lists */ |
| Assert(baserel->relid > 0); |
| Assert(baserel->rtekind == RTE_VALUES); |
| |
| /* |
| * For now, estimate list evaluation cost at one operator eval per list |
| * (probably pretty bogus, but is it worth being smarter?) |
| */ |
| cpu_per_tuple = cpu_operator_cost; |
| |
| /* Add scanning CPU costs */ |
| startup_cost += baserel->baserestrictcost.startup; |
| cpu_per_tuple += cpu_tuple_cost + baserel->baserestrictcost.per_tuple; |
| run_cost += cpu_per_tuple * baserel->tuples; |
| |
| path->startup_cost = startup_cost; |
| path->total_cost = startup_cost + run_cost; |
| } |
| |
| /* |
| * cost_ctescan |
| * Determines and returns the cost of scanning a CTE RTE. |
| */ |
| void |
| cost_ctescan(Path *path, PlannerInfo *root, RelOptInfo *baserel) |
| { |
| /* Should only be applied to base relations that are CTEs */ |
| Assert(baserel->relid > 0); |
| Assert(baserel->rtekind == RTE_CTE); |
| |
| /* |
| * For now, there should be no extra cost of scanning a CTE RTE, |
| * besides the cost of evaluating the subplan. |
| */ |
| path->startup_cost = baserel->subplan->startup_cost; |
| path->total_cost = baserel->subplan->total_cost; |
| } |
| |
| /* |
| * cost_sort |
| * Determines and returns the cost of sorting a relation, including |
| * the cost of reading the input data. |
| * |
| * If the total volume of data to sort is less than work_mem, we will do |
| * an in-memory sort, which requires no I/O and about t*log2(t) tuple |
| * comparisons for t tuples. |
| * |
| * If the total volume exceeds work_mem, we switch to a tape-style merge |
| * algorithm. There will still be about t*log2(t) tuple comparisons in |
| * total, but we will also need to write and read each tuple once per |
| * merge pass. We expect about ceil(logM(r)) merge passes where r is the |
| * number of initial runs formed and M is the merge order used by tuplesort.c. |
| * Since the average initial run should be about twice work_mem, we have |
 *		disk traffic = 2 * relsize * ceil(logM(relsize / (2*work_mem)))
| * cpu = comparison_cost * t * log2(t) |
| * |
| * The disk traffic is assumed to be 3/4ths sequential and 1/4th random |
| * accesses (XXX can't we refine that guess?) |
| * |
| * We charge two operator evals per tuple comparison, which should be in |
| * the right ballpark in most cases. |
| * |
| * 'pathkeys' is a list of sort keys |
| * 'input_cost' is the total cost for reading the input data |
| * 'tuples' is the number of tuples in the relation |
| * 'width' is the average tuple width in bytes |
| * |
| * NOTE: some callers currently pass NIL for pathkeys because they |
| * can't conveniently supply the sort keys. Since this routine doesn't |
| * currently do anything with pathkeys anyway, that doesn't matter... |
| * but if it ever does, it should react gracefully to lack of key data. |
| * (Actually, the thing we'd most likely be interested in is just the number |
| * of sort keys, which all callers *could* supply.) |
| */ |
| void |
| cost_sort(Path *path, PlannerInfo *root, |
| List *pathkeys, Cost input_cost, double tuples, int width) |
| { |
| Cost startup_cost = input_cost; |
| Cost run_cost = 0; |
| double nbytes = relation_byte_size(tuples, width); |
| long work_mem_bytes = (long) global_work_mem(root); |
| |
| /* |
| * We want to be sure the cost of a sort is never estimated as zero, even |
| * if passed-in tuple count is zero. Besides, mustn't do log(0)... |
| */ |
| if (tuples < 2.0) |
| tuples = 2.0; |
| |
| /* |
| * CPU costs |
| * |
| * Assume about two operator evals per tuple comparison and N log2 N |
| * comparisons |
| */ |
| startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(tuples); |
| |
| /* disk costs */ |
| if (nbytes > work_mem_bytes) |
| { |
| double npages = ceil(nbytes / BLCKSZ); |
| double nruns = (nbytes / work_mem_bytes) * 0.5; |
| double mergeorder = tuplesort_merge_order(work_mem_bytes); |
| double log_runs; |
| double npageaccesses; |
| |
| /* Compute logM(r) as log(r) / log(M) */ |
| if (nruns > mergeorder) |
| log_runs = ceil(log(nruns) / log(mergeorder)); |
| else |
| log_runs = 1.0; |
| npageaccesses = 2.0 * npages * log_runs; |
| /* Assume 3/4ths of accesses are sequential, 1/4th are not */ |
| startup_cost += npageaccesses * |
| (seq_page_cost * 0.75 + random_page_cost * 0.25); |
| } |
| |
| /* |
| * Also charge a small amount (arbitrarily set equal to operator cost) per |
| * extracted tuple. |
| */ |
| run_cost += cpu_operator_cost * tuples; |
| |
| path->startup_cost = startup_cost; |
| path->total_cost = startup_cost + run_cost; |
| } |
| |
| /* |
| * cost_material |
| * Determines and returns the cost of materializing a relation, including |
| * the cost of reading the input data. |
| * |
| * If the total volume of data to materialize exceeds work_mem, we will need |
| * to write it to disk, so the cost is much higher in that case. |
| */ |
| void |
| cost_material(Path *path, PlannerInfo *root, |
| Cost input_cost, double tuples, int width) |
| { |
| Cost startup_cost = input_cost; |
| Cost run_cost = 0; |
| double nbytes = relation_byte_size(tuples, width); |
| |
| /* disk costs */ |
| if (nbytes > global_work_mem(root)) |
| { |
| double npages = ceil(nbytes / BLCKSZ); |
| |
| /* We'll write during startup and read during retrieval */ |
| startup_cost += seq_page_cost * npages; |
| run_cost += seq_page_cost * npages; |
| } |
| |
| /* |
| * Charge a very small amount per inserted tuple, to reflect bookkeeping |
| * costs. We use cpu_tuple_cost/10 for this. This is needed to break the |
| * tie that would otherwise exist between nestloop with A outer, |
| * materialized B inner and nestloop with B outer, materialized A inner. |
| * The extra cost ensures we'll prefer materializing the smaller rel. |
| */ |
| startup_cost += cpu_tuple_cost * 0.1 * tuples; |
| |
| /* |
| * Also charge a small amount per extracted tuple. We use cpu_tuple_cost |
| * so that it doesn't appear worthwhile to materialize a bare seqscan. |
| */ |
| run_cost += cpu_tuple_cost * tuples; |
| |
| path->startup_cost = startup_cost; |
| path->total_cost = startup_cost + run_cost; |
| } |
| |
| /* |
| * cost_agg |
| * Determines and returns the cost of performing an Agg plan node, |
| * including the cost of its input. |
| * |
| * Note: when aggstrategy == AGG_SORTED, caller must ensure that input costs |
| * are for appropriately-sorted input. |
| */ |
| void |
| cost_agg(Path *path, PlannerInfo *root, |
| AggStrategy aggstrategy, int numAggs, |
| int numGroupCols, double numGroups, |
| Cost input_startup_cost, Cost input_total_cost, |
| double input_tuples, double input_width, |
| double hash_batches, double hashentry_width, |
| bool hash_streaming) |
| { |
| Cost startup_cost; |
| Cost total_cost; |
| |
| /* |
| * We charge one cpu_operator_cost per aggregate function per input tuple, |
| * and another one per output tuple (corresponding to transfn and finalfn |
| * calls respectively). If we are grouping, we charge an additional |
| * cpu_operator_cost per grouping column per input tuple for grouping |
| * comparisons. |
| * |
| * We will produce a single output tuple if not grouping, and a tuple per |
| * group otherwise. We charge cpu_tuple_cost for each output tuple. |
| * |
| * Note: in this cost model, AGG_SORTED and AGG_HASHED have exactly the |
| * same total CPU cost, but AGG_SORTED has lower startup cost. If the |
| * input path is already sorted appropriately, AGG_SORTED should be |
| * preferred (since it has no risk of memory overflow). This will happen |
| * as long as the computed total costs are indeed exactly equal --- but if |
| * there's roundoff error we might do the wrong thing. So be sure that |
| * the computations below form the same intermediate values in the same |
| * order. |
| */ |
| if (aggstrategy == AGG_PLAIN) |
| { |
| startup_cost = input_total_cost; |
| startup_cost += cpu_operator_cost * (input_tuples + 1) * numAggs; |
| /* we aren't grouping */ |
| total_cost = startup_cost + cpu_tuple_cost; |
| } |
| else if (aggstrategy == AGG_SORTED) |
| { |
| /* Here we are able to deliver output on-the-fly */ |
| startup_cost = input_startup_cost; |
| total_cost = input_total_cost; |
| /* calcs phrased this way to match HASHED case, see note above */ |
| total_cost += cpu_operator_cost * input_tuples * numGroupCols; |
| total_cost += cpu_operator_cost * input_tuples * numAggs; |
| total_cost += cpu_operator_cost * numGroups * numAggs; |
| total_cost += cpu_tuple_cost * numGroups; |
| } |
| else |
| { |
| double spilled_bytes = 0.0; |
| double spilled_groups = 0.0; |
| |
| /* must be AGG_HASHED */ |
| startup_cost = input_total_cost; |
| startup_cost += cpu_operator_cost * input_tuples * numGroupCols; |
| startup_cost += cpu_operator_cost * input_tuples * numAggs; |
| |
| /* account for some disk I/O if we expect to spill */ |
| if (hash_batches > 0) |
| { |
| /* |
			 * Estimate the number of spilled groups.  We know it lies between
			 * numGroups and input_tuples, but we have no good way to estimate
			 * it precisely.  Currently, we take numGroups plus half of
			 * (input_tuples - numGroups) as the number of spilled groups.
| */ |
| spilled_groups = numGroups + (input_tuples - numGroups) * 0.5; |
| |
| if (!hash_streaming) |
| { |
| double spilled_bytes_for_batch = |
| (spilled_groups * hashentry_width) / hash_batches; |
| double partitions = spilled_bytes_for_batch / (global_work_mem(root)); |
| double tree_depth = 1; |
| if (partitions != 0) |
| tree_depth += ceil(log(partitions) / log(gp_hashagg_default_nbatches)); |
| |
| spilled_bytes = tree_depth * spilled_groups * hashentry_width; |
| |
| /* startup gets charged the write-cost */ |
| startup_cost += seq_page_cost * (spilled_bytes / BLCKSZ); |
| } |
| } |
| |
| if (!hash_streaming) |
| { |
| total_cost = startup_cost; |
| total_cost += cpu_operator_cost * numGroups * numAggs; |
| total_cost += cpu_tuple_cost * numGroups; |
| } |
| else |
| { |
| total_cost = startup_cost; |
| total_cost += cpu_operator_cost * spilled_groups * numAggs; |
| total_cost += cpu_tuple_cost * spilled_groups; |
| } |
| |
| if (hash_batches > 2) |
| { |
| /* total gets charged the read-cost */ |
| total_cost += seq_page_cost * (spilled_bytes / BLCKSZ); |
| } |
| } |
| |
| path->startup_cost = startup_cost; |
| path->total_cost = total_cost; |
| } |
| |
| /* |
| * cost_group |
| * Determines and returns the cost of performing a Group plan node, |
| * including the cost of its input. |
| * |
| * Note: caller must ensure that input costs are for appropriately-sorted |
| * input. |
| */ |
| void |
| cost_group(Path *path, PlannerInfo *root, |
| int numGroupCols, double numGroups, |
| Cost input_startup_cost, Cost input_total_cost, |
| double input_tuples) |
| { |
| Cost startup_cost; |
| Cost total_cost; |
| |
| startup_cost = input_startup_cost; |
| total_cost = input_total_cost; |
| |
| /* |
| * Charge one cpu_operator_cost per comparison per input tuple. We assume |
	 * all columns get compared for most of the tuples.
| */ |
| total_cost += cpu_operator_cost * input_tuples * numGroupCols; |
| |
| path->startup_cost = startup_cost; |
| path->total_cost = total_cost; |
| } |
| |
| /* |
| * cost_window |
| * Determines and returns the cost of performing a Window plan node, |
| * including the cost of its input. |
| * |
| * Note: caller must ensure that input costs are for appropriately-sorted |
| * input. |
| */ |
| void |
| cost_window(Path *path, PlannerInfo *root, |
| int numOrderCols, |
| Cost input_startup_cost, Cost input_total_cost, |
| double input_tuples) |
| { |
| Cost startup_cost; |
| Cost total_cost; |
| |
| startup_cost = input_startup_cost; |
| total_cost = input_total_cost; |
| |
| /* |
| * Charge one cpu_operator_cost per comparison per input tuple. We assume |
	 * all columns get compared for most of the tuples.
| * |
| * XXX Should we also charge for function calls in the targetlist? |
| */ |
| total_cost += cpu_operator_cost * input_tuples * numOrderCols; |
| |
| path->startup_cost = startup_cost; |
| path->total_cost = total_cost; |
| } |
| |
| /* |
| * cost_shareinputscan |
 *	  Compute the cost of a ShareInputScan.  A ShareInputScan reads from a
 *	  shared Material or Sort node.  It may read from disk, but should be
 *	  costed less than a Material node.
| */ |
| void |
| cost_shareinputscan(Path *path, PlannerInfo *root, Cost sharecost, double tuples, int width) |
| { |
| double nbytes = relation_byte_size(tuples, width); |
| double npages = ceil(nbytes/BLCKSZ); |
| |
| path->startup_cost = sharecost; |
| path->total_cost = sharecost; |
| |
| /* I/O cost */ |
| if (nbytes > global_work_mem(root)) |
| { |
| path->total_cost += seq_page_cost * npages; |
| } |
| else |
| { |
| /* Charge a small amount of I/O cost */ |
| path->total_cost += seq_page_cost * npages * 0.2; |
| } |
| |
| /* charge a small CPU cost. */ |
| path->total_cost += cpu_tuple_cost * tuples * 0.1; |
| } |
| |
| /* |
| * If a nestloop's inner path is an indexscan, be sure to use its estimated |
| * output row count, which may be lower than the restriction-clause-only row |
| * count of its parent. (We don't include this case in the PATH_ROWS macro |
| * because it applies *only* to a nestloop's inner relation.) We have to |
| * be prepared to recurse through Append nodes in case of an appendrel. |
| */ |
| static double |
| nestloop_inner_path_rows(PlannerInfo *root, Path *path) |
| { |
| double result; |
| |
| if (IsA(path, AppendPath)) |
| { |
| ListCell *l; |
| |
| result = 0; |
| foreach(l, ((AppendPath *) path)->subpaths) |
| { |
| result += nestloop_inner_path_rows(root, (Path *) lfirst(l)); |
| } |
| } |
| else |
| result = PATH_ROWS(root, path); |
| |
| return result; |
| } |
| |
| /* |
| * cost_nestloop |
| * Determines and returns the cost of joining two relations using the |
| * nested loop algorithm. |
| * |
| * 'path' is already filled in except for the cost fields |
| */ |
| void |
| cost_nestloop(NestPath *path, PlannerInfo *root) |
| { |
| Path *outer_path = path->jpath.outerjoinpath; |
| Path *inner_path = path->jpath.innerjoinpath; |
| Cost startup_cost = 0; |
| Cost run_cost = 0; |
| Cost cpu_per_tuple; |
| QualCost restrict_qual_cost; |
| double outer_path_rows = PATH_ROWS(root, outer_path); |
| double inner_path_rows = nestloop_inner_path_rows(root, inner_path); |
| double ntuples; |
| Selectivity joininfactor; |
| |
| /* |
| * If we're doing JOIN_IN then we will stop scanning inner tuples for an |
| * outer tuple as soon as we have one match. Account for the effects of |
| * this by scaling down the cost estimates in proportion to the JOIN_IN |
| * selectivity. (This assumes that all the quals attached to the join are |
| * IN quals, which should be true.) |
| */ |
| joininfactor = join_in_selectivity(&path->jpath, root); |
| |
| /* cost of source data */ |
| |
| /* |
| * NOTE: clearly, we must pay both outer and inner paths' startup_cost |
| * before we can start returning tuples, so the join's startup cost is |
| * their sum. What's not so clear is whether the inner path's |
| * startup_cost must be paid again on each rescan of the inner path. This |
| * is not true if the inner path is materialized or is a hashjoin, but |
| * probably is true otherwise. |
| */ |
| startup_cost += outer_path->startup_cost + inner_path->startup_cost; |
| run_cost += outer_path->total_cost - outer_path->startup_cost; |
| if (IsA(inner_path, MaterialPath) || |
| IsA(inner_path, HashPath)) |
| { |
| /* charge only run cost for each iteration of inner path */ |
| } |
| else |
| { |
| /* |
| * charge startup cost for each iteration of inner path, except we |
| * already charged the first startup_cost in our own startup |
| */ |
| run_cost += (outer_path_rows - 1) * inner_path->startup_cost; |
| } |
| run_cost += outer_path_rows * |
| (inner_path->total_cost - inner_path->startup_cost) * joininfactor; |
| |
| /* |
| * Compute number of tuples processed (not number emitted!) |
| */ |
| ntuples = outer_path_rows * inner_path_rows * joininfactor; |
| |
| /* CPU costs */ |
| cost_qual_eval(&restrict_qual_cost, path->jpath.joinrestrictinfo, root); |
| startup_cost += restrict_qual_cost.startup; |
| cpu_per_tuple = cpu_tuple_cost + restrict_qual_cost.per_tuple; |
| run_cost += cpu_per_tuple * ntuples; |
| |
| path->jpath.path.startup_cost = startup_cost; |
| path->jpath.path.total_cost = startup_cost + run_cost; |
| } |
| |
| /* |
| * cost_mergejoin |
| * Determines and returns the cost of joining two relations using the |
| * merge join algorithm. |
| * |
| * 'path' is already filled in except for the cost fields |
| * |
| * Notes: path's mergeclauses should be a subset of the joinrestrictinfo list; |
| * outersortkeys and innersortkeys are lists of the keys to be used |
| * to sort the outer and inner relations, or NIL if no explicit |
| * sort is needed because the source path is already ordered. |
| */ |
| void |
| cost_mergejoin(MergePath *path, PlannerInfo *root) |
| { |
| Path *outer_path = path->jpath.outerjoinpath; |
| Path *inner_path = path->jpath.innerjoinpath; |
| List *mergeclauses = path->path_mergeclauses; |
| List *outersortkeys = path->outersortkeys; |
| List *innersortkeys = path->innersortkeys; |
| Cost startup_cost = 0; |
| Cost run_cost = 0; |
| Cost cpu_per_tuple; |
| Selectivity merge_selec; |
| QualCost merge_qual_cost; |
| QualCost qp_qual_cost; |
| RestrictInfo *firstclause; |
| double outer_path_rows = PATH_ROWS(root, outer_path); |
| double inner_path_rows = PATH_ROWS(root, inner_path); |
| double outer_rows, |
| inner_rows; |
| double mergejointuples, |
| rescannedtuples; |
| double rescanratio; |
| Selectivity outerscansel, |
| innerscansel; |
| Selectivity joininfactor; |
| Path sort_path; /* dummy for result of cost_sort */ |
| |
| /* |
| * Compute cost and selectivity of the mergequals and qpquals (other |
| * restriction clauses) separately. We use approx_selectivity here for |
| * speed --- in most cases, any errors won't affect the result much. |
| * |
| * Note: it's probably bogus to use the normal selectivity calculation |
| * here when either the outer or inner path is a UniquePath. |
| */ |
| merge_selec = approx_selectivity(root, mergeclauses, |
| path->jpath.jointype); |
| cost_qual_eval(&merge_qual_cost, mergeclauses, root); |
| cost_qual_eval(&qp_qual_cost, path->jpath.joinrestrictinfo, root); |
| qp_qual_cost.startup -= merge_qual_cost.startup; |
| qp_qual_cost.per_tuple -= merge_qual_cost.per_tuple; |
| |
| /* approx # tuples passing the merge quals */ |
| mergejointuples = clamp_row_est(merge_selec * outer_path_rows * inner_path_rows); |
| |
| /* |
| * When there are equal merge keys in the outer relation, the mergejoin |
| * must rescan any matching tuples in the inner relation. This means |
| * re-fetching inner tuples. Our cost model for this is that a re-fetch |
| * costs the same as an original fetch, which is probably an overestimate; |
| * but on the other hand we ignore the bookkeeping costs of mark/restore. |
| * Not clear if it's worth developing a more refined model. |
| * |
| * The number of re-fetches can be estimated approximately as size of |
| * merge join output minus size of inner relation. Assume that the |
| * distinct key values are 1, 2, ..., and denote the number of values of |
| * each key in the outer relation as m1, m2, ...; in the inner relation, |
| * n1, n2, ... Then we have |
| * |
| * size of join = m1 * n1 + m2 * n2 + ... |
| * |
	 * number of rescanned tuples = (m1 - 1) * n1 + (m2 - 1) * n2 + ...
	 *	= m1 * n1 + m2 * n2 + ... - (n1 + n2 + ...)
	 *	= size of join - size of inner relation
| * |
| * This equation works correctly for outer tuples having no inner match |
| * (nk = 0), but not for inner tuples having no outer match (mk = 0); we |
| * are effectively subtracting those from the number of rescanned tuples, |
| * when we should not. Can we do better without expensive selectivity |
| * computations? |
| */ |
| if (IsA(outer_path, UniquePath)) |
| rescannedtuples = 0; |
| else |
| { |
| rescannedtuples = mergejointuples - inner_path_rows; |
| /* Must clamp because of possible underestimate */ |
| if (rescannedtuples < 0) |
| rescannedtuples = 0; |
| } |
| /* We'll inflate inner run cost this much to account for rescanning */ |
| rescanratio = 1.0 + (rescannedtuples / inner_path_rows); |
| |
| /* |
| * A merge join will stop as soon as it exhausts either input stream |
| * (unless it's an outer join, in which case the outer side has to be |
| * scanned all the way anyway). Estimate fraction of the left and right |
| * inputs that will actually need to be scanned. We use only the first |
| * (most significant) merge clause for this purpose. |
| * |
| * Since this calculation is somewhat expensive, and will be the same for |
| * all mergejoin paths associated with the merge clause, we cache the |
| * results in the RestrictInfo node. |
| */ |
| if (mergeclauses && path->jpath.jointype != JOIN_FULL) |
| { |
| firstclause = (RestrictInfo *) linitial(mergeclauses); |
| if (firstclause->left_mergescansel < 0) /* not computed yet? */ |
| mergejoinscansel(root, (Node *) firstclause->clause, |
| &firstclause->left_mergescansel, |
| &firstclause->right_mergescansel); |
| |
| if (bms_is_subset(firstclause->left_relids, outer_path->parent->relids)) |
| { |
| /* left side of clause is outer */ |
| outerscansel = firstclause->left_mergescansel; |
| innerscansel = firstclause->right_mergescansel; |
| } |
| else |
| { |
| /* left side of clause is inner */ |
| outerscansel = firstclause->right_mergescansel; |
| innerscansel = firstclause->left_mergescansel; |
| } |
| if (path->jpath.jointype == JOIN_LEFT || |
| path->jpath.jointype == JOIN_LASJ || |
| path->jpath.jointype == JOIN_LASJ_NOTIN) |
| outerscansel = 1.0; |
| else if (path->jpath.jointype == JOIN_RIGHT) |
| innerscansel = 1.0; |
| } |
| else |
| { |
| /* cope with clauseless or full mergejoin */ |
| outerscansel = innerscansel = 1.0; |
| } |
| |
| /* convert selectivity to row count; must scan at least one row */ |
| outer_rows = clamp_row_est(outer_path_rows * outerscansel); |
| inner_rows = clamp_row_est(inner_path_rows * innerscansel); |
| |
| /* |
| * Readjust scan selectivities to account for above rounding. This is |
| * normally an insignificant effect, but when there are only a few rows in |
| * the inputs, failing to do this makes for a large percentage error. |
| */ |
| outerscansel = outer_rows / outer_path_rows; |
| innerscansel = inner_rows / inner_path_rows; |
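	/*
	 * Illustrative example (not in the original source): with
	 * outer_path_rows = 3 and outerscansel = 0.1, outer_rows clamps to 1,
	 * so outerscansel is readjusted to 1/3, a large correction that the
	 * rounding would otherwise hide.
	 */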
| |
| /* cost of source data */ |
| |
| if (outersortkeys) /* do we need to sort outer? */ |
| { |
| cost_sort(&sort_path, |
| root, |
| outersortkeys, |
| outer_path->total_cost, |
| outer_path_rows, |
| outer_path->parent->width); |
| startup_cost += sort_path.startup_cost; |
| run_cost += (sort_path.total_cost - sort_path.startup_cost) |
| * outerscansel; |
| } |
| else |
| { |
| startup_cost += outer_path->startup_cost; |
| run_cost += (outer_path->total_cost - outer_path->startup_cost) |
| * outerscansel; |
| } |
| |
| if (innersortkeys) /* do we need to sort inner? */ |
| { |
| cost_sort(&sort_path, |
| root, |
| innersortkeys, |
| inner_path->total_cost, |
| inner_path_rows, |
| inner_path->parent->width); |
| startup_cost += sort_path.startup_cost; |
| run_cost += (sort_path.total_cost - sort_path.startup_cost) |
| * innerscansel * rescanratio; |
| } |
| else |
| { |
| startup_cost += inner_path->startup_cost; |
| run_cost += (inner_path->total_cost - inner_path->startup_cost) |
| * innerscansel * rescanratio; |
| } |
| |
| /* CPU costs */ |
| |
| /* |
| * If we're doing JOIN_IN then we will stop outputting inner tuples for an |
| * outer tuple as soon as we have one match. Account for the effects of |
| * this by scaling down the cost estimates in proportion to the expected |
| * output size. (This assumes that all the quals attached to the join are |
| * IN quals, which should be true.) |
| */ |
| joininfactor = join_in_selectivity(&path->jpath, root); |
| |
| /* |
| * The number of tuple comparisons needed is approximately number of outer |
| * rows plus number of inner rows plus number of rescanned tuples (can we |
| * refine this?). At each one, we need to evaluate the mergejoin quals. |
| * NOTE: JOIN_IN mode does not save any work here, so do NOT include |
| * joininfactor. |
| */ |
| startup_cost += merge_qual_cost.startup; |
| run_cost += merge_qual_cost.per_tuple * |
| (outer_rows + inner_rows * rescanratio); |
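	/*
	 * Illustrative example (not in the original source): with
	 * outer_rows = 100, inner_rows = 200 and rescanratio = 1.25, we charge
	 * merge-qual evaluation for 100 + 200 * 1.25 = 350 tuple comparisons.
	 */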
| |
| /* |
| * For each tuple that gets through the mergejoin proper, we charge |
| * cpu_tuple_cost plus the cost of evaluating additional restriction |
| * clauses that are to be applied at the join. (This is pessimistic since |
	 * not all of the quals may get evaluated at each tuple.)  This work is
	 * skipped in JOIN_IN mode, so apply the factor.
| */ |
| startup_cost += qp_qual_cost.startup; |
| cpu_per_tuple = cpu_tuple_cost + qp_qual_cost.per_tuple; |
| run_cost += cpu_per_tuple * mergejointuples * joininfactor; |
| |
| path->jpath.path.startup_cost = startup_cost; |
| path->jpath.path.total_cost = startup_cost + run_cost; |
| } |
| |
| /* |
| * cost_hashjoin |
| * Determines and returns the cost of joining two relations using the |
| * hash join algorithm. |
| * |
| * 'path' is already filled in except for the cost fields |
| * |
| * Note: path's hashclauses should be a subset of the joinrestrictinfo list |
| */ |
| void |
| cost_hashjoin(HashPath *path, PlannerInfo *root) |
| { |
| Path *outer_path = path->jpath.outerjoinpath; |
| Path *inner_path = path->jpath.innerjoinpath; |
| List *hashclauses = path->path_hashclauses; |
| Cost startup_cost = 0; |
| Cost run_cost = 0; |
| Selectivity hash_selec; |
| QualCost hash_qual_cost; |
| QualCost qp_qual_cost; |
| double hashjointuples; |
| double outer_path_rows = PATH_ROWS(root, outer_path); |
| double inner_path_rows = PATH_ROWS(root, inner_path); |
| int num_hashclauses = list_length(hashclauses); |
| int numbuckets; |
| int numbatches; |
| double virtualbuckets; |
| Selectivity innerbucketsize; |
| Selectivity joininfactor; |
| ListCell *hcl; |
| |
| /* |
| * Compute cost and selectivity of the hashquals and qpquals (other |
| * restriction clauses) separately. We use approx_selectivity here for |
| * speed --- in most cases, any errors won't affect the result much. |
| * |
| * Note: it's probably bogus to use the normal selectivity calculation |
| * here when either the outer or inner path is a UniquePath. |
| */ |
| hash_selec = approx_selectivity(root, hashclauses, |
| path->jpath.jointype); |
| cost_qual_eval(&hash_qual_cost, hashclauses, root); |
| cost_qual_eval(&qp_qual_cost, path->jpath.joinrestrictinfo, root); |
| qp_qual_cost.startup -= hash_qual_cost.startup; |
| qp_qual_cost.per_tuple -= hash_qual_cost.per_tuple; |
| |
| /* approx # tuples passing the hash quals */ |
| hashjointuples = clamp_row_est(hash_selec * outer_path_rows * inner_path_rows); |
| |
| /* cost of source data */ |
| startup_cost += outer_path->startup_cost; |
| run_cost += outer_path->total_cost - outer_path->startup_cost; |
| startup_cost += inner_path->total_cost; |
| |
| /* |
| * Cost of computing hash function: must do it once per input tuple. We |
| * charge one cpu_operator_cost for each column's hash function. Also, |
| * tack on one cpu_tuple_cost per inner row, to model the costs of |
| * inserting the row into the hashtable. |
| * |
| * XXX when a hashclause is more complex than a single operator, we really |
| * should charge the extra eval costs of the left or right side, as |
| * appropriate, here. This seems more work than it's worth at the moment. |
| */ |
| startup_cost += (cpu_operator_cost * num_hashclauses + cpu_tuple_cost) |
| * inner_path_rows; |
| run_cost += cpu_operator_cost * num_hashclauses * outer_path_rows; |
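	/*
	 * Illustrative arithmetic (assuming the stock defaults
	 * cpu_operator_cost = 0.0025 and cpu_tuple_cost = 0.01): two hash
	 * clauses and 1000 inner rows add (0.0025 * 2 + 0.01) * 1000 = 15
	 * units of startup cost.
	 */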
| |
| /* Get hash table size that executor would use for inner relation */ |
| hash_table_size(inner_path_rows, |
| inner_path->parent->width, |
| global_work_mem(root), |
| &numbuckets, |
| &numbatches); |
	virtualbuckets = (double) numbuckets * (double) numbatches;
| |
| /* |
| * Determine bucketsize fraction for inner relation. We use the smallest |
| * bucketsize estimated for any individual hashclause; this is undoubtedly |
| * conservative. |
| * |
| * BUT: if inner relation has been unique-ified, we can assume it's good |
| * for hashing. This is important both because it's the right answer, and |
| * because we avoid contaminating the cache with a value that's wrong for |
| * non-unique-ified paths. |
| */ |
| if (IsA(inner_path, UniquePath)) |
| innerbucketsize = 1.0 / virtualbuckets; |
| else |
| { |
| innerbucketsize = 1.0; |
| foreach(hcl, hashclauses) |
| { |
| RestrictInfo *restrictinfo = (RestrictInfo *) lfirst(hcl); |
| Expr *clause = restrictinfo->clause; |
| Selectivity thisbucketsize; |
| |
| Assert(IsA(restrictinfo, RestrictInfo)); |
| |
| /** |
| * If this is a IS NOT FALSE boolean test, we can peek underneath. |
| */ |
| if (IsA(clause, BooleanTest)) |
| { |
| BooleanTest *bt = (BooleanTest *) clause; |
| |
| if (bt->booltesttype == IS_NOT_FALSE) |
| { |
| clause = bt->arg; |
| } |
| } |
| |
| /* |
| * First we have to figure out which side of the hashjoin clause |
| * is the inner side. |
| * |
| * Since we tend to visit the same clauses over and over when |
| * planning a large query, we cache the bucketsize estimate in the |
| * RestrictInfo node to avoid repeated lookups of statistics. |
| */ |
| if (bms_is_subset(restrictinfo->right_relids, |
| inner_path->parent->relids)) |
| { |
| /* righthand side is inner */ |
| thisbucketsize = restrictinfo->right_bucketsize; |
| if (thisbucketsize < 0) |
| { |
| /* not cached yet */ |
| thisbucketsize = |
| estimate_hash_bucketsize(root, |
| get_rightop(clause), |
| virtualbuckets); |
| restrictinfo->right_bucketsize = thisbucketsize; |
| } |
| } |
| else |
| { |
| Assert(bms_is_subset(restrictinfo->left_relids, |
| inner_path->parent->relids)); |
| /* lefthand side is inner */ |
| thisbucketsize = restrictinfo->left_bucketsize; |
| if (thisbucketsize < 0) |
| { |
| /* not cached yet */ |
| thisbucketsize = |
| estimate_hash_bucketsize(root, |
| get_leftop(clause), |
| virtualbuckets); |
| restrictinfo->left_bucketsize = thisbucketsize; |
| } |
| } |
| |
| if (innerbucketsize > thisbucketsize) |
| innerbucketsize = thisbucketsize; |
| } |
| } |
| |
| /* |
| * If inner relation is too big then we will need to "batch" the join, |
| * which implies writing and reading most of the tuples to disk an extra |
| * time. Charge seq_page_cost per page, since the I/O should be nice and |
	 * sequential.  Writing the inner rel counts as startup cost; the rest
	 * (reading the inner rel back, plus writing and re-reading the outer
	 * rel, hence the 2 * outerpages below) counts as run cost.
| */ |
| if (numbatches > 1) |
| { |
| double outerpages = page_size(outer_path_rows, |
| outer_path->parent->width); |
| double innerpages = page_size(inner_path_rows, |
| inner_path->parent->width); |
| |
| startup_cost += seq_page_cost * innerpages; |
| run_cost += seq_page_cost * (innerpages + 2 * outerpages); |
| } |
| |
| /* CPU costs */ |
| |
| /* |
| * If we're doing JOIN_IN then we will stop comparing inner tuples to an |
| * outer tuple as soon as we have one match. Account for the effects of |
| * this by scaling down the cost estimates in proportion to the expected |
| * output size. (This assumes that all the quals attached to the join are |
| * IN quals, which should be true.) |
| */ |
| joininfactor = join_in_selectivity(&path->jpath, root); |
| |
| /* |
| * The number of tuple comparisons needed is the number of outer tuples |
| * times the typical number of tuples in a hash bucket, which is the inner |
| * relation size times its bucketsize fraction. At each one, we need to |
| * evaluate the hashjoin quals. But actually, charging the full qual eval |
| * cost at each tuple is pessimistic, since we don't evaluate the quals |
| * unless the hash values match exactly. For lack of a better idea, halve |
| * the cost estimate to allow for that. |
| * |
| * CDB: Assume there are no rows that pass the hash value comparison but |
| * fail the full qual eval. Thus the full comparison is charged for just |
| * 'hashjointuples', i.e. those rows that pass the hashjoin quals. |
| */ |
| startup_cost += hash_qual_cost.startup; |
	/*
	 * Upstream formula, retained for reference:
	 *	run_cost += hash_qual_cost.per_tuple * outer_path_rows *
	 *		clamp_row_est(inner_path_rows * innerbucketsize) *
	 *		joininfactor * 0.5;
	 */
| run_cost += hash_qual_cost.per_tuple * hashjointuples; |
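	/*
	 * Illustrative contrast (not in the original source): with 1000 outer
	 * rows, 1000 inner rows and innerbucketsize = 0.001, the upstream
	 * formula above would charge about 1000 * 1 * 0.5 = 500 qual
	 * evaluations, while the CDB variant charges one per row passing the
	 * hash quals.
	 */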
| |
| if (gp_cost_hashjoin_chainwalk) |
| { |
| /* CDB: Add a small charge for walking the hash chains. */ |
| run_cost += 0.05 * cpu_operator_cost * |
| outer_path_rows * inner_path_rows * innerbucketsize * joininfactor; |
| } |
| |
| /* |
| * For each tuple that gets through the hashjoin proper, we charge |
| * cpu_tuple_cost plus the cost of evaluating additional restriction |
| * clauses that are to be applied at the join. (This is pessimistic since |
| * not all of the quals may get evaluated at each tuple.) |
| * |
| * CDB: Charge the cpu_tuple_cost only for tuples that pass all the quals. |
| */ |
| startup_cost += qp_qual_cost.startup; |
| run_cost += qp_qual_cost.per_tuple * hashjointuples; |
| run_cost += cpu_tuple_cost * path->jpath.path.parent->rows; |
| |
| path->jpath.path.startup_cost = startup_cost; |
| path->jpath.path.total_cost = startup_cost + run_cost; |
| } |
| |
| |
| /* |
| * cost_qual_eval |
| * Estimate the CPU costs of evaluating a WHERE clause. |
| * The input can be either an implicitly-ANDed list of boolean |
| * expressions, or a list of RestrictInfo nodes. |
| * The result includes both a one-time (startup) component, |
| * and a per-evaluation component. |
| */ |
| void |
| cost_qual_eval(QualCost *cost, List *quals, PlannerInfo *root) |
| { |
| cost_qual_eval_context context; |
| ListCell *l; |
| |
| context.root = root; |
| context.total.startup = 0; |
| context.total.per_tuple = 0; |
| |
| /* We don't charge any cost for the implicit ANDing at top level ... */ |
| |
| foreach(l, quals) |
| { |
| Node *qual = (Node *) lfirst(l); |
| |
| cost_qual_eval_walker(qual, &context); |
| } |
| |
| *cost = context.total; |
| } |
| |
| static bool |
| cost_qual_eval_walker(Node *node, cost_qual_eval_context *context) |
| { |
| if (node == NULL) |
| return false; |
| |
| /* |
| * RestrictInfo nodes contain an eval_cost field reserved for this |
| * routine's use, so that it's not necessary to evaluate the qual clause's |
| * cost more than once. If the clause's cost hasn't been computed yet, |
| * the field's startup value will contain -1. |
| */ |
| if (IsA(node, RestrictInfo)) |
| { |
| RestrictInfo *rinfo = (RestrictInfo *) node; |
| |
| if (rinfo->eval_cost.startup < 0) |
| { |
| cost_qual_eval_context locContext; |
| |
| locContext.root = context->root; |
| locContext.total.startup = 0; |
| locContext.total.per_tuple = 0; |
| |
| /* |
| * For an OR clause, recurse into the marked-up tree so that we |
| * set the eval_cost for contained RestrictInfos too. |
| */ |
| if (rinfo->orclause) |
| cost_qual_eval_walker((Node *) rinfo->orclause, &locContext); |
| else |
| cost_qual_eval_walker((Node *) rinfo->clause, &locContext); |
| |
| /* |
| * If the RestrictInfo is marked pseudoconstant, it will be tested |
| * only once, so treat its cost as all startup cost. |
| */ |
| if (rinfo->pseudoconstant) |
| { |
| /* count one execution during startup */ |
| locContext.total.startup += locContext.total.per_tuple; |
| locContext.total.per_tuple = 0; |
| } |
| rinfo->eval_cost = locContext.total; |
| } |
| context->total.startup += rinfo->eval_cost.startup; |
| context->total.per_tuple += rinfo->eval_cost.per_tuple; |
| /* do NOT recurse into children */ |
| return false; |
| } |
| |
| |
| /* |
| * Our basic strategy is to charge one cpu_operator_cost for each operator |
| * or function node in the given tree. Vars and Consts are charged zero, |
| * and so are boolean operators (AND, OR, NOT). Simplistic, but a lot |
| * better than no model at all. |
| * |
| * Should we try to account for the possibility of short-circuit |
| * evaluation of AND/OR? |
| */ |
| if (IsA(node, FuncExpr) || |
| IsA(node, OpExpr) || |
| IsA(node, DistinctExpr) || |
| IsA(node, NullIfExpr)) |
| context->total.per_tuple += cpu_operator_cost; |
| else if (IsA(node, ScalarArrayOpExpr)) |
| { |
| /* |
| * Estimate that the operator will be applied to about half of the |
| * array elements before the answer is determined. |
| */ |
| ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) node; |
| Node *arraynode = (Node *) lsecond(saop->args); |
| |
| context->total.per_tuple += |
| cpu_operator_cost * estimate_array_length(arraynode) * 0.5; |
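
		/*
		 * E.g. (illustrative): "x = ANY(arr)" over a 10-element array is
		 * charged 10 * 0.5 = 5 operator evaluations per tuple.
		 */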
| } |
| else if (IsA(node, RowCompareExpr)) |
| { |
| /* Conservatively assume we will check all the columns */ |
| RowCompareExpr *rcexpr = (RowCompareExpr *) node; |
| |
| context->total.per_tuple += cpu_operator_cost * list_length(rcexpr->opnos); |
| } |
| else if (IsA(node, CurrentOfExpr)) |
| { |
| /* This is noticeably more expensive than a typical operator */ |
| context->total.per_tuple += 100 * cpu_operator_cost; |
| } |
| else if (IsA(node, SubLink)) |
| { |
| /* This routine should not be applied to un-planned expressions */ |
| elog(ERROR, "cannot handle unplanned sub-select"); |
| } |
| else if (IsA(node, SubPlan)) |
| { |
| if (!context->root) |
| { |
| /* Cannot cost subplans without root. */ |
			return false;
| } |
| |
| /* |
| * A subplan node in an expression typically indicates that the |
| * subplan will be executed on each evaluation, so charge accordingly. |
| * (Sub-selects that can be executed as InitPlans have already been |
| * removed from the expression.) |
| * |
| * An exception occurs when we have decided we can implement the |
| * subplan by hashing. |
| */ |
| SubPlan *subplan = (SubPlan *) node; |
| Plan *plan = planner_subplan_get_plan(context->root, subplan); |
| |
| if (subplan->useHashTable) |
| { |
| /* |
| * If we are using a hash table for the subquery outputs, then the |
| * cost of evaluating the query is a one-time cost. We charge one |
| * cpu_operator_cost per tuple for the work of loading the |
| * hashtable, too. |
| */ |
| context->total.startup += plan->total_cost + |
| cpu_operator_cost * plan->plan_rows; |
| |
| /* |
| * The per-tuple costs include the cost of evaluating the lefthand |
| * expressions, plus the cost of probing the hashtable. Recursion |
| * into the testexpr will handle the lefthand expressions |
| * properly, and will count one cpu_operator_cost for each |
| * comparison operator. That is probably too low for the probing |
| * cost, but it's hard to make a better estimate, so live with it |
| * for now. |
| */ |
| } |
| else |
| { |
| /* |
| * Otherwise we will be rescanning the subplan output on each |
| * evaluation. We need to estimate how much of the output we will |
| * actually need to scan. NOTE: this logic should agree with the |
| * estimates used by make_subplan() in plan/subselect.c. |
| */ |
| Cost plan_run_cost = plan->total_cost - plan->startup_cost; |
| |
| if (subplan->subLinkType == EXISTS_SUBLINK) |
| { |
| /* we only need to fetch 1 tuple */ |
| context->total.per_tuple += plan_run_cost / plan->plan_rows; |
| } |
| else if (subplan->subLinkType == ALL_SUBLINK || |
| subplan->subLinkType == ANY_SUBLINK) |
| { |
| /* assume we need 50% of the tuples */ |
| context->total.per_tuple += 0.50 * plan_run_cost; |
| /* also charge a cpu_operator_cost per row examined */ |
| context->total.per_tuple += 0.50 * plan->plan_rows * cpu_operator_cost; |
| } |
| else |
| { |
| /* assume we need all tuples */ |
| context->total.per_tuple += plan_run_cost; |
| } |
| |
| /* |
| * Also account for subplan's startup cost. If the subplan is |
| * uncorrelated or undirect correlated, AND its topmost node is a |
| * Sort or Material node, assume that we'll only need to pay its |
| * startup cost once; otherwise assume we pay the startup cost |
| * every time. |
| */ |
| if (subplan->parParam == NIL && |
| (IsA(plan, Sort) || |
| IsA(plan, Material))) |
| context->total.startup += plan->startup_cost; |
| else |
| context->total.per_tuple += plan->startup_cost; |
| } |
| } |
| |
| return expression_tree_walker(node, cost_qual_eval_walker, |
| (void *) context); |
| } |
| |
| |
| /* |
| * approx_selectivity |
| * Quick-and-dirty estimation of clause selectivities. |
| * The input can be either an implicitly-ANDed list of boolean |
| * expressions, or a list of RestrictInfo nodes (typically the latter). |
| * |
| * This is quick-and-dirty because we bypass clauselist_selectivity, and |
| * simply multiply the independent clause selectivities together. Now |
| * clauselist_selectivity often can't do any better than that anyhow, but |
| * for some situations (such as range constraints) it is smarter. However, |
| * we can't effectively cache the results of clauselist_selectivity, whereas |
| * the individual clause selectivities can be and are cached. |
| * |
| * Since we are only using the results to estimate how many potential |
| * output tuples are generated and passed through qpqual checking, it |
| * seems OK to live with the approximation. |
| */ |
| static Selectivity |
| approx_selectivity(PlannerInfo *root, List *quals, JoinType jointype) |
| { |
| Selectivity total = 1.0; |
| ListCell *l; |
| |
| foreach(l, quals) |
| { |
| Node *qual = (Node *) lfirst(l); |
| |
| /* Note that clause_selectivity will be able to cache its result */ |
| total *= clause_selectivity(root, qual, 0, jointype, |
| false /* use_damping */); |
| } |
| return total; |
| } |
| |
| |
| /* |
| * set_baserel_size_estimates |
| * Set the size estimates for the given base relation. |
| * |
| * The rel's targetlist and restrictinfo list must have been constructed |
| * already. |
| * |
| * We set the following fields of the rel node: |
| * rows: the estimated number of output tuples (after applying |
| * restriction clauses). |
| * width: the estimated average output tuple width in bytes. |
| * baserestrictcost: estimated cost of evaluating baserestrictinfo clauses. |
| */ |
| void |
| set_baserel_size_estimates(PlannerInfo *root, RelOptInfo *rel) |
| { |
| double nrows; |
| |
| /* Should only be applied to base relations */ |
| Assert(rel->relid > 0); |
| |
| nrows = rel->tuples * |
| clauselist_selectivity(root, |
| rel->baserestrictinfo, |
| 0, |
| JOIN_INNER, |
| gp_selectivity_damping_for_scans); |
| |
| rel->rows = clamp_row_est(nrows); |
| |
| cost_qual_eval(&rel->baserestrictcost, rel->baserestrictinfo, root); |
| |
| set_rel_width(root, rel); |
| } |
| |
| |
| |
| /* |
| * adjust_selectivity_for_nulltest |
| * adjust selectivity of a nulltest on the inner side of an |
| * outer join |
| * |
 * This is a patch to make the workaround for (NOT) IN subqueries of the form
 *
 *	... FROM T1 LEFT OUTER JOIN T2 ON ... WHERE T2.X IS (NOT) NULL
 *
 * work.  It is not a comprehensive fix, but addresses only this very
 * special case.
| * |
| */ |
| static Selectivity |
| adjust_selectivity_for_nulltest(Selectivity selec, |
| Selectivity pselec, |
| List *pushed_quals, |
| JoinType jointype, |
| RelOptInfo *left, |
| RelOptInfo *right) |
| { |
| Assert(IS_OUTER_JOIN(jointype)); |
| |
| /* |
	 * Consider only singletons: the case of multiple nulltests on the inner
	 * side of an outer join is not very useful in practice.
| */ |
| if (JOIN_FULL != jointype && |
| 1 == list_length(pushed_quals)) |
| { |
| Node *clause = (Node *) lfirst(list_head(pushed_quals)); |
| |
| if (IsA(clause, RestrictInfo)) |
| { |
			clause = (Node *) ((RestrictInfo *) clause)->clause;
| |
| if (IsA(clause, NullTest)) |
| { |
| int nulltesttype = 0; |
| Node *node = NULL; |
| Node *basenode = NULL; |
| |
| /* extract information */ |
| nulltesttype = ((NullTest *) clause)->nulltesttype; |
| node = (Node *) ((NullTest *) clause)->arg; |
| |
| /* CONSIDER: is this really necessary? */ |
| if (IsA(node, RelabelType)) |
| basenode = (Node *) ((RelabelType *) node)->arg; |
| else |
| basenode = node; |
| |
| if (IsA(basenode, Var)) |
| { |
| #ifdef USE_ASSERT_CHECKING |
| Var *var = (Var *) basenode; |
| #endif /* USE_ASSERT_CHECKING */ |
| double nullfrac = 1 - selec; |
| |
| /* |
| * a pushed qual must be applied on the inner side only; type implies |
| * where to find the var in the inputs |
| */ |
| Assert(!(JOIN_RIGHT == jointype) || bms_is_member(var->varno, left->relids)); |
| Assert(!(JOIN_LEFT == jointype) || bms_is_member(var->varno, right->relids)); |
| |
| /* adjust selectivity according to test */ |
					switch (nulltesttype)
					{
						case IS_NULL:
							pselec = nullfrac + ((1 - nullfrac) * pselec);
							break;

						case IS_NOT_NULL:
							pselec = (1 - nullfrac) + (nullfrac * pselec);
							break;

						default:
							/* unknown null test */
							Assert(false);
					}
| } |
| } |
| } |
| } |
| |
| Assert(pselec >= 0.0 && pselec <= 1.0); |
| return pselec; |
| } |
| |
| |
| /* |
| * set_joinrel_size_estimates |
| * Set the size estimates for the given join relation. |
| * |
| * The rel's targetlist must have been constructed already, and a |
| * restriction clause list that matches the given component rels must |
| * be provided. |
| * |
| * Since there is more than one way to make a joinrel for more than two |
| * base relations, the results we get here could depend on which component |
| * rel pair is provided. In theory we should get the same answers no matter |
| * which pair is provided; in practice, since the selectivity estimation |
| * routines don't handle all cases equally well, we might not. But there's |
| * not much to be done about it. (Would it make sense to repeat the |
| * calculations for each pair of input rels that's encountered, and somehow |
| * average the results? Probably way more trouble than it's worth.) |
| * |
| * It's important that the results for symmetric JoinTypes be symmetric, |
| * eg, (rel1, rel2, JOIN_LEFT) should produce the same result as (rel2, |
| * rel1, JOIN_RIGHT). |
| * |
| * We set only the rows field here. The width field was already set by |
| * build_joinrel_tlist, and baserestrictcost is not used for join rels. |
| */ |
| void |
| set_joinrel_size_estimates(PlannerInfo *root, RelOptInfo *rel, |
| RelOptInfo *outer_rel, |
| RelOptInfo *inner_rel, |
| JoinType jointype, |
| List *restrictlist) |
| { |
| Selectivity jselec; |
| Selectivity pselec; |
| double nrows; |
| double adjnrows; |
| |
| /* |
| * Compute joinclause selectivity. Note that we are only considering |
| * clauses that become restriction clauses at this join level; we are not |
| * double-counting them because they were not considered in estimating the |
| * sizes of the component rels. |
| * |
| * For an outer join, we have to distinguish the selectivity of the |
| * join's own clauses (JOIN/ON conditions) from any clauses that were |
| * "pushed down". For inner joins we just count them all as joinclauses. |
| */ |
| if (IS_OUTER_JOIN(jointype)) |
| { |
| List *joinquals = NIL; |
| List *pushedquals = NIL; |
| ListCell *l; |
| |
| /* Grovel through the clauses to separate into two lists */ |
| foreach(l, restrictlist) |
| { |
| RestrictInfo *rinfo = (RestrictInfo *) lfirst(l); |
| |
| Assert(IsA(rinfo, RestrictInfo)); |
| if (rinfo->is_pushed_down) |
| pushedquals = lappend(pushedquals, rinfo); |
| else |
| joinquals = lappend(joinquals, rinfo); |
| } |
| |
| /* Get the separate selectivities */ |
| jselec = clauselist_selectivity(root, |
| joinquals, |
| 0, |
| jointype, |
| gp_selectivity_damping_for_joins); |
| pselec = clauselist_selectivity(root, |
| pushedquals, |
| 0, |
| jointype, |
| gp_selectivity_damping_for_joins); |
| |
| /* |
		 * special case where a pushed qual tests whether the inner side
		 * of an outer join is NULL
| */ |
| if (gp_adjust_selectivity_for_outerjoins) |
| pselec = adjust_selectivity_for_nulltest(jselec, |
| pselec, |
| pushedquals, |
| jointype, |
| outer_rel, |
| inner_rel); |
| |
| /* Avoid leaking a lot of ListCells */ |
| list_free(joinquals); |
| list_free(pushedquals); |
| } |
| else |
| { |
| jselec = clauselist_selectivity(root, |
| restrictlist, |
| 0, |
| jointype, |
| gp_selectivity_damping_for_joins); |
| pselec = 0.0; /* not used, keep compiler quiet */ |
| } |
| |
| /* |
| * Basically, we multiply size of Cartesian product by selectivity. |
| * |
| * If we are doing an outer join, take that into account: the joinqual |
| * selectivity has to be clamped using the knowledge that the output must |
| * be at least as large as the non-nullable input. However, any |
| * pushed-down quals are applied after the outer join, so their |
| * selectivity applies fully. |
| */ |
| switch (jointype) |
| { |
| case JOIN_INNER: |
| nrows = outer_rel->rows * inner_rel->rows * jselec; |
| break; |
| case JOIN_LEFT: |
| nrows = outer_rel->rows * inner_rel->rows * jselec; |
| if (nrows < outer_rel->rows) |
| nrows = outer_rel->rows; |
| nrows *= pselec; |
| break; |
| case JOIN_RIGHT: |
| nrows = outer_rel->rows * inner_rel->rows * jselec; |
| if (nrows < inner_rel->rows) |
| nrows = inner_rel->rows; |
| nrows *= pselec; |
| break; |
| case JOIN_FULL: |
| nrows = outer_rel->rows * inner_rel->rows * jselec; |
| if (nrows < outer_rel->rows) |
| nrows = outer_rel->rows; |
| if (nrows < inner_rel->rows) |
| nrows = inner_rel->rows; |
| nrows *= pselec; |
| break; |
| case JOIN_LASJ: |
| case JOIN_LASJ_NOTIN: |
| nrows = outer_rel->rows * jselec; |
| Assert (0.0 == pselec); |
| break; |
| default: |
| elog(ERROR, "unrecognized join type: %d", (int) jointype); |
| nrows = 0; /* keep compiler quiet */ |
| break; |
| } |
| |
| /* |
	 * CDB: Force the estimated number of join output rows up to a small
	 * floor: LOG2 of the larger input's row count, and never less than
	 * LOG2(10) (e.g. inputs of 1000 and 100 rows give a floor of about 10).
	 * Otherwise a later nested join could take this join as its outer
	 * input, thinking that there will be only one pass over its inner
	 * table, which could be very slow if the actual number of rows is > 1.
	 * Someday we should improve the join selectivity estimates.
| */ |
| adjnrows = Max(10, outer_rel->rows); |
| adjnrows = Max(adjnrows, inner_rel->rows); |
| adjnrows = LOG2(adjnrows); |
| if (nrows < adjnrows) |
| nrows = adjnrows; |
| |
| rel->rows = nrows; |
| |
| } |
| |
| /* |
| * join_in_selectivity |
| * Determines the factor by which a JOIN_IN join's result is expected |
| * to be smaller than an ordinary inner join. |
| * |
| * 'path' is already filled in except for the cost fields |
| */ |
| static Selectivity |
| join_in_selectivity(JoinPath *path, PlannerInfo *root) |
| { |
| RelOptInfo *innerrel; |
| Selectivity selec; |
| double nrows; |
| |
| /* Return 1.0 whenever it's not JOIN_IN */ |
| if (path->jointype != JOIN_IN) |
| return 1.0; |
| |
| /* |
| * Return 1.0 if the inner side is already known unique. The case where |
| * the inner path is already a UniquePath probably cannot happen in |
| * current usage, but check it anyway for completeness. The interesting |
| * case is where we've determined the inner relation itself is unique, |
| * which we can check by looking at the rows estimate for its UniquePath. |
| */ |
| if (IsA(path->innerjoinpath, UniquePath)) |
| return 1.0; |
| innerrel = path->innerjoinpath->parent; |
| if (innerrel->onerow) |
| return 1.0; |
| |
| /* |
| * Compute same result set_joinrel_size_estimates would compute for |
| * JOIN_INNER. Note that we use the input rels' absolute size estimates, |
| * not PATH_ROWS() which might be less; if we used PATH_ROWS() we'd be |
| * double-counting the effects of any join clauses used in input scans. |
| */ |
| selec = clauselist_selectivity(root, |
| path->joinrestrictinfo, |
| 0, |
| JOIN_INNER, |
| gp_selectivity_damping_for_joins); |
| nrows = path->outerjoinpath->parent->rows * innerrel->rows * selec; |
| |
| nrows = clamp_row_est(nrows); |
| |
| /* See if it's larger than the actual JOIN_IN size estimate */ |
| if (nrows > path->path.parent->rows) |
| return path->path.parent->rows / nrows; |
| else |
| return 1.0; |
| } |
| |
| /* |
| * set_function_size_estimates |
| * Set the size estimates for a base relation that is a function call. |
| * |
| * The rel's targetlist and restrictinfo list must have been constructed |
| * already. |
| * |
| * We set the same fields as set_baserel_size_estimates. |
| */ |
| void |
| set_function_size_estimates(PlannerInfo *root, RelOptInfo *rel) |
| { |
| /* |
| * Estimate number of rows the function itself will return. |
| * |
| * XXX no idea how to do this yet; but we can at least check whether |
| * function returns set or not... |
| */ |
| if (rel->onerow) |
| rel->tuples = 1; |
| else |
| rel->tuples = 1000; |
| |
| /* Now estimate number of output rows, etc */ |
| set_baserel_size_estimates(root, rel); |
| } |
| |
| /* |
| * set_table_function_size_estimates |
| * Set the size estimates for a base relation that is a table function call. |
| * |
| * The rel's targetlist and restrictinfo list must have been constructed |
| * already. |
| * |
| * We set the same fields as set_baserel_size_estimates. |
| */ |
| void |
| set_table_function_size_estimates(PlannerInfo *root, RelOptInfo *rel) |
| { |
| /* |
| * Estimate number of rows the function itself will return. |
| * |
| * If the function can return more than a single row then simply do |
| * a best guess that it returns the same number of rows as the subscan. |
| * |
	 * This will obviously be way wrong in many cases; to improve it we would
	 * need a stats callback function for table functions.
| */ |
| if (rel->onerow) |
| rel->tuples = 1; |
| else |
| rel->tuples = rel->subplan->plan_rows; |
| |
| /* Now estimate number of output rows, etc */ |
| set_baserel_size_estimates(root, rel); |
| } |
| |
| |
| /* |
| * set_values_size_estimates |
| * Set the size estimates for a base relation that is a values list. |
| * |
| * The rel's targetlist and restrictinfo list must have been constructed |
| * already. |
| * |
| * We set the same fields as set_baserel_size_estimates. |
| */ |
| void |
| set_values_size_estimates(PlannerInfo *root, RelOptInfo *rel) |
| { |
| RangeTblEntry *rte; |
| |
| /* Should only be applied to base relations that are values lists */ |
| Assert(rel->relid > 0); |
| rte = rt_fetch(rel->relid, root->parse->rtable); |
| Assert(rte->rtekind == RTE_VALUES); |
| |
| /* |
| * Estimate number of rows the values list will return. We know this |
| * precisely based on the list length (well, barring set-returning |
| * functions in list items, but that's a refinement not catered for |
| * anywhere else either). |
| */ |
| rel->tuples = list_length(rte->values_lists); |
| |
| /* Now estimate number of output rows, etc */ |
| set_baserel_size_estimates(root, rel); |
| } |
| |
| /* |
| * set_cte_size_estimates |
| * Set the size estimates for a base relation that is a CTE reference. |
| * |
| * The rel's targetlist and restrictinfo list must have been constructed |
| * already, and we need the completed plan for the CTE (if a regular CTE) |
| * or the non-recursive term (if a self-reference). |
| * |
| * We set the same fields as set_baserel_size_estimates. |
| */ |
| void |
| set_cte_size_estimates(PlannerInfo *root, RelOptInfo *rel, Plan *cteplan) |
| { |
| Assert(rel->relid > 0); |
| |
| #ifdef USE_ASSERT_CHECKING |
| /* Should only be applied to base relations that are CTE references */ |
| RangeTblEntry *rte = planner_rt_fetch(rel->relid, root); |
| Assert(rte->rtekind == RTE_CTE); |
| #endif |
| |
| /* Set the number of tuples to the CTE plan's output estimate */ |
| rel->tuples = cteplan->plan_rows; |
| |
| /* Now estimate number of output rows, etc */ |
| set_baserel_size_estimates(root, rel); |
| } |
| |
| /* |
| * set_rel_width |
| * Set the estimated output width of a base relation. |
| * |
| * NB: this works best on plain relations because it prefers to look at |
| * real Vars. It will fail to make use of pg_statistic info when applied |
| * to a subquery relation, even if the subquery outputs are simple vars |
| * that we could have gotten info for. Is it worth trying to be smarter |
| * about subqueries? |
| * |
| * The per-attribute width estimates are cached for possible re-use while |
| * building join relations. |
| */ |
| void |
| set_rel_width(PlannerInfo *root, RelOptInfo *rel) |
| { |
| int32 tuple_width = 0; |
| ListCell *tllist; |
| |
| foreach(tllist, rel->reltargetlist) |
| { |
| Var *var = (Var *) lfirst(tllist); |
| int ndx; |
| Oid relid; |
| int32 item_width; |
| |
| /* For now, punt on whole-row child Vars */ |
| if (!IsA(var, Var)) |
| { |
| tuple_width += 32; /* arbitrary */ |
| continue; |
| } |
| |
| /* Virtual column? */ |
| if (var->varattno <= FirstLowInvalidHeapAttributeNumber) |
| { |
| CdbRelColumnInfo *rci = cdb_find_pseudo_column(root, var); |
| |
| tuple_width += rci->attr_width; |
| continue; |
| } |
| |
| ndx = var->varattno - rel->min_attr; |
| |
| /* |
| * The width probably hasn't been cached yet, but may as well check |
| */ |
| if (rel->attr_widths[ndx] > 0) |
| { |
| tuple_width += rel->attr_widths[ndx]; |
| continue; |
| } |
| |
| relid = getrelid(var->varno, root->parse->rtable); |
| if (relid != InvalidOid) |
| { |
| item_width = get_attavgwidth(relid, var->varattno); |
| if (item_width > 0) |
| { |
| rel->attr_widths[ndx] = item_width; |
| tuple_width += item_width; |
| continue; |
| } |
| } |
| |
| /* |
| * Not a plain relation, or can't find statistics for it. Estimate |
| * using just the type info. |
| */ |
| item_width = get_typavgwidth(var->vartype, var->vartypmod); |
| Assert(item_width > 0); |
| rel->attr_widths[ndx] = item_width; |
| tuple_width += item_width; |
| } |
| Assert(tuple_width >= 0); |
| rel->width = tuple_width; |
| } |
| |
| /* |
| * relation_byte_size |
| * Estimate the storage space in bytes for a given number of tuples |
| * of a given width (size in bytes). |
| */ |
| static double |
| relation_byte_size(double tuples, int width) |
| { |
| return tuples * (MAXALIGN(width) + MAXALIGN(sizeof(HeapTupleHeaderData))); |
| } |
| |
| /* |
| * page_size |
| * Returns an estimate of the number of pages covered by a given |
| * number of tuples of a given width (size in bytes). |
| */ |
| static double |
| page_size(double tuples, int width) |
| { |
| return ceil(relation_byte_size(tuples, width) / BLCKSZ); |
| } |
| |
| /** |
| * Determine the number of segments the planner should use. The result of this |
| * calculation is ordinarily saved in root->cdbpath_segments. Functions that |
| * need it in contexts in which root is not defined may call this function to |
| * derive it. |
| */ |
| int planner_segment_count(void) |
| { |
| if ( Gp_role != GP_ROLE_DISPATCH ) |
| return 1; |
| else if ( gp_segments_for_planner > 0 ) |
| return gp_segments_for_planner; |
| else |
| return GetPlannerSegmentNum(); |
| } |
| |
| /** |
| * Determines the total amount of memory available. This method is to be used |
 * during planning only. When planning in dispatch mode, it calculates total
 * memory as the sum of work_mem across segments. In utility mode, it returns
 * work_mem.
| * Output: |
| * total memory in bytes. |
| */ |
| double global_work_mem(PlannerInfo *root) |
| { |
| int segment_count; |
| if (root) |
| { |
| Assert(root->config->cdbpath_segments > 0); |
| segment_count = root->config->cdbpath_segments; |
| } |
| else |
| segment_count = planner_segment_count(); |
| |
| return (double) planner_work_mem * 1024L * segment_count; |
| } |
| |
/* CDB -- The incremental cost functions below are for use outside the
 *		usual optimizer (in the aggregation planner, etc.)  They are
 *		modeled on the corresponding cost functions, but address the
 *		specific needs of the planner.
| */ |
| |
| /* incremental_hashjoin_cost |
| * |
 * Globals: seq_page_cost, cpu_operator_cost, cpu_tuple_cost.
| */ |
| Cost incremental_hashjoin_cost(double rows, int inner_width, int outer_width, List *hashclauses, PlannerInfo *root) |
| { |
| Cost startup_cost; |
| Cost run_cost; |
| QualCost hash_qual_cost; |
| int numbuckets; |
| int numbatches; |
| double virtualbuckets; |
| Selectivity innerbucketsize; |
| int num_hashclauses = list_length(hashclauses); |
| |
| /* Each inner row joins to a single outer row and vice versa, |
| * no selectivity issues. */ |
| startup_cost = 0; |
| run_cost = 0; |
| |
| /* Cost of computing hash function: must do it once per input tuple. We |
| * charge one cpu_operator_cost for each column's hash function. Also, |
| * tack on one cpu_tuple_cost per inner row, to model the costs of |
| * inserting the row into the hashtable. */ |
| startup_cost += (cpu_operator_cost * num_hashclauses + cpu_tuple_cost) * rows; |
| run_cost += cpu_operator_cost * num_hashclauses * rows; |
| |
| /* Get hash table size that executor would use for inner relation */ |
| hash_table_size(rows, inner_width, global_work_mem(root), &numbuckets, &numbatches); |
	virtualbuckets = (double) numbuckets * (double) numbatches;
| |
| /* |
| * Determine bucketsize fraction for inner relation. Both inner and |
| * outer relations are unique in the join key. |
| */ |
| innerbucketsize = 1.0 / virtualbuckets; |
| |
| /* |
| * If inner relation is too big then we will need to "batch" the join, |
| * which implies writing and reading most of the tuples to disk an extra |
| * time. Charge seq_page_cost per page, since the I/O should be nice and |
| * sequential. Writing the inner rel counts as startup cost, |
| * all the rest as run cost. |
| */ |
| if (numbatches > 1) |
| { |
| double outerpages = page_size(rows, outer_width); |
| double innerpages = page_size(rows, inner_width); |
| |
| startup_cost += seq_page_cost * innerpages; |
| run_cost += seq_page_cost * (innerpages + 2 * outerpages); |
| } |
| |
| /* |
	 * The number of tuple comparisons needed is the number of outer tuples
	 * times the typical number of tuples in a hash bucket; since each inner
	 * row here joins to exactly one outer row, that is simply 'rows'.  At
	 * each comparison we need to evaluate the hashjoin quals.  But charging
	 * the full qual eval cost at each tuple is pessimistic, since we don't
	 * evaluate the quals unless the hash values match exactly; for lack of
	 * a better idea, halve the cost estimate to allow for that.
	 *
	 * (cost_hashjoin above instead uses the CDB assumption that no rows
	 * pass the hash value comparison yet fail the full qual eval; this
	 * function keeps the upstream halving.)
| */ |
| cost_qual_eval(&hash_qual_cost, hashclauses, root); |
| startup_cost += hash_qual_cost.startup; |
| run_cost += hash_qual_cost.per_tuple * rows * 0.5; |
| |
| if (gp_cost_hashjoin_chainwalk) |
| { |
| /* CDB: Add a small charge for walking the hash chains. */ |
| run_cost += 0.05 * cpu_operator_cost * 2 * rows * innerbucketsize; |
| } |
| |
| return startup_cost + run_cost; |
| } |
| |
| |
| |
| /* incremental_mergejoin_cost |
| * |
| * Globals: cpu_tuple_cost |
| */ |
| Cost incremental_mergejoin_cost(double rows, List *mergeclauses, PlannerInfo *root) |
| { |
| QualCost merge_qual_cost; |
| Cost startup_cost = 0; |
| Cost per_tuple_cost = 0; |
| Cost run_cost = 0; |
| |
| cost_qual_eval(&merge_qual_cost, mergeclauses, root); |
| |
| startup_cost += merge_qual_cost.startup; |
| per_tuple_cost = merge_qual_cost.per_tuple; |
| |
| /* CPU costs */ |
| |
| /* |
	 * The number of tuple comparisons needed is the number of outer rows
	 * plus the number of inner rows, i.e. 2 * rows here.  (The quals'
	 * one-time startup cost was already charged above; don't double-charge
	 * it.)
	 */
| run_cost += merge_qual_cost.per_tuple * 2 * rows; |
| run_cost += (cpu_tuple_cost + per_tuple_cost) * rows; |
| |
| return startup_cost + run_cost; |
| } |
| |
| /** |
| * This method determines the number of batches and buckets |
| * required to build a hash table over a set of tuples. |
| * NOTE: this is a clone of ExecChooseHashTableSize() in nodeHash.c. Please |
| * keep the two functions in sync. |
| * |
| * Input: |
| * ntuples - number of tuples |
| * tupwidth - width of tuple |
| * memory_bytes - total memory available, in bytes |
| * Output: |
| * numbuckets - number of buckets in hash table |
| * numbatches - number of batches needed |
| */ |
| static void hash_table_size(double ntuples, int tupwidth, double memory_bytes, |
| int *numbuckets, |
| int *numbatches) |
| { |
| int tupsize; |
| double inner_rel_bytes; |
| int nbatch; |
| int nbuckets; |
| |
| /* Force a plausible relation size if no info */ |
| if (ntuples <= 0.0) |
| ntuples = 1000.0; |
| |
| /* |
| * Estimate tupsize based on footprint of tuple in hashtable... note this |
| * does not allow for any palloc overhead. The manipulations of spaceUsed |
| * don't count palloc overhead either. |
| */ |
| tupsize = HJTUPLE_OVERHEAD + MAXALIGN(sizeof(MemTupleData)) + |
| MAXALIGN(tupwidth); |
| inner_rel_bytes = ntuples * tupsize; |
| |
| /* |
| * Set nbuckets to achieve an average bucket load of gp_hashjoin_tuples_per_bucket when |
| * memory is filled. Set nbatch to the smallest power of 2 that appears |
| * sufficient. |
| */ |
| if (inner_rel_bytes > memory_bytes) |
| { |
| /* We'll need multiple batches */ |
| long lbuckets; |
| double dbatch; |
| int minbatch; |
| |
| lbuckets = (memory_bytes / tupsize) / gp_hashjoin_tuples_per_bucket; |
| lbuckets = Min(lbuckets, INT_MAX / 32); |
| nbuckets = (int) lbuckets; |
| |
| dbatch = ceil(inner_rel_bytes / memory_bytes); |
| dbatch = Min(dbatch, INT_MAX / 32); |
| minbatch = (int) dbatch; |
| nbatch = 2; |
| while (nbatch < minbatch) |
| nbatch <<= 1; |
| } |
| else |
| { |
		/* We expect the hashtable to fit in memory; we want to use
		 * more buckets if we have memory to spare */
| double dbuckets_lower; |
| double dbuckets_upper; |
| double dbuckets; |
| |
		/* divide our tuple row-count estimate by the number of
| * tuples we'd like in a bucket: this produces a small bucket |
| * count independent of our work_mem setting */ |
| dbuckets_lower = (double)ntuples / (double)gp_hashjoin_tuples_per_bucket; |
| |
| /* if we have work_mem to spare, we'd like to use it -- so |
| * divide up our memory evenly (see the spill case above) */ |
| dbuckets_upper = (double)memory_bytes / ((double)tupsize * gp_hashjoin_tuples_per_bucket); |
| |
		/* we'll use our "lower", work_mem-independent guess as a lower
| * limit; but if we've got memory to spare we'll take the mean |
| * of the lower-limit and the upper-limit */ |
| if (dbuckets_upper > dbuckets_lower) |
| dbuckets = (dbuckets_lower + dbuckets_upper)/2.0; |
| else |
| dbuckets = dbuckets_lower; |
| |
| dbuckets = ceil(dbuckets); |
| dbuckets = Min(dbuckets, INT_MAX); |
| |
| nbuckets = (int)dbuckets; |
| |
| nbatch = 1; |
| } |
| |
| *numbuckets = nbuckets; |
| *numbatches = nbatch; |
| } |
| |
| |