| /*------------------------------------------------------------------------- |
| * |
| * analyze.c |
| * the Postgres statistics generator |
| * |
| * |
| * There are a few things in Cloudberry that make this more complicated |
| * than in upstream: |
| * |
| * Dispatching |
| * ----------- |
| * |
| * Cloudberry is an MPP system, so we need to collect the statistics from |
| * all the segments. The segment servers don't keep statistics (unless you |
| * connect to a segment in utility node and run ANALYZE directly), and |
| * the orchestration of ANALYZE happens in the dispatcher. The high |
| * level logic is the same as in upstream, but a few functions have been |
| * modified to gather data from the segments, instead of reading directly |
| * from local disk: |
| * |
| * acquire_sample_rows(), when called in the dispatcher, calls into the |
| * segments to acquire the sample across all segments. |
| * RelationGetNumberOfBlocks() calls have been replaced with a wrapper |
| * function, AcquireNumberOfBlocks(), which likewise calls into the |
| * segments, to get total relation size across all segments. |
| * |
| * AcquireNumberOfBlocks() calls pg_relation_size(), which already |
| * contains the logic to gather the size from all segments. |
| * |
| * Acquiring the sample rows is more tricky. When called in dispatcher, |
| * acquire_sample_rows() calls a helper function called gp_acquire_sample_rows() |
| * in the segments, to collect a sample on each segment. It then merges |
| * the sample rows from each segment to produce a sample of the whole |
| * cluster. gp_acquire_sample_rows() in turn calls acquire_sample_rows(), to |
| * collect the sample on the segment. |
| * |
| * One complication with collecting the sample is the way that very |
| * large datums are handled. We don't want to transfer multi-gigabyte |
| * tuples from each segment. That would slow things down, and risk |
| * running out of memory, if the sample contains a lot of them. They |
| * are not very useful for statistics, anyway; hardly anyone builds an |
| * index or does lookups where the histogram or MCV is meaningful for |
| * very large keys. PostgreSQL also ignores any datums larger than |
| * WIDTH_THRESHOLD (1kB) in the statistics computation, and we use the |
| * same limit to restrict what gets transferred from the segments. |
| * We substitute the very large datums with NULLs in the sample, but |
| * keep track separately, which datums came out as NULLs because they |
| * were too large, as opposed to "real" NULLs. |
| * |
| * |
| * Merging leaf statistics with hyperloglog |
| * ---------------------------------------- |
| * |
| * TODO: explain how this works. |
| * |
| * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * |
| * IDENTIFICATION |
| * src/backend/commands/analyze.c |
| * |
| *------------------------------------------------------------------------- |
| */ |
| #include "postgres.h" |
| |
| #include <math.h> |
| |
| #include "access/detoast.h" |
| #include "access/genam.h" |
| #include "access/heapam.h" |
| #include "access/multixact.h" |
| #include "access/relation.h" |
| #include "access/sysattr.h" |
| #include "access/table.h" |
| #include "access/tableam.h" |
| #include "access/transam.h" |
| #include "access/tupconvert.h" |
| #include "access/visibilitymap.h" |
| #include "access/xact.h" |
| #include "catalog/catalog.h" |
| #include "catalog/index.h" |
| #include "catalog/indexing.h" |
| #include "catalog/pg_collation.h" |
| #include "catalog/pg_inherits.h" |
| #include "catalog/pg_namespace.h" |
| #include "catalog/pg_statistic_ext.h" |
| #include "commands/dbcommands.h" |
| #include "commands/progress.h" |
| #include "commands/tablecmds.h" |
| #include "commands/vacuum.h" |
| #include "executor/executor.h" |
| #include "foreign/fdwapi.h" |
| #include "miscadmin.h" |
| #include "nodes/nodeFuncs.h" |
| #include "parser/parse_oper.h" |
| #include "parser/parse_relation.h" |
| #include "pgstat.h" |
| #include "postmaster/autovacuum.h" |
| #include "statistics/extended_stats_internal.h" |
| #include "statistics/statistics.h" |
| #include "storage/bufmgr.h" |
| #include "storage/lmgr.h" |
| #include "storage/proc.h" |
| #include "storage/procarray.h" |
| #include "utils/acl.h" |
| #include "utils/attoptcache.h" |
| #include "utils/builtins.h" |
| #include "utils/datum.h" |
| #include "utils/fmgroids.h" |
| #include "utils/guc.h" |
| #include "utils/lsyscache.h" |
| #include "utils/memutils.h" |
| #include "utils/pg_rusage.h" |
| #include "utils/rel.h" |
| #include "utils/sampling.h" |
| #include "utils/sortsupport.h" |
| #include "utils/spccache.h" |
| #include "utils/syscache.h" |
| #include "utils/timestamp.h" |
| |
| #include "access/appendonlywriter.h" |
| #include "catalog/heap.h" |
| #include "catalog/pg_am.h" |
| #include "cdb/cdbappendonlyam.h" |
| #include "cdb/cdbaocsam.h" |
| #include "cdb/cdbdisp_query.h" |
| #include "cdb/cdbdispatchresult.h" |
| #include "cdb/cdbtm.h" |
| #include "cdb/cdbutil.h" |
| #include "cdb/cdbvars.h" |
| #include "commands/analyzeutils.h" |
| #include "executor/spi.h" |
| #include "funcapi.h" |
| #include "libpq-fe.h" |
| #include "utils/builtins.h" |
| #include "utils/faultinjector.h" |
| #include "utils/hyperloglog/gp_hyperloglog.h" |
| #include "utils/snapmgr.h" |
| #include "utils/typcache.h" |
| #include "executor/tstoreReceiver.h" |
| #include "tcop/utility.h" |
| |
| /* |
| * For Hyperloglog, we define an error margin of 0.3%. If the number of |
| * distinct values estimated by hyperloglog is within an error of 0.3%, |
| * we consider everything as distinct. |
| */ |
| #define GP_HLL_ERROR_MARGIN 0.003 |
| |
| /* Per-index data for ANALYZE */ |
| typedef struct AnlIndexData |
| { |
| IndexInfo *indexInfo; /* BuildIndexInfo result */ |
| double tupleFract; /* fraction of rows for partial index */ |
| VacAttrStats **vacattrstats; /* index attrs to analyze */ |
| int attr_cnt; |
| } AnlIndexData; |
| |
| |
| /* Default statistics target (GUC parameter) */ |
| int default_statistics_target = 100; |
| |
| /* A few variables that don't seem worth passing around as parameters */ |
| static MemoryContext anl_context = NULL; |
| static BufferAccessStrategy vac_strategy; |
| |
| Bitmapset **acquire_func_colLargeRowIndexes; |
| double *acquire_func_colLargeRowLength; |
| double *acquire_func_colNDVBySeg; |
| |
| static void do_analyze_rel(Relation onerel, |
| VacuumParams *params, List *va_cols, |
| AcquireSampleRowsFunc acquirefunc, BlockNumber relpages, |
| bool inh, bool in_outer_xact, int elevel, |
| gp_acquire_sample_rows_context *ctx); |
| static void compute_index_stats(Relation onerel, double totalrows, |
| AnlIndexData *indexdata, int nindexes, |
| HeapTuple *rows, int numrows, |
| MemoryContext col_context); |
| static VacAttrStats *examine_attribute(Relation onerel, int attnum, |
| Node *index_expr, int elevel); |
| static int acquire_sample_rows(Relation onerel, int elevel, |
| HeapTuple *rows, int targrows, |
| double *totalrows, double *totaldeadrows); |
| static int acquire_sample_rows_dispatcher(Relation onerel, bool inh, int elevel, |
| HeapTuple *rows, int targrows, |
| double *totalrows, double *totaldeadrows); |
| static int gp_acquire_sample_rows_func(Relation onerel, int elevel, |
| HeapTuple *rows, int targrows, |
| double *totalrows, double *totaldeadrows); |
| static BlockNumber acquire_index_number_of_blocks(Relation indexrel, Relation tablerel); |
| |
| static void gp_acquire_correlations_dispatcher(Oid relOid, bool inh, float4 *correlations, bool *correlationsIsNull); |
| static int compare_rows(const void *a, const void *b); |
| static int acquire_inherited_sample_rows(Relation onerel, int elevel, |
| HeapTuple *rows, int targrows, |
| double *totalrows, double *totaldeadrows); |
| static void update_attstats(Oid relid, bool inh, |
| int natts, VacAttrStats **vacattrstats); |
| static Datum std_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull); |
| static Datum ind_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull); |
| |
| static void analyze_rel_internal(Oid relid, RangeVar *relation, |
| VacuumParams *params, List *va_cols, |
| bool in_outer_xact, BufferAccessStrategy bstrategy, |
| gp_acquire_sample_rows_context *ctx); |
| static void acquire_hll_by_query(Relation onerel, int nattrs, VacAttrStats **attrstats, int elevel); |
| |
| static int16 AcquireCountOfSegmentFile(Relation onerel); |
| |
| /* |
| * analyze_rel() -- analyze one relation |
| * |
| * relid identifies the relation to analyze. If relation is supplied, use |
| * the name therein for reporting any failure to open/lock the rel; do not |
| * use it once we've successfully opened the rel, since it might be stale. |
| */ |
| void |
| analyze_rel(Oid relid, RangeVar *relation, |
| VacuumParams *params, List *va_cols, bool in_outer_xact, |
| BufferAccessStrategy bstrategy, gp_acquire_sample_rows_context *ctx) |
| { |
| bool optimizerBackup; |
| |
| /* |
| * Temporarily disable ORCA because it's slow to start up, and it |
| * wouldn't come up with any better plan for the simple queries that |
| * we run. |
| */ |
| optimizerBackup = optimizer; |
| optimizer = false; |
| |
| PG_TRY(); |
| { |
| analyze_rel_internal(relid, relation, params, va_cols, |
| in_outer_xact, bstrategy, ctx); |
| } |
| /* Clean up in case of error. */ |
| PG_CATCH(); |
| { |
| optimizer = optimizerBackup; |
| |
| /* Carry on with error handling. */ |
| PG_RE_THROW(); |
| } |
| PG_END_TRY(); |
| |
| optimizer = optimizerBackup; |
| } |
| |
| static void |
| analyze_rel_internal(Oid relid, RangeVar *relation, |
| VacuumParams *params, List *va_cols, bool in_outer_xact, |
| BufferAccessStrategy bstrategy, gp_acquire_sample_rows_context *ctx) |
| { |
| Relation onerel; |
| int elevel; |
| AcquireSampleRowsFunc acquirefunc = NULL; |
| BlockNumber relpages = 0; |
| |
| /* Select logging level */ |
| if (params->options & VACOPT_VERBOSE) |
| elevel = INFO; |
| else |
| elevel = DEBUG2; |
| |
| /* Set up static variables */ |
| vac_strategy = bstrategy; |
| |
| /* |
| * Check for user-requested abort. |
| */ |
| CHECK_FOR_INTERRUPTS(); |
| |
| /* |
| * Open the relation, getting ShareUpdateExclusiveLock to ensure that two |
| * ANALYZEs don't run on it concurrently. (This also locks out a |
| * concurrent VACUUM, which doesn't matter much at the moment but might |
| * matter if we ever try to accumulate stats on dead tuples.) If the rel |
| * has been dropped since we last saw it, we don't need to process it. |
| * |
| * Make sure to generate only logs for ANALYZE in this case. |
| */ |
| onerel = vacuum_open_relation(relid, relation, params->options & ~(VACOPT_VACUUM), |
| params->log_min_duration >= 0, |
| ShareUpdateExclusiveLock); |
| |
| /* leave if relation could not be opened or locked */ |
| if (!onerel) |
| return; |
| |
| #ifdef FAULT_INJECTOR |
| FaultInjector_InjectFaultIfSet( |
| "analyze_after_hold_lock", DDLNotSpecified, |
| "", RelationGetRelationName(onerel)); |
| #endif |
| |
| /* |
| * analyze_rel can be called in 3 different contexts: explicitly by the user |
| * (eg. ANALYZE, VACUUM ANALYZE), implicitly by autovacuum, or implicitly by |
| * autostats. |
| * |
| * In the first case, we always want to make sure the user is the owner of the |
| * table. In the autovacuum case, it will be called as superuser so we don't |
| * really care, but the ownership check should always succeed. For autostats, |
| * we only do the check if gp_autostats_allow_nonowner=false, otherwise we can |
| * proceed with the analyze. |
| * |
| * This check happens also when building the relation list to analyze for a |
| * manual operation, and needs to be done additionally here as ANALYZE could |
| * happen across multiple transactions where relation ownership could have |
| * changed in-between. Make sure to generate only logs for ANALYZE in |
| * this case. |
| */ |
| |
| if (!(params->auto_stats && gp_autostats_allow_nonowner)) |
| { |
| if (!vacuum_is_relation_owner(RelationGetRelid(onerel), |
| onerel->rd_rel, |
| params->options & VACOPT_ANALYZE)) |
| { |
| { |
| relation_close(onerel, ShareUpdateExclusiveLock); |
| return; |
| } |
| } |
| } |
| |
| /* |
| * Silently ignore tables that are temp tables of other backends --- |
| * trying to analyze these is rather pointless, since their contents are |
| * probably not up-to-date on disk. (We don't throw a warning here; it |
| * would just lead to chatter during a database-wide ANALYZE.) |
| */ |
| if (RELATION_IS_OTHER_TEMP(onerel)) |
| { |
| relation_close(onerel, ShareUpdateExclusiveLock); |
| return; |
| } |
| |
| /* |
| * We can ANALYZE any table except pg_statistic. See update_attstats |
| */ |
| if (RelationGetRelid(onerel) == StatisticRelationId) |
| { |
| relation_close(onerel, ShareUpdateExclusiveLock); |
| return; |
| } |
| |
| /* |
| * Check that it's of an analyzable relkind, and set up appropriately. |
| */ |
| if (onerel->rd_rel->relkind == RELKIND_RELATION || |
| onerel->rd_rel->relkind == RELKIND_MATVIEW || |
| onerel->rd_rel->relkind == RELKIND_DIRECTORY_TABLE) |
| { |
| /* |
| * If the TableAmRoutine's gp_acquire_sample_rows_func if NULL, we use |
| * gp_acquire_sample_rows_func as default. |
| */ |
| acquirefunc = gp_acquire_sample_rows_func; |
| |
| /* Also get regular table's size */ |
| relpages = AcquireNumberOfBlocks(onerel); |
| } |
| else if (onerel->rd_rel->relkind == RELKIND_FOREIGN_TABLE) |
| { |
| /* |
| * For a foreign table, call the FDW's hook function to see whether it |
| * supports analysis. |
| */ |
| FdwRoutine *fdwroutine; |
| bool ok = false; |
| |
| fdwroutine = GetFdwRoutineForRelation(onerel, false); |
| |
| if (fdwroutine->AnalyzeForeignTable != NULL) |
| ok = fdwroutine->AnalyzeForeignTable(onerel, |
| &acquirefunc, |
| &relpages); |
| |
| if (!ok) |
| { |
| ereport(WARNING, |
| (errmsg("skipping \"%s\" --- cannot analyze this foreign table", |
| RelationGetRelationName(onerel)))); |
| relation_close(onerel, ShareUpdateExclusiveLock); |
| return; |
| } |
| } |
| else if (onerel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) |
| { |
| /* |
| * For partitioned tables, we want to do the recursive ANALYZE below. |
| */ |
| } |
| else |
| { |
| /* No need for a WARNING if we already complained during VACUUM */ |
| if (!(params->options & VACOPT_VACUUM)) |
| ereport(WARNING, |
| (errmsg("skipping \"%s\" --- cannot analyze non-tables or special system tables", |
| RelationGetRelationName(onerel)))); |
| relation_close(onerel, ShareUpdateExclusiveLock); |
| return; |
| } |
| |
| /* |
| * OK, let's do it. First, initialize progress reporting. |
| */ |
| pgstat_progress_start_command(PROGRESS_COMMAND_ANALYZE, |
| RelationGetRelid(onerel)); |
| |
| /* |
| * Do the normal non-recursive ANALYZE. We can skip this for partitioned |
| * tables, which don't contain any rows. |
| * |
| * On QE, when receiving ANALYZE request through gp_acquire_sample_rows. |
| * We should only perform do_analyze_rel for the parent table only |
| * or all it's children tables. Because, QD will send two acquire sample |
| * rows requests to QE. |
| * To distinguish the two requests, we check the ctx->inherited value here. |
| */ |
| if (onerel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE && (!ctx || !ctx->inherited)) |
| do_analyze_rel(onerel, params, va_cols, acquirefunc, |
| relpages, false, in_outer_xact, elevel, ctx); |
| |
| /* |
| * If there are child tables, do recursive ANALYZE. |
| */ |
| if (onerel->rd_rel->relhassubclass && (!ctx || ctx->inherited)) |
| do_analyze_rel(onerel, params, va_cols, acquirefunc, relpages, |
| true, in_outer_xact, elevel, ctx); |
| |
| /* MPP-6929: metadata tracking */ |
| if (!vacuumStatement_IsTemporary(onerel) && (Gp_role == GP_ROLE_DISPATCH)) |
| { |
| char *asubtype = ""; |
| |
| if (IsAutoVacuumWorkerProcess()) |
| asubtype = "AUTO"; |
| |
| MetaTrackUpdObject(RelationRelationId, |
| RelationGetRelid(onerel), |
| GetUserId(), |
| "ANALYZE", |
| asubtype |
| ); |
| } |
| |
| /* |
| * Close source relation now, but keep lock so that no one deletes it |
| * before we commit. (If someone did, they'd fail to clean up the entries |
| * we made in pg_statistic. Also, releasing the lock before commit would |
| * expose us to concurrent-update failures in update_attstats.) |
| */ |
| relation_close(onerel, NoLock); |
| |
| pgstat_progress_end_command(); |
| } |
| |
| /* |
| * do_analyze_rel() -- analyze one relation, recursively or not |
| * |
| * Note that "acquirefunc" is only relevant for the non-inherited case. |
| * For the inherited case, acquire_inherited_sample_rows() determines the |
| * appropriate acquirefunc for each child table. |
| */ |
| static void |
| do_analyze_rel(Relation onerel, VacuumParams *params, |
| List *va_cols, AcquireSampleRowsFunc acquirefunc, |
| BlockNumber relpages, bool inh, bool in_outer_xact, |
| int elevel, gp_acquire_sample_rows_context *ctx) |
| { |
| int attr_cnt, |
| tcnt, |
| i, |
| ind; |
| Relation *Irel; |
| int nindexes; |
| bool hasindex; |
| VacAttrStats **vacattrstats; |
| AnlIndexData *indexdata; |
| int targrows, |
| numrows, |
| minrows; |
| double totalrows, |
| totaldeadrows; |
| HeapTuple *rows; |
| PGRUsage ru0; |
| TimestampTz starttime = 0; |
| MemoryContext caller_context; |
| Oid save_userid; |
| int save_sec_context; |
| int save_nestlevel; |
| |
| Bitmapset **colLargeRowIndexes; |
| double *colLargeRowLength; |
| double *colNDVBySeg; |
| bool sample_needed; |
| |
| int64 AnalyzePageHit = VacuumPageHit; |
| int64 AnalyzePageMiss = VacuumPageMiss; |
| int64 AnalyzePageDirty = VacuumPageDirty; |
| PgStat_Counter startreadtime = 0; |
| PgStat_Counter startwritetime = 0; |
| int max_natts = onerel->rd_att->natts; |
| |
| |
| if (inh) |
| ereport(elevel, |
| (errmsg("analyzing \"%s.%s\" inheritance tree", |
| get_namespace_name(RelationGetNamespace(onerel)), |
| RelationGetRelationName(onerel)))); |
| else |
| ereport(elevel, |
| (errmsg("analyzing \"%s.%s\"", |
| get_namespace_name(RelationGetNamespace(onerel)), |
| RelationGetRelationName(onerel)))); |
| |
| /* |
| * Set up a working context so that we can easily free whatever junk gets |
| * created. |
| */ |
| anl_context = AllocSetContextCreate(CurrentMemoryContext, |
| "Analyze", |
| ALLOCSET_DEFAULT_SIZES); |
| caller_context = MemoryContextSwitchTo(anl_context); |
| |
| /* |
| * Switch to the table owner's userid, so that any index functions are run |
| * as that user. Also lock down security-restricted operations and |
| * arrange to make GUC variable changes local to this command. |
| */ |
| GetUserIdAndSecContext(&save_userid, &save_sec_context); |
| SetUserIdAndSecContext(onerel->rd_rel->relowner, |
| save_sec_context | SECURITY_RESTRICTED_OPERATION); |
| save_nestlevel = NewGUCNestLevel(); |
| |
| /* measure elapsed time iff autovacuum logging requires it */ |
| if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0) |
| { |
| if (track_io_timing) |
| { |
| startreadtime = pgStatBlockReadTime; |
| startwritetime = pgStatBlockWriteTime; |
| } |
| |
| pg_rusage_init(&ru0); |
| if (params->log_min_duration >= 0) |
| starttime = GetCurrentTimestamp(); |
| } |
| |
| /* |
| * Determine which columns to analyze |
| * |
| * Note that system attributes are never analyzed, so we just reject them |
| * at the lookup stage. We also reject duplicate column mentions. (We |
| * could alternatively ignore duplicates, but analyzing a column twice |
| * won't work; we'd end up making a conflicting update in pg_statistic.) |
| */ |
| if (va_cols != NIL) |
| { |
| Bitmapset *unique_cols = NULL; |
| ListCell *le; |
| |
| vacattrstats = (VacAttrStats **) palloc(list_length(va_cols) * |
| sizeof(VacAttrStats *)); |
| tcnt = 0; |
| foreach(le, va_cols) |
| { |
| char *col = strVal(lfirst(le)); |
| |
| i = attnameAttNum(onerel, col, false); |
| if (i == InvalidAttrNumber) |
| ereport(ERROR, |
| (errcode(ERRCODE_UNDEFINED_COLUMN), |
| errmsg("column \"%s\" of relation \"%s\" does not exist", |
| col, RelationGetRelationName(onerel)))); |
| if (bms_is_member(i, unique_cols)) |
| ereport(ERROR, |
| (errcode(ERRCODE_DUPLICATE_COLUMN), |
| errmsg("column \"%s\" of relation \"%s\" appears more than once", |
| col, RelationGetRelationName(onerel)))); |
| unique_cols = bms_add_member(unique_cols, i); |
| |
| vacattrstats[tcnt] = examine_attribute(onerel, i, NULL, elevel); |
| if (vacattrstats[tcnt] != NULL) |
| tcnt++; |
| } |
| attr_cnt = tcnt; |
| } |
| else |
| { |
| attr_cnt = onerel->rd_att->natts; |
| vacattrstats = (VacAttrStats **) |
| palloc(attr_cnt * sizeof(VacAttrStats *)); |
| tcnt = 0; |
| for (i = 1; i <= attr_cnt; i++) |
| { |
| vacattrstats[tcnt] = examine_attribute(onerel, i, NULL, elevel); |
| if (vacattrstats[tcnt] != NULL) |
| tcnt++; |
| } |
| attr_cnt = tcnt; |
| } |
| |
| /* |
| * Open all indexes of the relation, and see if there are any analyzable |
| * columns in the indexes. We do not analyze index columns if there was |
| * an explicit column list in the ANALYZE command, however. |
| * |
| * If we are doing a recursive scan, we don't want to touch the parent's |
| * indexes at all. If we're processing a partitioned table, we need to |
| * know if there are any indexes, but we don't want to process them. |
| */ |
| if (onerel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) |
| { |
| List *idxs = RelationGetIndexList(onerel); |
| |
| Irel = NULL; |
| nindexes = 0; |
| hasindex = idxs != NIL; |
| list_free(idxs); |
| } |
| else if (!inh) |
| { |
| vac_open_indexes(onerel, AccessShareLock, &nindexes, &Irel); |
| hasindex = nindexes > 0; |
| } |
| else |
| { |
| Irel = NULL; |
| nindexes = 0; |
| hasindex = false; |
| } |
| indexdata = NULL; |
| if (nindexes > 0) |
| { |
| indexdata = (AnlIndexData *) palloc0(nindexes * sizeof(AnlIndexData)); |
| for (ind = 0; ind < nindexes; ind++) |
| { |
| AnlIndexData *thisdata = &indexdata[ind]; |
| IndexInfo *indexInfo; |
| |
| thisdata->indexInfo = indexInfo = BuildIndexInfo(Irel[ind]); |
| thisdata->tupleFract = 1.0; /* fix later if partial */ |
| if (indexInfo->ii_Expressions != NIL && va_cols == NIL) |
| { |
| ListCell *indexpr_item = list_head(indexInfo->ii_Expressions); |
| |
| thisdata->vacattrstats = (VacAttrStats **) |
| palloc(indexInfo->ii_NumIndexAttrs * sizeof(VacAttrStats *)); |
| tcnt = 0; |
| for (i = 0; i < indexInfo->ii_NumIndexAttrs; i++) |
| { |
| int keycol = indexInfo->ii_IndexAttrNumbers[i]; |
| |
| if (keycol == 0) |
| { |
| /* Found an index expression */ |
| Node *indexkey; |
| |
| if (indexpr_item == NULL) /* shouldn't happen */ |
| elog(ERROR, "too few entries in indexprs list"); |
| indexkey = (Node *) lfirst(indexpr_item); |
| indexpr_item = lnext(indexInfo->ii_Expressions, |
| indexpr_item); |
| thisdata->vacattrstats[tcnt] = |
| examine_attribute(Irel[ind], i + 1, indexkey, elevel); |
| if (thisdata->vacattrstats[tcnt] != NULL) |
| tcnt++; |
| } |
| } |
| thisdata->attr_cnt = tcnt; |
| } |
| } |
| } |
| |
| /* |
| * Determine how many rows we need to sample, using the worst case from |
| * all analyzable columns. We use a lower bound of 100 rows to avoid |
| * possible overflow in Vitter's algorithm. (Note: that will also be the |
| * target in the corner case where there are no analyzable columns.) |
| * |
| * GPDB: If the caller specified the 'targrows', just use that. |
| */ |
| if (ctx) |
| { |
| targrows = ctx->targrows; |
| } |
| else /* funny indentation to avoid re-indenting upstream code */ |
| { |
| targrows = 100; |
| for (i = 0; i < attr_cnt; i++) |
| { |
| if (targrows < vacattrstats[i]->minrows) |
| targrows = vacattrstats[i]->minrows; |
| } |
| for (ind = 0; ind < nindexes; ind++) |
| { |
| AnlIndexData *thisdata = &indexdata[ind]; |
| |
| for (i = 0; i < thisdata->attr_cnt; i++) |
| { |
| if (targrows < thisdata->vacattrstats[i]->minrows) |
| targrows = thisdata->vacattrstats[i]->minrows; |
| } |
| } |
| } |
| |
| /* |
| * Look at extended statistics objects too, as those may define custom |
| * statistics target. So we may need to sample more rows and then build |
| * the statistics with enough detail. |
| */ |
| if (IS_QD_OR_SINGLENODE()) |
| minrows = ComputeExtStatisticsRows(onerel, attr_cnt, vacattrstats); |
| else |
| minrows = 0; |
| |
| if (targrows < minrows) |
| targrows = minrows; |
| |
| /* |
| * Maintain information if the row of a column exceeds WIDTH_THRESHOLD |
| */ |
| if (inh) |
| { |
| ListCell *le; |
| List* tableOIDs = find_all_inheritors(RelationGetRelid(onerel), NoLock, NULL); |
| foreach (le, tableOIDs) |
| { |
| Oid child_relid = lfirst_oid(le); |
| int natts = get_relnatts(child_relid); |
| if (natts > max_natts) |
| max_natts = natts; |
| } |
| } |
| colLargeRowIndexes = (Bitmapset **) palloc0(sizeof(Bitmapset *) * max_natts); |
| colLargeRowLength = (double *)palloc0(sizeof(double) * max_natts); |
| colNDVBySeg = (double *)palloc0(sizeof(double) * max_natts); |
| |
| if ((params->options & VACOPT_FULLSCAN) != 0) |
| { |
| if (onerel->rd_rel->relispartition) |
| { |
| acquire_hll_by_query(onerel, attr_cnt, vacattrstats, elevel); |
| |
| ereport(elevel, (errmsg("HLL FULL SCAN"))); |
| } |
| } |
| |
| sample_needed = needs_sample(onerel, vacattrstats, attr_cnt); |
| if (ctx || sample_needed) |
| { |
| if (ctx) |
| MemoryContextSwitchTo(caller_context); |
| rows = (HeapTuple *) palloc(targrows * sizeof(HeapTuple)); |
| |
| /* |
| * Acquire the sample rows |
| * |
| * colLargeRowIndexes is passed out-of-band, in a global variable, |
| * to avoid changing the function signature from upstream's. |
| * |
| * The same as colLargeRowIndexes. colLargeRowLength stores total |
| * length of too wide rows in the sample for every attribute of |
| * the target relation. ANALYZE ignores too wide columns during |
| * analysis(See comments of WIDTH_THRESHOLD), the stawidth can be |
| * far smaller than the real average width for varlena datums which |
| * are larger than WIDTH_THRESHOLD but stored uncompressed. |
| */ |
| acquire_func_colLargeRowIndexes = colLargeRowIndexes; |
| acquire_func_colLargeRowLength = colLargeRowLength; |
| acquire_func_colNDVBySeg = colNDVBySeg; |
| pgstat_progress_update_param(PROGRESS_ANALYZE_PHASE, |
| inh ? PROGRESS_ANALYZE_PHASE_ACQUIRE_SAMPLE_ROWS_INH : |
| PROGRESS_ANALYZE_PHASE_ACQUIRE_SAMPLE_ROWS); |
| if (inh) |
| numrows = acquire_inherited_sample_rows(onerel, elevel, |
| rows, targrows, |
| &totalrows, &totaldeadrows); |
| else |
| numrows = (*acquirefunc) (onerel, elevel, |
| rows, targrows, |
| &totalrows, &totaldeadrows); |
| acquire_func_colLargeRowIndexes = NULL; |
| acquire_func_colLargeRowLength = NULL; |
| acquire_func_colNDVBySeg = NULL; |
| if (ctx) |
| MemoryContextSwitchTo(anl_context); |
| } |
| else if (onerel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) |
| { |
| ListCell *lc; |
| List *all_children_list = find_all_inheritors(RelationGetRelid(onerel), AccessShareLock, NULL); |
| totalrows = 0; |
| totaldeadrows = 0; |
| numrows = 0; |
| rows = NULL; |
| |
| foreach(lc, all_children_list) |
| { |
| Oid pkrelid = lfirst_oid(lc); |
| if (get_rel_relkind(pkrelid) != RELKIND_RELATION) |
| continue; |
| totalrows += get_rel_reltuples(pkrelid); |
| } |
| } |
| else |
| { |
| /* If we're just merging stats from leafs, these are not needed either */ |
| totalrows = 0; |
| totaldeadrows = 0; |
| numrows = 0; |
| rows = NULL; |
| } |
| |
| if (ctx) |
| { |
| ctx->sample_rows = rows; |
| ctx->num_sample_rows = numrows; |
| ctx->totalrows = totalrows; |
| ctx->totaldeadrows = totaldeadrows; |
| } |
| |
| /* |
| * Compute the statistics. Temporary results during the calculations for |
| * each column are stored in a child context. The calc routines are |
| * responsible to make sure that whatever they store into the VacAttrStats |
| * structure is allocated in anl_context. |
| * |
| * When we have a root partition, we use the leaf partition statistics to |
| * derive root table statistics. In that case, we do not need to collect a |
| * sample. Therefore, the statistics calculation depends on root level have |
| * any tuples. In addition, we continue for statistics calculation if |
| * optimizer_analyze_root_partition or ROOTPARTITION is specified in the |
| * ANALYZE statement. |
| */ |
| if (numrows > 0 || !sample_needed) |
| { |
| HeapTuple *validRows = (HeapTuple *) palloc(numrows * sizeof(HeapTuple)); |
| MemoryContext col_context, |
| old_context; |
| bool build_ext_stats; |
| |
| pgstat_progress_update_param(PROGRESS_ANALYZE_PHASE, |
| PROGRESS_ANALYZE_PHASE_COMPUTE_STATS); |
| |
| col_context = AllocSetContextCreate(anl_context, |
| "Analyze Column", |
| ALLOCSET_DEFAULT_SIZES); |
| old_context = MemoryContextSwitchTo(col_context); |
| |
| /* |
| * Get correlations from segments. |
| */ |
| if (Gp_role == GP_ROLE_DISPATCH && GpPolicyIsPartitioned(onerel->rd_cdbpolicy)) |
| { |
| /* |
| * in gp_acquire_correlations_dispatcher we get correlations and correlationsIsNull |
| * for all columns even if stats is requested for subset of columns. |
| * It's more simpler to implement this way. |
| */ |
| float4 *correlations = (float4 *)palloc0(sizeof(float4) * onerel->rd_att->natts); |
| bool *correlationsIsNull = (bool *)palloc0(sizeof(bool) * onerel->rd_att->natts); |
| gp_acquire_correlations_dispatcher(onerel->rd_rel->oid, inh, correlations, correlationsIsNull); |
| for (i = 0; i < attr_cnt; i++) |
| { |
| VacAttrStats *stats = vacattrstats[i]; |
| stats->partitiontbl_qd = true; |
| stats->corrnull = correlationsIsNull[stats->attr->attnum - 1]; |
| stats->corrval = correlations[stats->attr->attnum - 1]; |
| } |
| pfree(correlations); |
| pfree(correlationsIsNull); |
| } |
| |
| for (i = 0; i < attr_cnt; i++) |
| { |
| VacAttrStats *stats = vacattrstats[i]; |
| |
| if (Gp_role == GP_ROLE_DISPATCH && GpPolicyIsPartitioned(onerel->rd_cdbpolicy)) |
| { |
| stats->stadistinctbyseg = colNDVBySeg[i]; |
| } |
| |
| stats->tupDesc = onerel->rd_att; |
| /* |
| * utilize hyperloglog and merge utilities to derive |
| * root table statistics by directly calling merge_leaf_stats() |
| * if all leaf partition attributes are analyzed |
| */ |
| if(stats->merge_stats) |
| { |
| (*stats->compute_stats) (stats, std_fetch_func, 0, 0); |
| MemoryContextResetAndDeleteChildren(col_context); |
| continue; |
| } |
| Assert(sample_needed); |
| |
| Bitmapset *rowIndexes = colLargeRowIndexes[stats->attr->attnum - 1]; |
| int validRowsLength; |
| |
| /* If there are too wide rows in the sample, remove them |
| * from the sample being sent for stats collection |
| */ |
| if (rowIndexes) |
| { |
| validRowsLength = 0; |
| for (int rownum = 0; rownum < numrows; rownum++) |
| { |
| /* if row is too wide, leave it out of the sample */ |
| if (bms_is_member(rownum, rowIndexes)) |
| continue; |
| |
| validRows[validRowsLength] = rows[rownum]; |
| validRowsLength++; |
| } |
| stats->rows = validRows; |
| } |
| else |
| { |
| stats->rows = rows; |
| validRowsLength = numrows; |
| } |
| AttributeOpts *aopt = |
| get_attribute_options(onerel->rd_id, stats->attr->attnum); |
| |
| /* |
| * get total length and number of too wide rows in the sample, |
| * in case get wrong stawidth. |
| */ |
| stats->totalwidelength = colLargeRowLength[stats->attr->attnum - 1]; |
| stats->widerow_num = numrows - validRowsLength; |
| |
| if (validRowsLength > 0) |
| { |
| stats->compute_stats(stats, |
| std_fetch_func, |
| validRowsLength, // numbers of rows in sample excluding toowide if any. |
| totalrows); |
| /* |
| * Store HLL/HLL fullscan information for leaf partitions in |
| * the stats object |
| */ |
| if (onerel->rd_rel->relkind == RELKIND_RELATION && onerel->rd_rel->relispartition) |
| { |
| MemoryContext old_context; |
| Datum *hll_values; |
| |
| old_context = MemoryContextSwitchTo(stats->anl_context); |
| hll_values = (Datum *) palloc(sizeof(Datum)); |
| int16 hll_length = 0; |
| int16 stakind = 0; |
| if(stats->stahll_full != NULL) |
| { |
| hll_length = datumGetSize(PointerGetDatum(stats->stahll_full), false, -1); |
| hll_values[0] = datumCopy(PointerGetDatum(stats->stahll_full), false, hll_length); |
| stakind = STATISTIC_KIND_FULLHLL; |
| } |
| else if(stats->stahll != NULL) |
| { |
| ((GpHLLCounter) (stats->stahll))->relPages = relpages; |
| ((GpHLLCounter) (stats->stahll))->relTuples = totalrows; |
| |
| hll_length = gp_hyperloglog_len((GpHLLCounter)stats->stahll); |
| hll_values[0] = datumCopy(PointerGetDatum(stats->stahll), false, hll_length); |
| stakind = STATISTIC_KIND_HLL; |
| } |
| MemoryContextSwitchTo(old_context); |
| if (stakind > 0) |
| { |
| stats->stakind[STATISTIC_NUM_SLOTS-1] = stakind; |
| stats->stavalues[STATISTIC_NUM_SLOTS-1] = hll_values; |
| stats->numvalues[STATISTIC_NUM_SLOTS-1] = 1; |
| stats->statyplen[STATISTIC_NUM_SLOTS-1] = hll_length; |
| } |
| } |
| } |
| else |
| { |
| // All the rows were too wide to be included in the sample. We cannot |
| // do much in that case, but at least we know there were no NULLs, and |
| // that every item was >= WIDTH_THRESHOLD in width. |
| stats->stats_valid = true; |
| stats->stanullfrac = 0.0; |
| stats->stawidth = stats->totalwidelength/numrows; |
| stats->stadistinct = 0.0; /* "unknown" */ |
| } |
| stats->rows = rows; // Reset to original rows |
| |
| /* |
| * If the appropriate flavor of the n_distinct option is |
| * specified, override with the corresponding value. |
| */ |
| aopt = get_attribute_options(onerel->rd_id, stats->attr->attnum); |
| if (aopt != NULL) |
| { |
| float8 n_distinct; |
| |
| n_distinct = inh ? aopt->n_distinct_inherited : aopt->n_distinct; |
| if (n_distinct != 0.0) |
| stats->stadistinct = n_distinct; |
| } |
| |
| if (Gp_role == GP_ROLE_EXECUTE) { |
| Assert(ctx->stadistincts); |
| ctx->stadistincts[i] = Float8GetDatum(stats->stadistinct); |
| } |
| |
| MemoryContextResetAndDeleteChildren(col_context); |
| } |
| |
| /* |
| * Datums exceeding WIDTH_THRESHOLD are masked as NULL in the sample, and |
| * are used as is to evaluate index statistics. It is less likely to have |
| * indexes on very wide columns, so the effect will be minimal. |
| */ |
| if (nindexes > 0) |
| compute_index_stats(onerel, totalrows, |
| indexdata, nindexes, |
| rows, numrows, |
| col_context); |
| |
| MemoryContextSwitchTo(old_context); |
| MemoryContextDelete(col_context); |
| |
| /* |
| * Emit the completed stats rows into pg_statistic, replacing any |
| * previous statistics for the target columns. (If there are stats in |
| * pg_statistic for columns we didn't process, we leave them alone.) |
| */ |
| update_attstats(RelationGetRelid(onerel), inh, |
| attr_cnt, vacattrstats); |
| |
| for (ind = 0; ind < nindexes; ind++) |
| { |
| AnlIndexData *thisdata = &indexdata[ind]; |
| |
| update_attstats(RelationGetRelid(Irel[ind]), false, |
| thisdata->attr_cnt, thisdata->vacattrstats); |
| } |
| |
| /* |
| * Should we build extended statistics for this relation? |
| * |
| * The extended statistics catalog does not include an inheritance |
| * flag, so we can't store statistics built both with and without |
| * data from child relations. We can store just one set of statistics |
| * per relation. For plain relations that's fine, but for inheritance |
| * trees we have to pick whether to store statistics for just the |
| * one relation or the whole tree. For plain inheritance we store |
| * the (!inh) version, mostly for backwards compatibility reasons. |
| * For partitioned tables that's pointless (the non-leaf tables are |
| * always empty), so we store stats representing the whole tree. |
| */ |
| build_ext_stats = (onerel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) ? inh : (!inh); |
| |
| /* |
| * Build extended statistics (if there are any). |
| * |
| * For now we only build extended statistics on individual relations, |
| * not for relations representing inheritance trees. |
| */ |
| if (build_ext_stats) |
| BuildRelationExtStatistics(onerel, totalrows, numrows, rows, |
| attr_cnt, vacattrstats); |
| } |
| |
| pgstat_progress_update_param(PROGRESS_ANALYZE_PHASE, |
| PROGRESS_ANALYZE_PHASE_FINALIZE_ANALYZE); |
| |
| /* |
| * Update pages/tuples stats in pg_class ... but not if we're doing |
| * inherited stats. |
| * |
| * GPDB_92_MERGE_FIXME: In postgres it is sufficient to check the number of |
| * pages that are visible with visibilitymap_count(), but in GPDB this |
| * needs to be the count of all pages marked all visible across the all the |
| * QEs. We need to gather this information from the segments and then update |
| * it here. |
| * We assume that VACUUM hasn't set pg_class.reltuples already, even |
| * during a VACUUM ANALYZE. Although VACUUM often updates pg_class, |
| * exceptions exist. A "VACUUM (ANALYZE, INDEX_CLEANUP OFF)" command will |
| * never update pg_class entries for index relations. It's also possible |
| * that an individual index's pg_class entry won't be updated during |
| * VACUUM if the index AM returns NULL from its amvacuumcleanup() routine. |
| */ |
| if (!inh) |
| { |
| BlockNumber relallvisible; |
| |
| if (RelationStorageIsAO(onerel)) |
| relallvisible = 0; |
| else |
| relallvisible = AcquireNumberOfAllVisibleBlocks(onerel); |
| |
| /* Update pg_class for table relation */ |
| vac_update_relstats(onerel, |
| relpages, |
| totalrows, |
| relallvisible, |
| hasindex, |
| InvalidTransactionId, |
| InvalidMultiXactId, |
| in_outer_xact, |
| false /* isVacuum */); |
| |
| /* Update pg_appendonly for ao tables */ |
| if (RelationIsAppendOptimized(onerel)) |
| { |
| Relation aorel; |
| Oid aorelid = RelationGetRelid(onerel); |
| HeapTuple aotup; |
| Form_pg_appendonly aoform; |
| int16 ao_segfile_count = 0; |
| |
| aotup = SearchSysCache1(AORELID, ObjectIdGetDatum(aorelid)); |
| if (!HeapTupleIsValid(aotup)) |
| ereport(ERROR, |
| (errcode(ERRCODE_UNDEFINED_OBJECT), |
| errmsg("appendonly table relid %u does not exist in pg_appendonly", aorelid))); |
| |
| aoform = (Form_pg_appendonly) GETSTRUCT(aotup); |
| if (aoform->segfilecount < MAX_AOREL_CONCURRENCY) |
| { |
| ao_segfile_count = AcquireCountOfSegmentFile(onerel); |
| if (aoform->segfilecount != ao_segfile_count) |
| { |
| aorel = table_open(AppendOnlyRelationId, RowExclusiveLock); |
| aoform->segfilecount = ao_segfile_count; |
| heap_inplace_update(aorel, aotup); |
| table_close(aorel, RowExclusiveLock); |
| } |
| } |
| ReleaseSysCache(aotup); |
| } |
| |
| /* Same for indexes */ |
| for (ind = 0; ind < nindexes; ind++) |
| { |
| AnlIndexData *thisdata = &indexdata[ind]; |
| double totalindexrows; |
| BlockNumber estimatedIndexPages; |
| |
| if (totalrows < 1.0) |
| { |
| /** |
| * If there are no rows in the relation, no point trying to estimate |
| * number of pages in the index. |
| */ |
| elog(elevel, "ANALYZE skipping index %s since relation %s has no rows.", |
| RelationGetRelationName(Irel[ind]), RelationGetRelationName(onerel)); |
| estimatedIndexPages = 1; |
| } |
| else |
| { |
| /** |
| * NOTE: we don't attempt to estimate the number of tuples in an index. |
| * We will assume it to be equal to the estimated number of tuples in the relation. |
| * This does not hold for partial indexes. The number of tuples matching will be |
| * derived in selfuncs.c using the base table statistics. |
| */ |
| estimatedIndexPages = acquire_index_number_of_blocks(Irel[ind], onerel); |
| elog(elevel, "ANALYZE estimated relpages=%u for index %s", |
| estimatedIndexPages, RelationGetRelationName(Irel[ind])); |
| } |
| |
| totalindexrows = ceil(thisdata->tupleFract * totalrows); |
| vac_update_relstats(Irel[ind], |
| estimatedIndexPages, |
| totalindexrows, |
| 0, |
| false, |
| InvalidTransactionId, |
| InvalidMultiXactId, |
| in_outer_xact, |
| false /* isVacuum */); |
| } |
| } |
| else if (onerel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) |
| { |
| /* |
| * Partitioned tables don't have storage, so we don't set any fields |
| * in their pg_class entries except for reltuples and relhasindex. |
| */ |
| vac_update_relstats(onerel, -1, totalrows, |
| 0, hasindex, InvalidTransactionId, |
| InvalidMultiXactId, |
| in_outer_xact, |
| false /* isVacuum */); |
| } |
| |
| /* |
| * Now report ANALYZE to the stats collector. For regular tables, we do |
| * it only if not doing inherited stats. For partitioned tables, we only |
| * do it for inherited stats. (We're never called for not-inherited stats |
| * on partitioned tables anyway.) |
| * |
| * Reset the changes_since_analyze counter only if we analyzed all |
| * columns; otherwise, there is still work for auto-analyze to do. |
| */ |
| if (!inh) |
| pgstat_report_analyze(onerel, totalrows, totaldeadrows, |
| (va_cols == NIL)); |
| else if (onerel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) |
| pgstat_report_analyze(onerel, 0, 0, (va_cols == NIL)); |
| |
| /* |
| * If this isn't part of VACUUM ANALYZE, let index AMs do cleanup. |
| * |
| * Note that most index AMs perform a no-op as a matter of policy for |
| * amvacuumcleanup() when called in ANALYZE-only mode. The only exception |
| * among core index AMs is GIN/ginvacuumcleanup(). |
| */ |
| if (!(params->options & VACOPT_VACUUM)) |
| { |
| for (ind = 0; ind < nindexes; ind++) |
| { |
| IndexBulkDeleteResult *stats; |
| IndexVacuumInfo ivinfo; |
| |
| ivinfo.index = Irel[ind]; |
| ivinfo.analyze_only = true; |
| ivinfo.estimated_count = true; |
| ivinfo.message_level = elevel; |
| ivinfo.num_heap_tuples = onerel->rd_rel->reltuples; |
| ivinfo.strategy = vac_strategy; |
| |
| stats = index_vacuum_cleanup(&ivinfo, NULL); |
| |
| if (stats) |
| pfree(stats); |
| } |
| } |
| |
| /* Done with indexes */ |
| vac_close_indexes(nindexes, Irel, NoLock); |
| |
| /* Log the action if appropriate */ |
| if (IsAutoVacuumWorkerProcess() && params->log_min_duration >= 0) |
| { |
| TimestampTz endtime = GetCurrentTimestamp(); |
| |
| if (params->log_min_duration == 0 || |
| TimestampDifferenceExceeds(starttime, endtime, |
| params->log_min_duration)) |
| { |
| long delay_in_ms; |
| double read_rate = 0; |
| double write_rate = 0; |
| StringInfoData buf; |
| |
| /* |
| * Calculate the difference in the Page Hit/Miss/Dirty that |
| * happened as part of the analyze by subtracting out the |
| * pre-analyze values which we saved above. |
| */ |
| AnalyzePageHit = VacuumPageHit - AnalyzePageHit; |
| AnalyzePageMiss = VacuumPageMiss - AnalyzePageMiss; |
| AnalyzePageDirty = VacuumPageDirty - AnalyzePageDirty; |
| |
| /* |
| * We do not expect an analyze to take > 25 days and it simplifies |
| * things a bit to use TimestampDifferenceMilliseconds. |
| */ |
| delay_in_ms = TimestampDifferenceMilliseconds(starttime, endtime); |
| |
| /* |
| * Note that we are reporting these read/write rates in the same |
| * manner as VACUUM does, which means that while the 'average read |
| * rate' here actually corresponds to page misses and resulting |
| * reads which are also picked up by track_io_timing, if enabled, |
| * the 'average write rate' is actually talking about the rate of |
| * pages being dirtied, not being written out, so it's typical to |
| * have a non-zero 'avg write rate' while I/O timings only reports |
| * reads. |
| * |
| * It's not clear that an ANALYZE will ever result in |
| * FlushBuffer() being called, but we track and support reporting |
| * on I/O write time in case that changes as it's practically free |
| * to do so anyway. |
| */ |
| |
| if (delay_in_ms > 0) |
| { |
| read_rate = (double) BLCKSZ * AnalyzePageMiss / (1024 * 1024) / |
| (delay_in_ms / 1000.0); |
| write_rate = (double) BLCKSZ * AnalyzePageDirty / (1024 * 1024) / |
| (delay_in_ms / 1000.0); |
| } |
| |
| /* |
| * We split this up so we don't emit empty I/O timing values when |
| * track_io_timing isn't enabled. |
| */ |
| |
| initStringInfo(&buf); |
| appendStringInfo(&buf, _("automatic analyze of table \"%s.%s.%s\"\n"), |
| get_database_name(MyDatabaseId), |
| get_namespace_name(RelationGetNamespace(onerel)), |
| RelationGetRelationName(onerel)); |
| if (track_io_timing) |
| { |
| double read_ms = (double) (pgStatBlockReadTime - startreadtime) / 1000; |
| double write_ms = (double) (pgStatBlockWriteTime - startwritetime) / 1000; |
| |
| appendStringInfo(&buf, _("I/O timings: read: %.3f ms, write: %.3f ms\n"), |
| read_ms, write_ms); |
| } |
| appendStringInfo(&buf, _("avg read rate: %.3f MB/s, avg write rate: %.3f MB/s\n"), |
| read_rate, write_rate); |
| appendStringInfo(&buf, _("buffer usage: %lld hits, %lld misses, %lld dirtied\n"), |
| (long long) AnalyzePageHit, |
| (long long) AnalyzePageMiss, |
| (long long) AnalyzePageDirty); |
| appendStringInfo(&buf, _("system usage: %s"), pg_rusage_show(&ru0)); |
| |
| ereport(LOG, |
| (errmsg_internal("%s", buf.data))); |
| |
| pfree(buf.data); |
| } |
| } |
| |
| /* Roll back any GUC changes executed by index functions */ |
| AtEOXact_GUC(false, save_nestlevel); |
| |
| /* Restore userid and security context */ |
| SetUserIdAndSecContext(save_userid, save_sec_context); |
| |
| /* Restore current context and release memory */ |
| MemoryContextSwitchTo(caller_context); |
| MemoryContextDelete(anl_context); |
| anl_context = NULL; |
| } |
| |
| /* |
| * Compute statistics about indexes of a relation |
| */ |
| static void |
| compute_index_stats(Relation onerel, double totalrows, |
| AnlIndexData *indexdata, int nindexes, |
| HeapTuple *rows, int numrows, |
| MemoryContext col_context) |
| { |
| MemoryContext ind_context, |
| old_context; |
| Datum values[INDEX_MAX_KEYS]; |
| bool isnull[INDEX_MAX_KEYS]; |
| int ind, |
| i; |
| |
| ind_context = AllocSetContextCreate(anl_context, |
| "Analyze Index", |
| ALLOCSET_DEFAULT_SIZES); |
| old_context = MemoryContextSwitchTo(ind_context); |
| |
| for (ind = 0; ind < nindexes; ind++) |
| { |
| AnlIndexData *thisdata = &indexdata[ind]; |
| IndexInfo *indexInfo = thisdata->indexInfo; |
| int attr_cnt = thisdata->attr_cnt; |
| TupleTableSlot *slot; |
| EState *estate; |
| ExprContext *econtext; |
| ExprState *predicate; |
| Datum *exprvals; |
| bool *exprnulls; |
| int numindexrows, |
| tcnt, |
| rowno; |
| double totalindexrows; |
| |
| /* Ignore index if no columns to analyze and not partial */ |
| if (attr_cnt == 0 && indexInfo->ii_Predicate == NIL) |
| continue; |
| |
| /* |
| * Need an EState for evaluation of index expressions and |
| * partial-index predicates. Create it in the per-index context to be |
| * sure it gets cleaned up at the bottom of the loop. |
| */ |
| estate = CreateExecutorState(); |
| econtext = GetPerTupleExprContext(estate); |
| /* Need a slot to hold the current heap tuple, too */ |
| slot = MakeSingleTupleTableSlot(RelationGetDescr(onerel), |
| &TTSOpsHeapTuple); |
| |
| /* Arrange for econtext's scan tuple to be the tuple under test */ |
| econtext->ecxt_scantuple = slot; |
| |
| /* Set up execution state for predicate. */ |
| predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); |
| |
| /* Compute and save index expression values */ |
| exprvals = (Datum *) palloc(numrows * attr_cnt * sizeof(Datum)); |
| exprnulls = (bool *) palloc(numrows * attr_cnt * sizeof(bool)); |
| numindexrows = 0; |
| tcnt = 0; |
| for (rowno = 0; rowno < numrows; rowno++) |
| { |
| HeapTuple heapTuple = rows[rowno]; |
| |
| vacuum_delay_point(); |
| |
| /* |
| * Reset the per-tuple context each time, to reclaim any cruft |
| * left behind by evaluating the predicate or index expressions. |
| */ |
| ResetExprContext(econtext); |
| |
| /* Set up for predicate or expression evaluation */ |
| ExecStoreHeapTuple(heapTuple, slot, false); |
| |
| /* If index is partial, check predicate */ |
| if (predicate != NULL) |
| { |
| if (!ExecQual(predicate, econtext)) |
| continue; |
| } |
| numindexrows++; |
| |
| if (attr_cnt > 0) |
| { |
| /* |
| * Evaluate the index row to compute expression values. We |
| * could do this by hand, but FormIndexDatum is convenient. |
| */ |
| FormIndexDatum(indexInfo, |
| slot, |
| estate, |
| values, |
| isnull); |
| |
| /* |
| * Save just the columns we care about. We copy the values |
| * into ind_context from the estate's per-tuple context. |
| */ |
| for (i = 0; i < attr_cnt; i++) |
| { |
| VacAttrStats *stats = thisdata->vacattrstats[i]; |
| int attnum = stats->attr->attnum; |
| |
| if (isnull[attnum - 1]) |
| { |
| exprvals[tcnt] = (Datum) 0; |
| exprnulls[tcnt] = true; |
| } |
| else |
| { |
| exprvals[tcnt] = datumCopy(values[attnum - 1], |
| stats->attrtype->typbyval, |
| stats->attrtype->typlen); |
| exprnulls[tcnt] = false; |
| } |
| tcnt++; |
| } |
| } |
| } |
| |
| /* |
| * Having counted the number of rows that pass the predicate in the |
| * sample, we can estimate the total number of rows in the index. |
| */ |
| thisdata->tupleFract = (double) numindexrows / (double) numrows; |
| totalindexrows = ceil(thisdata->tupleFract * totalrows); |
| |
| /* |
| * Now we can compute the statistics for the expression columns. |
| */ |
| if (numindexrows > 0) |
| { |
| MemoryContextSwitchTo(col_context); |
| for (i = 0; i < attr_cnt; i++) |
| { |
| VacAttrStats *stats = thisdata->vacattrstats[i]; |
| |
| stats->exprvals = exprvals + i; |
| stats->exprnulls = exprnulls + i; |
| stats->rowstride = attr_cnt; |
| stats->compute_stats(stats, |
| ind_fetch_func, |
| numindexrows, |
| totalindexrows); |
| |
| MemoryContextResetAndDeleteChildren(col_context); |
| } |
| } |
| |
| /* And clean up */ |
| MemoryContextSwitchTo(ind_context); |
| |
| ExecDropSingleTupleTableSlot(slot); |
| FreeExecutorState(estate); |
| MemoryContextResetAndDeleteChildren(ind_context); |
| } |
| |
| MemoryContextSwitchTo(old_context); |
| MemoryContextDelete(ind_context); |
| } |
| |
| /* |
| * examine_attribute -- pre-analysis of a single column |
| * |
| * Determine whether the column is analyzable; if so, create and initialize |
| * a VacAttrStats struct for it. If not, return NULL. |
| * |
| * If index_expr isn't NULL, then we're trying to analyze an expression index, |
| * and index_expr is the expression tree representing the column's data. |
| */ |
| static VacAttrStats * |
| examine_attribute(Relation onerel, int attnum, Node *index_expr, int elevel) |
| { |
| Form_pg_attribute attr = TupleDescAttr(onerel->rd_att, attnum - 1); |
| HeapTuple typtuple; |
| VacAttrStats *stats; |
| int i; |
| bool ok; |
| |
| /* Never analyze dropped columns */ |
| if (attr->attisdropped) |
| return NULL; |
| |
| /* Don't analyze column if user has specified not to */ |
| if (attr->attstattarget == 0) |
| return NULL; |
| |
| /* |
| * Create the VacAttrStats struct. Note that we only have a copy of the |
| * fixed fields of the pg_attribute tuple. |
| */ |
| stats = (VacAttrStats *) palloc0(sizeof(VacAttrStats)); |
| stats->elevel = elevel; |
| stats->attr = (Form_pg_attribute) palloc(ATTRIBUTE_FIXED_PART_SIZE); |
| memcpy(stats->attr, attr, ATTRIBUTE_FIXED_PART_SIZE); |
| |
| /* |
| * When analyzing an expression index, believe the expression tree's type |
| * not the column datatype --- the latter might be the opckeytype storage |
| * type of the opclass, which is not interesting for our purposes. (Note: |
| * if we did anything with non-expression index columns, we'd need to |
| * figure out where to get the correct type info from, but for now that's |
| * not a problem.) It's not clear whether anyone will care about the |
| * typmod, but we store that too just in case. |
| */ |
| if (index_expr) |
| { |
| stats->attrtypid = exprType(index_expr); |
| stats->attrtypmod = exprTypmod(index_expr); |
| |
| /* |
| * If a collation has been specified for the index column, use that in |
| * preference to anything else; but if not, fall back to whatever we |
| * can get from the expression. |
| */ |
| if (OidIsValid(onerel->rd_indcollation[attnum - 1])) |
| stats->attrcollid = onerel->rd_indcollation[attnum - 1]; |
| else |
| stats->attrcollid = exprCollation(index_expr); |
| } |
| else |
| { |
| stats->attrtypid = attr->atttypid; |
| stats->attrtypmod = attr->atttypmod; |
| stats->attrcollid = attr->attcollation; |
| } |
| |
| typtuple = SearchSysCacheCopy1(TYPEOID, |
| ObjectIdGetDatum(stats->attrtypid)); |
| if (!HeapTupleIsValid(typtuple)) |
| elog(ERROR, "cache lookup failed for type %u", stats->attrtypid); |
| stats->attrtype = (Form_pg_type) GETSTRUCT(typtuple); |
| stats->anl_context = anl_context; |
| stats->tupattnum = attnum; |
| |
| /* |
| * The fields describing the stats->stavalues[n] element types default to |
| * the type of the data being analyzed, but the type-specific typanalyze |
| * function can change them if it wants to store something else. |
| */ |
| for (i = 0; i < STATISTIC_NUM_SLOTS-1; i++) |
| { |
| stats->statypid[i] = stats->attrtypid; |
| stats->statyplen[i] = stats->attrtype->typlen; |
| stats->statypbyval[i] = stats->attrtype->typbyval; |
| stats->statypalign[i] = stats->attrtype->typalign; |
| } |
| |
| /* |
| * The last slots of statistics is reserved for hyperloglog counter which |
| * is saved as a bytea. Therefore the type information is hardcoded for the |
| * bytea. |
| */ |
| stats->statypid[i] = BYTEAOID; // oid for bytea |
| stats->statyplen[i] = -1; // variable length type |
| stats->statypbyval[i] = false; // bytea is pass by reference |
| stats->statypalign[i] = 'i'; // INT alignment (4-byte) |
| |
| /* |
| * Call the type-specific typanalyze function. If none is specified, use |
| * std_typanalyze(). |
| */ |
| if (OidIsValid(stats->attrtype->typanalyze)) |
| ok = DatumGetBool(OidFunctionCall1(stats->attrtype->typanalyze, |
| PointerGetDatum(stats))); |
| else |
| ok = std_typanalyze(stats); |
| |
| stats->corrnull = true; |
| stats->corrval = 0; |
| stats->partitiontbl_qd = false; |
| |
| if (!ok || stats->compute_stats == NULL || stats->minrows <= 0) |
| { |
| heap_freetuple(typtuple); |
| pfree(stats->attr); |
| pfree(stats); |
| return NULL; |
| } |
| |
| return stats; |
| } |
| |
| /* |
| * GPDB: If we are the dispatcher, then issue analyze on the segments and |
| * collect the statistics from them. |
| */ |
| int |
| gp_acquire_sample_rows_func(Relation onerel, int elevel, |
| HeapTuple *rows, int targrows, |
| double *totalrows, double *totaldeadrows) |
| { |
| if (Gp_role == GP_ROLE_DISPATCH && |
| onerel->rd_cdbpolicy && !GpPolicyIsEntry(onerel->rd_cdbpolicy)) |
| { |
| /* Fetch sample from the segments. */ |
| return acquire_sample_rows_dispatcher(onerel, false, elevel, |
| rows, targrows, |
| totalrows, totaldeadrows); |
| } |
| |
| /* |
| * if relation_acquire_sample_rows exist, we use it directly. |
| * Otherwise, use the acquire_sample_rows by default. |
| */ |
| if (onerel->rd_tableam->relation_acquire_sample_rows) |
| return table_relation_acquire_sample_rows(onerel, elevel, rows, |
| targrows, totalrows, totaldeadrows); |
| |
| return acquire_sample_rows(onerel, elevel, rows, |
| targrows, totalrows, totaldeadrows); |
| } |
| |
| /* |
| * acquire_sample_rows -- acquire a random sample of rows from the table |
| * |
| * Selected rows are returned in the caller-allocated array rows[], which |
| * must have at least targrows entries. |
| * The actual number of rows selected is returned as the function result. |
| * We also estimate the total numbers of live and dead rows in the table, |
| * and return them into *totalrows and *totaldeadrows, respectively. |
| * |
| * The returned list of tuples is in order by physical position in the table. |
| * (We will rely on this later to derive correlation estimates.) |
| * |
| * As of May 2004 we use a new two-stage method: Stage one selects up |
| * to targrows random blocks (or all blocks, if there aren't so many). |
| * Stage two scans these blocks and uses the Vitter algorithm to create |
| * a random sample of targrows rows (or less, if there are less in the |
| * sample of blocks). The two stages are executed simultaneously: each |
| * block is processed as soon as stage one returns its number and while |
| * the rows are read stage two controls which ones are to be inserted |
| * into the sample. |
| * |
| * Although every row has an equal chance of ending up in the final |
| * sample, this sampling method is not perfect: not every possible |
| * sample has an equal chance of being selected. For large relations |
| * the number of different blocks represented by the sample tends to be |
| * too small. We can live with that for now. Improvements are welcome. |
| * |
| * An important property of this sampling method is that because we do |
| * look at a statistically unbiased set of blocks, we should get |
| * unbiased estimates of the average numbers of live and dead rows per |
| * block. The previous sampling method put too much credence in the row |
| * density near the start of the table. |
| */ |
| static int |
| acquire_sample_rows(Relation onerel, int elevel, |
| HeapTuple *rows, int targrows, |
| double *totalrows, double *totaldeadrows) |
| { |
| int numrows = 0; /* # rows now in reservoir */ |
| double samplerows = 0; /* total # rows collected */ |
| double liverows = 0; /* # live rows seen */ |
| double deadrows = 0; /* # dead rows seen */ |
| double rowstoskip = -1; /* -1 means not set yet */ |
| long randseed; /* Seed for block sampler(s) */ |
| BlockNumber totalblocks; |
| TransactionId OldestXmin; |
| BlockSamplerData bs; |
| ReservoirStateData rstate; |
| TupleTableSlot *slot; |
| TableScanDesc scan; |
| BlockNumber nblocks; |
| BlockNumber blksdone = 0; |
| #ifdef USE_PREFETCH |
| int prefetch_maximum = 0; /* blocks to prefetch if enabled */ |
| BlockSamplerData prefetch_bs; |
| #endif |
| |
| Assert(targrows > 0); |
| |
| /* |
| * GPDB: Legacy analyze does make a lot of assumptions regarding the file |
| * layout of a relation. These assumptions are heap specific and do not hold |
| * for AO/AOCO relations. In the case of AO/AOCO, what is actually needed and |
| * used instead of number of blocks, is number of tuples. Moreover, BlockNumber |
| * is uint32 and Number of tuples is uint64. That means that after row number |
| * UINT_MAX we will never analyze the table. |
| * |
| * We introduced a tuple based sampling approach for AO/CO tables to address |
| * above problems, all corresponding logics were moved out of here and enclosed |
| * in table_relation_acquire_sample_rows(). So leave here an assertion to ensure |
| * the relation should not be an AO/CO table. |
| */ |
| Assert(!RelationIsAppendOptimized(onerel)); |
| if (RelationIsPax(onerel)) |
| { |
| /* PAX use non-fixed block layout */ |
| BlockNumber pages; |
| double tuples; |
| double allvisfrac; |
| int32 attr_widths; |
| |
| table_relation_estimate_size(onerel, &attr_widths, &pages, |
| &tuples, &allvisfrac); |
| |
| if (tuples > UINT_MAX) |
| tuples = UINT_MAX; |
| |
| totalblocks = (BlockNumber)tuples; |
| } |
| else |
| totalblocks = RelationGetNumberOfBlocks(onerel); |
| |
| /* Need a cutoff xmin for HeapTupleSatisfiesVacuum */ |
| OldestXmin = GetOldestNonRemovableTransactionId(onerel); |
| |
| /* Prepare for sampling block numbers */ |
| randseed = random(); |
| nblocks = BlockSampler_Init(&bs, totalblocks, targrows, randseed); |
| |
| #ifdef USE_PREFETCH |
| /* |
| * GPDB_14_MERGE_FIXME: pre-fetching is introduced in PG14. |
| * It's not suitable for AO tables, will get error when PrefetchBuffer |
| * because Blocks is not same as Heap tables. |
| * Set prefetch_maximum to zero seems the easiest way to bypass. |
| */ |
| if (RelationIsNonblockRelation(onerel)) |
| { |
| prefetch_maximum = 0; |
| } |
| else |
| { |
| prefetch_maximum = get_tablespace_maintenance_io_concurrency(onerel->rd_rel->reltablespace); |
| /* Create another BlockSampler, using the same seed, for prefetching */ |
| if (prefetch_maximum) |
| (void) BlockSampler_Init(&prefetch_bs, totalblocks, targrows, randseed); |
| } |
| #endif |
| |
| /* Report sampling block numbers */ |
| pgstat_progress_update_param(PROGRESS_ANALYZE_BLOCKS_TOTAL, |
| nblocks); |
| |
| /* Prepare for sampling rows */ |
| reservoir_init_selection_state(&rstate, targrows); |
| |
| scan = table_beginscan_analyze(onerel); |
| slot = table_slot_create(onerel, NULL); |
| |
| #ifdef USE_PREFETCH |
| |
| /* |
| * If we are doing prefetching, then go ahead and tell the kernel about |
| * the first set of pages we are going to want. This also moves our |
| * iterator out ahead of the main one being used, where we will keep it so |
| * that we're always pre-fetching out prefetch_maximum number of blocks |
| * ahead. |
| */ |
| if (prefetch_maximum) |
| { |
| for (int i = 0; i < prefetch_maximum; i++) |
| { |
| BlockNumber prefetch_block; |
| |
| if (!BlockSampler_HasMore(&prefetch_bs)) |
| break; |
| |
| prefetch_block = BlockSampler_Next(&prefetch_bs); |
| PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, prefetch_block); |
| } |
| } |
| #endif |
| |
| /* Outer loop over blocks to sample */ |
| while (BlockSampler_HasMore(&bs)) |
| { |
| bool block_accepted; |
| BlockNumber targblock = BlockSampler_Next(&bs); |
| #ifdef USE_PREFETCH |
| BlockNumber prefetch_targblock = InvalidBlockNumber; |
| |
| /* |
| * Make sure that every time the main BlockSampler is moved forward |
| * that our prefetch BlockSampler also gets moved forward, so that we |
| * always stay out ahead. |
| */ |
| if (prefetch_maximum && BlockSampler_HasMore(&prefetch_bs)) |
| prefetch_targblock = BlockSampler_Next(&prefetch_bs); |
| #endif |
| |
| vacuum_delay_point(); |
| |
| block_accepted = table_scan_analyze_next_block(scan, targblock, vac_strategy); |
| |
| #ifdef USE_PREFETCH |
| |
| /* |
| * When pre-fetching, after we get a block, tell the kernel about the |
| * next one we will want, if there's any left. |
| * |
| * We want to do this even if the table_scan_analyze_next_block() call |
| * above decides against analyzing the block it picked. |
| */ |
| if (prefetch_maximum && prefetch_targblock != InvalidBlockNumber) |
| PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, prefetch_targblock); |
| #endif |
| |
| /* |
| * Don't analyze if table_scan_analyze_next_block() indicated this |
| * block is unsuitable for analyzing. |
| */ |
| if (!block_accepted) |
| continue; |
| |
| while (table_scan_analyze_next_tuple(scan, OldestXmin, &liverows, &deadrows, slot)) |
| { |
| /* |
| * The first targrows sample rows are simply copied into the |
| * reservoir. Then we start replacing tuples in the sample until |
| * we reach the end of the relation. This algorithm is from Jeff |
| * Vitter's paper (see full citation in utils/misc/sampling.c). It |
| * works by repeatedly computing the number of tuples to skip |
| * before selecting a tuple, which replaces a randomly chosen |
| * element of the reservoir (current set of tuples). At all times |
| * the reservoir is a true random sample of the tuples we've |
| * passed over so far, so when we fall off the end of the relation |
| * we're done. |
| */ |
| if (numrows < targrows) |
| rows[numrows++] = ExecCopySlotHeapTuple(slot); |
| else |
| { |
| /* |
| * t in Vitter's paper is the number of records already |
| * processed. If we need to compute a new S value, we must |
| * use the not-yet-incremented value of samplerows as t. |
| */ |
| if (rowstoskip < 0) |
| rowstoskip = reservoir_get_next_S(&rstate, samplerows, targrows); |
| |
| if (rowstoskip <= 0) |
| { |
| /* |
| * Found a suitable tuple, so save it, replacing one old |
| * tuple at random |
| */ |
| int k = (int) (targrows * sampler_random_fract(rstate.randstate)); |
| |
| Assert(k >= 0 && k < targrows); |
| heap_freetuple(rows[k]); |
| rows[k] = ExecCopySlotHeapTuple(slot); |
| } |
| |
| rowstoskip -= 1; |
| } |
| |
| samplerows += 1; |
| } |
| |
| pgstat_progress_update_param(PROGRESS_ANALYZE_BLOCKS_DONE, |
| ++blksdone); |
| SIMPLE_FAULT_INJECTOR("analyze_block"); |
| } |
| |
| ExecDropSingleTupleTableSlot(slot); |
| table_endscan(scan); |
| |
| /* |
| * If we didn't find as many tuples as we wanted then we're done. No sort |
| * is needed, since they're already in order. |
| * |
| * Otherwise we need to sort the collected tuples by position |
| * (itempointer). It's not worth worrying about corner cases where the |
| * tuples are already sorted. |
| */ |
| if (numrows == targrows) |
| qsort((void *) rows, numrows, sizeof(HeapTuple), compare_rows); |
| |
| /* |
| * Estimate total numbers of live and dead rows in relation, extrapolating |
| * on the assumption that the average tuple density in pages we didn't |
| * scan is the same as in the pages we did scan. Since what we scanned is |
| * a random sample of the pages in the relation, this should be a good |
| * assumption. |
| */ |
| if (bs.m > 0) |
| { |
| *totalrows = floor((liverows / bs.m) * totalblocks + 0.5); |
| *totaldeadrows = floor((deadrows / bs.m) * totalblocks + 0.5); |
| } |
| else |
| { |
| *totalrows = 0.0; |
| *totaldeadrows = 0.0; |
| } |
| |
| /* |
| * Emit some interesting relation info |
| */ |
| ereport(elevel, |
| (errmsg("\"%s\": scanned %d of %u pages, " |
| "containing %.0f live rows and %.0f dead rows; " |
| "%d rows in sample, %.0f estimated total rows", |
| RelationGetRelationName(onerel), |
| bs.m, totalblocks, |
| liverows, deadrows, |
| numrows, *totalrows))); |
| |
| return numrows; |
| } |
| |
| /* |
| * qsort comparator for sorting rows[] array |
| */ |
| static int |
| compare_rows(const void *a, const void *b) |
| { |
| HeapTuple ha = *(const HeapTuple *) a; |
| HeapTuple hb = *(const HeapTuple *) b; |
| |
| /* |
| * For AO/AOCO tables, blocknumber does not have a meaning and is not set. |
| * We leave this workaround here for legacy AO/CO analyze still working. |
| * There should be no apparent and measurable perfomance hit from calling |
| * this function. |
| * |
| * The AO/CO Fast Analyze won't reach here anymore. |
| */ |
| if (!BlockNumberIsValid(ItemPointerGetBlockNumberNoCheck(&ha->t_self))) |
| return 0; |
| |
| BlockNumber ba = ItemPointerGetBlockNumber(&ha->t_self); |
| OffsetNumber oa = ItemPointerGetOffsetNumber(&ha->t_self); |
| BlockNumber bb = ItemPointerGetBlockNumber(&hb->t_self); |
| OffsetNumber ob = ItemPointerGetOffsetNumber(&hb->t_self); |
| |
| if (ba < bb) |
| return -1; |
| if (ba > bb) |
| return 1; |
| if (oa < ob) |
| return -1; |
| if (oa > ob) |
| return 1; |
| return 0; |
| } |
| |
| |
| /* |
| * acquire_inherited_sample_rows -- acquire sample rows from inheritance tree |
| * |
| * This has the same API as acquire_sample_rows, except that rows are |
| * collected from all inheritance children as well as the specified table. |
| * We fail and return zero if there are no inheritance children, or if all |
| * children are foreign tables that don't support ANALYZE. |
| */ |
| int |
| acquire_inherited_sample_rows(Relation onerel, int elevel, |
| HeapTuple *rows, int targrows, |
| double *totalrows, double *totaldeadrows) |
| { |
| List *tableOIDs; |
| Relation *rels; |
| AcquireSampleRowsFunc *acquirefuncs; |
| double *relblocks; |
| double totalblocks; |
| int numrows, |
| nrels, |
| i; |
| ListCell *lc; |
| bool has_child; |
| |
| /* |
| * Find all members of inheritance set. We only need AccessShareLock on |
| * the children. |
| */ |
| tableOIDs = |
| find_all_inheritors(RelationGetRelid(onerel), AccessShareLock, NULL); |
| |
| /* |
| * Check that there's at least one descendant, else fail. This could |
| * happen despite analyze_rel's relhassubclass check, if table once had a |
| * child but no longer does. In that case, we can clear the |
| * relhassubclass field so as not to make the same mistake again later. |
| * (This is safe because we hold ShareUpdateExclusiveLock.) |
| * Please refer to https://github.com/greenplum-db/gpdb/issues/14644 |
| */ |
| if (list_length(tableOIDs) < 2) |
| { |
| /* CCI because we already updated the pg_class row in this command */ |
| CommandCounterIncrement(); |
| SetRelationHasSubclass(RelationGetRelid(onerel), false); |
| *totalrows = 0; |
| *totaldeadrows = 0; |
| ereport(elevel, |
| (errmsg("skipping analyze of \"%s.%s\" inheritance tree --- this inheritance tree contains no child tables", |
| get_namespace_name(RelationGetNamespace(onerel)), |
| RelationGetRelationName(onerel)))); |
| if (Gp_role == GP_ROLE_EXECUTE) |
| return 0; |
| } |
| |
| /* |
| * Like in acquire_sample_rows(), if we're in the QD, fetch the sample |
| * from segments. |
| */ |
| if (Gp_role == GP_ROLE_DISPATCH) |
| { |
| return acquire_sample_rows_dispatcher(onerel, |
| true, /* inherited stats */ |
| elevel, rows, targrows, |
| totalrows, totaldeadrows); |
| } |
| |
| /* |
| * Identify acquirefuncs to use, and count blocks in all the relations. |
| * The result could overflow BlockNumber, so we use double arithmetic. |
| */ |
| rels = (Relation *) palloc(list_length(tableOIDs) * sizeof(Relation)); |
| acquirefuncs = (AcquireSampleRowsFunc *) |
| palloc(list_length(tableOIDs) * sizeof(AcquireSampleRowsFunc)); |
| relblocks = (double *) palloc(list_length(tableOIDs) * sizeof(double)); |
| totalblocks = 0; |
| nrels = 0; |
| has_child = false; |
| foreach(lc, tableOIDs) |
| { |
| Oid childOID = lfirst_oid(lc); |
| Relation childrel; |
| AcquireSampleRowsFunc acquirefunc = NULL; |
| BlockNumber relpages = 0; |
| |
| /* We already got the needed lock */ |
| childrel = table_open(childOID, NoLock); |
| |
| /* Ignore if temp table of another backend */ |
| if (RELATION_IS_OTHER_TEMP(childrel)) |
| { |
| /* ... but release the lock on it */ |
| Assert(childrel != onerel); |
| table_close(childrel, AccessShareLock); |
| continue; |
| } |
| |
| /* Check table type (MATVIEW can't happen, but might as well allow) */ |
| if (childrel->rd_rel->relkind == RELKIND_RELATION || |
| childrel->rd_rel->relkind == RELKIND_MATVIEW || |
| childrel->rd_rel->relkind == RELKIND_DIRECTORY_TABLE) |
| { |
| /* use relation_acquire_sample_rows as default. */ |
| acquirefunc = gp_acquire_sample_rows_func; |
| |
| relpages = AcquireNumberOfBlocks(childrel); |
| } |
| else if (childrel->rd_rel->relkind == RELKIND_FOREIGN_TABLE) |
| { |
| /* |
| * For a foreign table, call the FDW's hook function to see |
| * whether it supports analysis. |
| */ |
| FdwRoutine *fdwroutine; |
| bool ok = false; |
| |
| fdwroutine = GetFdwRoutineForRelation(childrel, false); |
| |
| if (fdwroutine->AnalyzeForeignTable != NULL) |
| ok = fdwroutine->AnalyzeForeignTable(childrel, |
| &acquirefunc, |
| &relpages); |
| |
| if (!ok) |
| { |
| /* ignore, but release the lock on it */ |
| Assert(childrel != onerel); |
| table_close(childrel, AccessShareLock); |
| continue; |
| } |
| } |
| else |
| { |
| /* |
| * ignore, but release the lock on it. don't try to unlock the |
| * passed-in relation |
| */ |
| Assert(childrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE); |
| if (childrel != onerel) |
| table_close(childrel, AccessShareLock); |
| else |
| table_close(childrel, NoLock); |
| continue; |
| } |
| |
| /* OK, we'll process this child */ |
| has_child = true; |
| rels[nrels] = childrel; |
| acquirefuncs[nrels] = acquirefunc; |
| relblocks[nrels] = (double) relpages; |
| totalblocks += (double) relpages; |
| nrels++; |
| } |
| |
| /* |
| * If we don't have at least one child table to consider, fail. If the |
| * relation is a partitioned table, it's not counted as a child table. |
| */ |
| if (!has_child) |
| { |
| ereport(elevel, |
| (errmsg("skipping analyze of \"%s.%s\" inheritance tree --- this inheritance tree contains no analyzable child tables", |
| get_namespace_name(RelationGetNamespace(onerel)), |
| RelationGetRelationName(onerel)))); |
| return 0; |
| } |
| |
| /* |
| * Now sample rows from each relation, proportionally to its fraction of |
| * the total block count. (This might be less than desirable if the child |
| * rels have radically different free-space percentages, but it's not |
| * clear that it's worth working harder.) |
| */ |
| pgstat_progress_update_param(PROGRESS_ANALYZE_CHILD_TABLES_TOTAL, |
| nrels); |
| numrows = 0; |
| *totalrows = 0; |
| *totaldeadrows = 0; |
| for (i = 0; i < nrels; i++) |
| { |
| Relation childrel = rels[i]; |
| AcquireSampleRowsFunc acquirefunc = acquirefuncs[i]; |
| double childblocks = relblocks[i]; |
| |
| pgstat_progress_update_param(PROGRESS_ANALYZE_CURRENT_CHILD_TABLE_RELID, |
| RelationGetRelid(childrel)); |
| |
| if (childblocks > 0) |
| { |
| int childtargrows; |
| |
| childtargrows = (int) rint(targrows * childblocks / totalblocks); |
| /* Make sure we don't overrun due to roundoff error */ |
| childtargrows = Min(childtargrows, targrows - numrows); |
| if (childtargrows > 0) |
| { |
| int childrows; |
| double trows, |
| tdrows; |
| |
| /* Fetch a random sample of the child's rows */ |
| childrows = (*acquirefunc) (childrel, elevel, |
| rows + numrows, childtargrows, |
| &trows, &tdrows); |
| |
| /* We may need to convert from child's rowtype to parent's */ |
| if (childrows > 0 && |
| !equalTupleDescs(RelationGetDescr(childrel), |
| RelationGetDescr(onerel), |
| false)) |
| { |
| TupleConversionMap *map; |
| |
| map = convert_tuples_by_name(RelationGetDescr(childrel), |
| RelationGetDescr(onerel)); |
| if (map != NULL) |
| { |
| int j; |
| |
| for (j = 0; j < childrows; j++) |
| { |
| HeapTuple newtup; |
| |
| newtup = execute_attr_map_tuple(rows[numrows + j], map); |
| heap_freetuple(rows[numrows + j]); |
| rows[numrows + j] = newtup; |
| } |
| free_conversion_map(map); |
| } |
| } |
| |
| /* And add to counts */ |
| numrows += childrows; |
| *totalrows += trows; |
| *totaldeadrows += tdrows; |
| } |
| } |
| |
| /* |
| * Note: we cannot release the child-table locks, since we may have |
| * pointers to their TOAST tables in the sampled rows. |
| */ |
| table_close(childrel, NoLock); |
| pgstat_progress_update_param(PROGRESS_ANALYZE_CHILD_TABLES_DONE, |
| i + 1); |
| } |
| |
| return numrows; |
| } |
| |
| /* |
| * This function acquires the HLL counter for the entire table by |
| * using the hyperloglog extension gp_hyperloglog_accum(). |
| * |
| * Unlike acquire_sample_rows(), this returns the HLL counter for |
| * the entire table, and not jsut a sample, and it stores the HLL |
| * counter into a separate attribute in the stats stahll_full to |
| * distinguish it from the HLL for sampled data. This functions scans |
| * the full table only once. |
| */ |
| static void |
| acquire_hll_by_query(Relation onerel, int nattrs, VacAttrStats **attrstats, int elevel) |
| { |
| StringInfoData str, columnStr; |
| int i; |
| int ret; |
| Datum *vals; |
| MemoryContext oldcxt; |
| const char *schemaName = get_namespace_name(RelationGetNamespace(onerel)); |
| |
| initStringInfo(&str); |
| initStringInfo(&columnStr); |
| for (i = 0; i < nattrs; i++) |
| { |
| const char *attname = quote_identifier(NameStr(attrstats[i]->attr->attname)); |
| appendStringInfo(&columnStr, "pg_catalog.gp_hyperloglog_accum(%s)", attname); |
| if(i != nattrs-1) |
| appendStringInfo(&columnStr, ", "); |
| } |
| |
| appendStringInfo(&str, "select %s from %s.%s as Ta ", |
| columnStr.data, |
| quote_identifier(schemaName), |
| quote_identifier(RelationGetRelationName(onerel))); |
| |
| oldcxt = CurrentMemoryContext; |
| |
| if (SPI_OK_CONNECT != SPI_connect()) |
| ereport(ERROR, |
| (errcode(ERRCODE_INTERNAL_ERROR), |
| errmsg("unable to connect to execute internal query"))); |
| |
| elog(elevel, "Executing SQL: %s", str.data); |
| |
| /* |
| * Do the query. We pass readonly==false, to force SPI to take a new |
| * snapshot. That ensures that we see all changes by our own transaction. |
| */ |
| ret = SPI_execute(str.data, false, 0); |
| Assert(ret > 0); |
| |
| /* |
| * targrows in analyze_rel_internal() is an int, |
| * it's unlikely that this query will return more rows |
| */ |
| Assert(SPI_processed < 2); |
| int sampleTuples = (int) SPI_processed; |
| |
| /* Ok, read in the tuples to *rows */ |
| MemoryContextSwitchTo(oldcxt); |
| vals = (Datum *) palloc0(nattrs * sizeof(Datum)); |
| bool isNull = false; |
| |
| for (i = 0; i < sampleTuples; i++) |
| { |
| HeapTuple sampletup = SPI_tuptable->vals[i]; |
| int j; |
| |
| for (j = 0; j < nattrs; j++) |
| { |
| int tupattnum = attrstats[j]->tupattnum; |
| Assert(tupattnum >= 1 && tupattnum <= nattrs); |
| |
| vals[tupattnum - 1] = heap_getattr(sampletup, j + 1, |
| SPI_tuptable->tupdesc, |
| &isNull); |
| if (isNull) |
| { |
| attrstats[j]->stahll_full = (bytea *)gp_hyperloglog_init_def(); |
| continue; |
| } |
| |
| int16 typlen; |
| bool typbyval; |
| get_typlenbyval(SPI_tuptable->tupdesc->tdtypeid, &typlen, &typbyval); |
| int hll_length = datumGetSize(vals[tupattnum-1], typbyval, typlen); |
| attrstats[j]->stahll_full = (bytea *)datumCopy(PointerGetDatum(vals[tupattnum - 1]), false, hll_length); |
| } |
| } |
| |
| SPI_finish(); |
| } |
| |
| /* |
| * Count AO/AOCO tables segment file number. |
| */ |
| static int16 |
| AcquireCountOfSegmentFile(Relation onerel) |
| { |
| int16 count = 0; |
| |
| if (!RelationIsAppendOptimized(onerel)) |
| return 0; |
| |
| if (Gp_role == GP_ROLE_DISPATCH && |
| onerel->rd_cdbpolicy && !GpPolicyIsEntry(onerel->rd_cdbpolicy)) |
| { |
| /* Query the segments using gp_ao_segment_file_count(<rel>). */ |
| char *sql; |
| sql = psprintf("select pg_catalog.gp_ao_segment_file_count(%u)", RelationGetRelid(onerel)); |
| count = get_size_from_segDBs(sql)/getgpsegmentCount(); |
| } |
| else |
| { |
| count = GetAppendOnlySegmentFilesCount(onerel); |
| } |
| return count; |
| } |
| |
| /* |
| * Compute relation size. |
| * |
| * In upstream, this is a simple RelationGetNumberOfBlocks() call. |
| * In GPDB if we're in the dispatcher, we need to get the size from the |
| * segments. |
| */ |
| BlockNumber |
| AcquireNumberOfBlocks(Relation onerel) |
| { |
| int64 totalbytes; |
| |
| if (Gp_role == GP_ROLE_DISPATCH && |
| onerel->rd_cdbpolicy && !GpPolicyIsEntry(onerel->rd_cdbpolicy)) |
| { |
| /* Query the segments using pg_relation_size(<rel>). */ |
| char relsize_sql[100]; |
| |
| snprintf(relsize_sql, sizeof(relsize_sql), |
| "select pg_catalog.pg_relation_size(%u, 'main')", RelationGetRelid(onerel)); |
| totalbytes = get_size_from_segDBs(relsize_sql); |
| if (GpPolicyIsReplicated(onerel->rd_cdbpolicy)) |
| { |
| /* |
| * pg_relation_size sums up the table size on each segment. That's |
| * correct for hash and randomly distributed tables. But for a |
| * replicated table, we want pg_class.relpages to count the data |
| * only once. |
| */ |
| totalbytes = totalbytes / onerel->rd_cdbpolicy->numsegments; |
| } |
| |
| return RelationGuessNumberOfBlocksFromSize(totalbytes); |
| } |
| /* Check size on this server. */ |
| else |
| { |
| return RelationGetNumberOfBlocks(onerel); |
| } |
| } |
| |
| /* |
| * Collect visibility map of relation in dispatcher. |
| * |
| * In GPDB if we're in the dispatcher, we need to collect the number of |
| * visibility map in pg_class from segments. |
| */ |
| BlockNumber |
| AcquireNumberOfAllVisibleBlocks(Relation onerel) |
| { |
| int64 totalvms; |
| |
| /* collect total vms from segments in master */ |
| if (Gp_role == GP_ROLE_DISPATCH && |
| onerel->rd_cdbpolicy && !GpPolicyIsEntry(onerel->rd_cdbpolicy)) |
| { |
| /* Query the segments pg_class. */ |
| char relvm_sql[80]; |
| |
| snprintf(relvm_sql, sizeof(relvm_sql), |
| "select relallvisible from pg_catalog.pg_class where oid = %u", RelationGetRelid(onerel)); |
| totalvms = get_size_from_segDBs(relvm_sql); |
| if (GpPolicyIsReplicated(onerel->rd_cdbpolicy)) |
| { |
| /* |
| * If the distribution of the relation is replicated, we will sum up |
| * vms much twice which we expecting only once. So we need to divide |
| * up totalvms by numsegments. |
| */ |
| totalvms /= onerel->rd_cdbpolicy->numsegments; |
| } |
| |
| return (BlockNumber)totalvms; |
| } |
| /* get vms from local in segment */ |
| else |
| { |
| BlockNumber all_visible = 0; |
| visibilitymap_count(onerel, &all_visible, NULL); |
| |
| return all_visible; |
| } |
| } |
| |
| /* |
| * Compute index relation's size. |
| * |
| * Like AcquireNumberOfBlocks(), but for indexes. Indexes don't |
| * have a distribution policy, so we use the parent table's policy |
| * to determine whether we need to get the size on segments or |
| * locally. |
| */ |
| static BlockNumber |
| acquire_index_number_of_blocks(Relation indexrel, Relation tablerel) |
| { |
| int64 totalbytes; |
| |
| if (Gp_role == GP_ROLE_DISPATCH && |
| tablerel->rd_cdbpolicy && !GpPolicyIsEntry(tablerel->rd_cdbpolicy)) |
| { |
| /* Query the segments using pg_relation_size(<rel>). */ |
| char relsize_sql[100]; |
| |
| snprintf(relsize_sql, sizeof(relsize_sql), |
| "select pg_catalog.pg_relation_size(%u, 'main')", RelationGetRelid(indexrel)); |
| totalbytes = get_size_from_segDBs(relsize_sql); |
| if (GpPolicyIsReplicated(tablerel->rd_cdbpolicy)) |
| { |
| /* |
| * pg_relation_size sums up the table size on each segment. That's |
| * correct for hash and randomly distributed tables. But for a |
| * replicated table, we want pg_class.relpages to count the data |
| * only once. |
| */ |
| totalbytes = totalbytes / tablerel->rd_cdbpolicy->numsegments; |
| } |
| |
| return RelationGuessNumberOfBlocksFromSize(totalbytes); |
| } |
| /* Check size on this server. */ |
| else |
| { |
| return RelationGetNumberOfBlocks(indexrel); |
| } |
| } |
| |
| /* |
| * parse_record_to_string |
| * |
| * CDB: a copy of record_in, but only parse the record string |
| * into separate strs for each column. |
| */ |
| static void |
| parse_record_to_string(char *string, TupleDesc tupdesc, char** values, bool *nulls) |
| { |
| char *ptr; |
| int ncolumns; |
| int i; |
| bool needComma; |
| StringInfoData buf; |
| |
| Assert(string != NULL); |
| Assert(values != NULL); |
| Assert(nulls != NULL); |
| |
| ncolumns = tupdesc->natts; |
| needComma = false; |
| |
| /* |
| * Scan the string. We use "buf" to accumulate the de-quoted data for |
| * each column, which is then fed to the appropriate input converter. |
| */ |
| ptr = string; |
| |
| /* Allow leading whitespace */ |
| while (*ptr && isspace((unsigned char) *ptr)) |
| ptr++; |
| if (*ptr++ != '(') |
| { |
| ereport(ERROR, |
| (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), |
| errmsg("malformed record literal: \"%s\"", string), |
| errdetail("Missing left parenthesis."))); |
| } |
| |
| initStringInfo(&buf); |
| |
| for (i = 0; i < ncolumns; i++) |
| { |
| /* Ignore dropped columns in datatype, but fill with nulls */ |
| if (TupleDescAttr(tupdesc, i)->attisdropped) |
| { |
| values[i] = NULL; |
| nulls[i] = true; |
| continue; |
| } |
| |
| if (needComma) |
| { |
| /* Skip comma that separates prior field from this one */ |
| if (*ptr == ',') |
| ptr++; |
| else |
| { |
| /* *ptr must be ')' */ |
| ereport(ERROR, |
| (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), |
| errmsg("malformed record literal: \"%s\"", string), |
| errdetail("Too few columns."))); |
| } |
| } |
| |
| /* Check for null: completely empty input means null */ |
| if (*ptr == ',' || *ptr == ')') |
| { |
| values[i] = NULL; |
| nulls[i] = true; |
| } |
| else |
| { |
| /* Extract string for this column */ |
| bool inquote = false; |
| |
| resetStringInfo(&buf); |
| while (inquote || !(*ptr == ',' || *ptr == ')')) |
| { |
| char ch = *ptr++; |
| |
| if (ch == '\0') |
| { |
| ereport(ERROR, |
| (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), |
| errmsg("malformed record literal: \"%s\"", |
| string), |
| errdetail("Unexpected end of input."))); |
| } |
| if (ch == '\\') |
| { |
| if (*ptr == '\0') |
| { |
| ereport(ERROR, |
| (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), |
| errmsg("malformed record literal: \"%s\"", |
| string), |
| errdetail("Unexpected end of input."))); |
| } |
| appendStringInfoChar(&buf, *ptr++); |
| } |
| else if (ch == '"') |
| { |
| if (!inquote) |
| inquote = true; |
| else if (*ptr == '"') |
| { |
| /* doubled quote within quote sequence */ |
| appendStringInfoChar(&buf, *ptr++); |
| } |
| else |
| inquote = false; |
| } |
| else |
| appendStringInfoChar(&buf, ch); |
| } |
| |
| values[i] = palloc(strlen(buf.data) + 1); |
| memcpy(values[i], buf.data, strlen(buf.data) + 1); |
| nulls[i] = false; |
| } |
| |
| /* |
| * Prep for next column |
| */ |
| needComma = true; |
| } |
| |
| if (*ptr++ != ')') |
| { |
| ereport(ERROR, |
| (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), |
| errmsg("malformed record literal: \"%s\"", string), |
| errdetail("Too many columns."))); |
| } |
| /* Allow trailing whitespace */ |
| while (*ptr && isspace((unsigned char) *ptr)) |
| ptr++; |
| if (*ptr) |
| { |
| ereport(ERROR, |
| (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), |
| errmsg("malformed record literal: \"%s\"", string), |
| errdetail("Junk after right parenthesis."))); |
| } |
| } |
| |
| /* |
| * Build a querydesc for a sql, set "dest" to portal->holdStore |
| */ |
| static QueryDesc *build_querydesc(Portal portal, char *sql) |
| { |
| List *raw_parsetree_list; |
| RawStmt *parsetree; |
| List *querytree_list; |
| List *plantree_list; |
| PlannedStmt *plan_stmt; |
| DestReceiver *destReceiver; |
| QueryDesc *queryDesc = NULL; |
| destReceiver = CreateDestReceiver(DestTuplestore); |
| SetTuplestoreDestReceiverParams(destReceiver, |
| portal->holdStore, |
| portal->holdContext, |
| false, NULL, NULL); |
| |
| /* |
| * Parse the SQL string into a list of raw parse trees. |
| */ |
| raw_parsetree_list = pg_parse_query(sql); |
| |
| /* |
| * Do parse analysis, rule rewrite, planning, and execution for each raw |
| * parsetree. |
| */ |
| |
| /* There is only one element in list due to simple select. */ |
| Assert(list_length(raw_parsetree_list) == 1); |
| parsetree = (RawStmt *) linitial(raw_parsetree_list); |
| |
| querytree_list = pg_analyze_and_rewrite(parsetree, |
| sql, |
| NULL, |
| 0, |
| NULL); |
| plantree_list = pg_plan_queries(querytree_list, sql, 0, NULL); |
| |
| /* There is only one statement in list due to simple select. */ |
| Assert(list_length(plantree_list) == 1); |
| plan_stmt = (PlannedStmt *) linitial(plantree_list); |
| |
| queryDesc = CreateQueryDesc(plan_stmt, |
| sql, |
| GetActiveSnapshot(), |
| InvalidSnapshot, |
| destReceiver, |
| NULL, |
| NULL, |
| INSTRUMENT_NONE); |
| |
| |
| |
| list_free_deep(querytree_list); |
| list_free_deep(raw_parsetree_list); |
| |
| return queryDesc; |
| } |
| |
| static int |
| process_sample_rows(Portal portal, |
| QueryDesc *queryDesc, |
| Relation onerel, |
| HeapTuple *rows, |
| int targrows, |
| double *totalrows, |
| double *totaldeadrows) |
| { |
| /* |
| * 'colLargeRowIndexes' is essentially an argument, but it's passed via a |
| * global variable to avoid changing the AcquireSampleRowsFunc prototype. |
| */ |
| Bitmapset **colLargeRowIndexes = acquire_func_colLargeRowIndexes; |
| double *colLargeRowLength = acquire_func_colLargeRowLength; |
| double *colNDVBySeg = acquire_func_colNDVBySeg; |
| TupleDesc relDesc = RelationGetDescr(onerel); |
| TupleDesc funcTupleDesc; |
| TupleDesc sampleTupleDesc; |
| int sampleTuples; /* 32 bit - assume that number of tuples will not > 2B */ |
| Datum *funcRetValues; |
| bool *funcRetNulls; |
| int ncolumns; |
| AttInMetadata *attinmeta; |
| int numLiveColumns; |
| int i; |
| int index = 0; |
| TupleTableSlot *slot; |
| Datum *dvalues; |
| bool *dnulls; |
| |
| /* |
| * Count the number of columns, excluding dropped columns. We'll need that |
| * later. |
| */ |
| numLiveColumns = 0; |
| for (i = 0; i < relDesc->natts; i++) |
| { |
| Form_pg_attribute attr = TupleDescAttr(relDesc, i); |
| |
| if (attr->attisdropped) |
| continue; |
| |
| numLiveColumns++; |
| } |
| |
| /* |
| * Build a modified tuple descriptor for the table. |
| * |
| * Some datatypes need special treatment, so we cannot use the relation's |
| * original tupledesc. |
| * |
| * Also create tupledesc of return record of function gp_acquire_sample_rows. |
| */ |
| sampleTupleDesc = CreateTupleDescCopy(relDesc); |
| ncolumns = NUM_SAMPLE_FIXED_COLS + numLiveColumns; |
| |
| funcTupleDesc = CreateTemplateTupleDesc(ncolumns); |
| TupleDescInitEntry(funcTupleDesc, (AttrNumber) 1, "", FLOAT8OID, -1, 0); |
| TupleDescInitEntry(funcTupleDesc, (AttrNumber) 2, "", FLOAT8OID, -1, 0); |
| TupleDescInitEntry(funcTupleDesc, (AttrNumber) 3, "", FLOAT8ARRAYOID, -1, 0); |
| TupleDescInitEntry(funcTupleDesc, (AttrNumber) 4, "", FLOAT8ARRAYOID, -1, 0); |
| |
| for (i = 0; i < relDesc->natts; i++) |
| { |
| Form_pg_attribute attr = TupleDescAttr(relDesc, i); |
| |
| Oid typid = gp_acquire_sample_rows_col_type(attr->atttypid); |
| |
| TupleDescAttr(sampleTupleDesc, i)->atttypid = typid; |
| |
| if (!attr->attisdropped) |
| { |
| TupleDescInitEntry(funcTupleDesc, (AttrNumber) 4 + index, "", |
| typid, attr->atttypmod, attr->attndims); |
| |
| index++; |
| } |
| } |
| |
| /* For RECORD results, make sure a typmod has been assigned */ |
| Assert(funcTupleDesc->tdtypeid == RECORDOID && funcTupleDesc->tdtypmod < 0); |
| assign_record_type_typmod(funcTupleDesc); |
| |
| attinmeta = TupleDescGetAttInMetadata(sampleTupleDesc); |
| |
| /* |
| * Read the result set from each segment. Gather the sample rows *rows, |
| * and sum up the summary rows for grand 'totalrows' and 'totaldeadrows'. |
| */ |
| funcRetValues = (Datum *) palloc0(funcTupleDesc->natts * sizeof(Datum)); |
| funcRetNulls = (bool *) palloc0(funcTupleDesc->natts * sizeof(bool)); |
| dvalues = (Datum *) palloc0(relDesc->natts * sizeof(Datum)); |
| dnulls = (bool *) palloc0(relDesc->natts * sizeof(bool)); |
| sampleTuples = 0; |
| *totalrows = 0; |
| *totaldeadrows = 0; |
| |
| slot = MakeSingleTupleTableSlot(queryDesc->tupDesc, &TTSOpsMinimalTuple); |
| |
| for (;;) |
| { |
| bool ok; |
| TupleDesc typeinfo; |
| int natts; |
| Datum attr; |
| bool isnull; |
| double this_totalrows = 0; |
| double this_totaldeadrows = 0; |
| |
| CHECK_FOR_INTERRUPTS(); |
| |
| ok = tuplestore_gettupleslot(portal->holdStore, true, false, slot); |
| |
| if (!ok) |
| break; |
| |
| typeinfo = slot->tts_tupleDescriptor; |
| natts = typeinfo->natts; |
| |
| /* There should be only one attribute with OID RECORDOID. */ |
| if (1 != natts) |
| { |
| elog(ERROR, |
| "wrong number of attributes %d when 1 expected", |
| natts); |
| } |
| |
| if (RECORDOID != typeinfo->attrs[0].atttypid) |
| { |
| elog(ERROR, |
| "wrong attribute OID %d, RECORDOID %d is expected", |
| typeinfo->attrs[0].atttypid, RECORDOID); |
| |
| } |
| |
| /* Make sure the tuple is fully deconstructed */ |
| slot_getallattrs(slot); |
| |
| /* There should be only one attribute with OID RECORDOID */ |
| attr = slot_getattr(slot, 1, &isnull); |
| if (isnull) |
| { |
| elog(ERROR, |
| "null value for attribute in tuple"); |
| } |
| |
| /* Get record from attribute and parse it */ |
| { |
| HeapTupleHeader rec = (HeapTupleHeader) PG_DETOAST_DATUM(attr); |
| Oid tupType; |
| int32 tupTypmod; |
| TupleDesc tupdesc; |
| HeapTupleData tuple; |
| |
| /* Extract type info from the tuple itself */ |
| tupType = HeapTupleHeaderGetTypeId(rec); |
| tupTypmod = HeapTupleHeaderGetTypMod(rec); |
| tupdesc = lookup_rowtype_tupdesc(tupType, tupTypmod); |
| |
| /* Build a temporary HeapTuple control structure */ |
| tuple.t_len = HeapTupleHeaderGetDatumLength(rec); |
| ItemPointerSetInvalid(&(tuple.t_self)); |
| tuple.t_data = rec; |
| |
| /* Break down the tuple into fields */ |
| heap_deform_tuple(&tuple, tupdesc, funcRetValues, funcRetNulls); |
| |
| if (!funcRetNulls[0]) |
| { |
| /* This is a summary row. */ |
| ArrayType *arrayVal; |
| Datum *colndv; |
| bool *nulls; |
| int numelems; |
| |
| Assert(!funcRetNulls[1] && !funcRetNulls[3]); |
| |
| this_totalrows = DatumGetFloat8(funcRetValues[0]); |
| this_totaldeadrows = DatumGetFloat8(funcRetValues[1]); |
| (*totalrows) += this_totalrows; |
| (*totaldeadrows) += this_totaldeadrows; |
| |
| arrayVal = DatumGetArrayTypeP(funcRetValues[3]); |
| deconstruct_array(arrayVal, FLOAT8OID, 8, true, 'd', |
| &colndv, &nulls, &numelems); |
| Assert(numelems == relDesc->natts); |
| for (i = 0; i < numelems; i++) |
| { |
| double this_colndv = DatumGetFloat8(colndv[i]); |
| if (this_colndv < 0) { |
| Assert(this_colndv >= -1); |
| colNDVBySeg[i] += fabs(this_colndv) * this_totalrows; |
| } else { |
| /* if current segment have any data, then ndv won't be 0. |
| * if current segment have no rows, ndv is 0. |
| */ |
| colNDVBySeg[i] += DatumGetFloat8(colndv[i]); |
| } |
| } |
| } |
| else |
| { |
| /* This is a sample row. */ |
| if (sampleTuples >= targrows) |
| elog(ERROR, "too many sample rows received from gp_acquire_sample_rows"); |
| |
| /* Read the 'toolarge' bitmap, if any */ |
| if (colLargeRowIndexes && !funcRetNulls[2]) |
| { |
| ArrayType *arrayVal; |
| Datum *largelength; |
| bool *nulls; |
| int numelems; |
| arrayVal = DatumGetArrayTypeP(funcRetValues[2]); |
| deconstruct_array(arrayVal, FLOAT8OID, 8, true, 'd', |
| &largelength, &nulls, &numelems); |
| |
| for (i = 0; i < relDesc->natts; i++) |
| { |
| Form_pg_attribute attr = TupleDescAttr(relDesc, i); |
| |
| if (attr->attisdropped) |
| continue; |
| |
| if (largelength[i] != (Datum) 0) |
| { |
| colLargeRowIndexes[i] = bms_add_member(colLargeRowIndexes[i], sampleTuples); |
| colLargeRowLength[i] += DatumGetFloat8(largelength[i]); |
| } |
| } |
| } |
| |
| /* Process the columns */ |
| index = 0; |
| for (i = 0; i < relDesc->natts; i++) |
| { |
| Form_pg_attribute attr = TupleDescAttr(relDesc, i); |
| |
| if (attr->attisdropped) |
| { |
| dnulls[i] = true; |
| continue; |
| } |
| |
| dnulls[i] = funcRetNulls[NUM_SAMPLE_FIXED_COLS + index]; |
| dvalues[i] = funcRetValues[NUM_SAMPLE_FIXED_COLS + index]; |
| index++; /* Move index to the next result set attribute */ |
| } |
| |
| /* |
| * Form a tuple |
| */ |
| rows[sampleTuples] = heap_form_tuple(attinmeta->tupdesc, |
| dvalues, |
| dnulls); |
| sampleTuples++; |
| |
| /* |
| * note: we don't set the OIDs in the sample. ANALYZE doesn't |
| * collect stats for them |
| */ |
| } |
| ReleaseTupleDesc(tupdesc); |
| } |
| |
| ExecClearTuple(slot); |
| } |
| ExecDropSingleTupleTableSlot(slot); |
| pfree(funcRetValues); |
| pfree(funcRetNulls); |
| pfree(dvalues); |
| pfree(dnulls); |
| |
| return sampleTuples; |
| } |
| |
| /* |
| * Collect a sample from segments. |
| * |
| * Calls the gp_acquire_sample_rows() helper function on each segment, |
| * and merges the results. |
| */ |
| static int |
| acquire_sample_rows_dispatcher(Relation onerel, bool inh, int elevel, |
| HeapTuple *rows, int targrows, |
| double *totalrows, double *totaldeadrows) |
| { |
| StringInfoData str; |
| int perseg_targrows; |
| int sampleTuples; /* 32 bit - assume that number of tuples will not > 2B */ |
| char *sql; |
| Portal portal; |
| QueryDesc *queryDesc = NULL; |
| |
| Assert(targrows > 0.0); |
| |
| /* |
| * Step1: Construct SQL command to dispatch to segments. |
| * |
| * Acquire an evenly-sized sample from each segment. |
| * |
| * XXX: If there's a significant bias between the segments, i.e. some |
| * segments have a lot more rows than others, the sample will biased, too. |
| * Would be nice to improve that, but it's not clear how. We could issue |
| * another query to get the table size from each segment first, and use |
| * those to weigh the sample size to get from each segment. But that'd |
| * require an extra round-trip, which is also not good. The caller |
| * actually already did that, to get the total relation size, but it |
| * doesn't pass that down to us, let alone the per-segment sizes. |
| */ |
| if (GpPolicyIsReplicated(onerel->rd_cdbpolicy)) |
| perseg_targrows = targrows; |
| else if (GpPolicyIsPartitioned(onerel->rd_cdbpolicy)) |
| perseg_targrows = targrows / onerel->rd_cdbpolicy->numsegments; |
| else |
| elog(ERROR, "acquire_sample_rows_dispatcher() cannot be used on a non-distributed table"); |
| |
| /* |
| * Did not use 'select * from pg_catalog.gp_acquire_sample_rows(...) as (..);' |
| * here. Because it requires to specify columns explicitly which leads to |
| * permission check on each columns. This is not consistent with GPDB5 and |
| * may result in different behaviour under different acl configuration. |
| */ |
| initStringInfo(&str); |
| appendStringInfo(&str, "select pg_catalog.gp_acquire_sample_rows(%u, %d, '%s');", |
| RelationGetRelid(onerel), |
| perseg_targrows, |
| inh ? "t" : "f"); |
| |
| /* |
| * Step2: Execute the constructed SQL. |
| * |
| * Do not use SPI here, because there might be a large number of wide rows |
| * returned and stored in memory, SPI cannot spill data to disk which may |
| * lead to OOM easily. |
| * |
| * Do not use SPI cusror either, because we should use SPI_cursor_fetch to fetch |
| * results in batches, which may have bad performance. |
| * |
| * Use ExecutorStart|ExecutorRun|ExecutorEnd to execute a plan and store results |
| * into tuplestore could handle this situation well. |
| * |
| * Execute the given query and store the results into portal->holdStore to |
| * avoid memory error. |
| */ |
| elog(elevel, "Executing SQL: %s", str.data); |
| sql = str.data; |
| /* Create a new portal to run the query in */ |
| portal = CreateNewPortal(); |
| /* Don't display the portal in pg_cursors, it is for internal use only */ |
| portal->visible = false; |
| /* use a tuplestore to store received tuples to avoid out of memory error */ |
| PortalCreateHoldStore(portal); |
| queryDesc = build_querydesc(portal, sql); |
| |
| /* Call ExecutorStart to prepare the plan for execution */ |
| ExecutorStart(queryDesc, 0); |
| |
| /* Run the plan */ |
| ExecutorRun(queryDesc, ForwardScanDirection, 0L, true); |
| |
| /* Wait for completion of all qExec processes. */ |
| if (queryDesc->estate->dispatcherState |
| && queryDesc->estate->dispatcherState->primaryResults) |
| { |
| cdbdisp_checkDispatchResult(queryDesc->estate->dispatcherState, DISPATCH_WAIT_NONE); |
| } |
| |
| ExecutorFinish(queryDesc); |
| /* |
| * Step3: process results. |
| */ |
| sampleTuples = process_sample_rows(portal, queryDesc, onerel, rows, |
| targrows, totalrows, totaldeadrows); |
| |
| ExecutorEnd(queryDesc); |
| FreeQueryDesc(queryDesc); |
| PortalDrop(portal, false); |
| |
| return sampleTuples; |
| } |
| |
| /* |
| * update_attstats() -- update attribute statistics for one relation |
| * |
| * Statistics are stored in several places: the pg_class row for the |
| * relation has stats about the whole relation, and there is a |
| * pg_statistic row for each (non-system) attribute that has ever |
| * been analyzed. The pg_class values are updated by VACUUM, not here. |
| * |
| * pg_statistic rows are just added or updated normally. This means |
| * that pg_statistic will probably contain some deleted rows at the |
| * completion of a vacuum cycle, unless it happens to get vacuumed last. |
| * |
| * To keep things simple, we punt for pg_statistic, and don't try |
| * to compute or store rows for pg_statistic itself in pg_statistic. |
| * This could possibly be made to work, but it's not worth the trouble. |
| * Note analyze_rel() has seen to it that we won't come here when |
| * vacuuming pg_statistic itself. |
| * |
| * Note: there would be a race condition here if two backends could |
| * ANALYZE the same table concurrently. Presently, we lock that out |
| * by taking a self-exclusive lock on the relation in analyze_rel(). |
| */ |
| static void |
| update_attstats(Oid relid, bool inh, int natts, VacAttrStats **vacattrstats) |
| { |
| Relation sd; |
| int attno; |
| |
| if (natts <= 0) |
| return; /* nothing to do */ |
| |
| sd = table_open(StatisticRelationId, RowExclusiveLock); |
| |
| for (attno = 0; attno < natts; attno++) |
| { |
| VacAttrStats *stats = vacattrstats[attno]; |
| HeapTuple stup, |
| oldtup; |
| int i, |
| k, |
| n; |
| Datum values[Natts_pg_statistic]; |
| bool nulls[Natts_pg_statistic]; |
| bool replaces[Natts_pg_statistic]; |
| |
| /* Ignore attr if we weren't able to collect stats */ |
| if (!stats->stats_valid) |
| continue; |
| |
| /* |
| * Construct a new pg_statistic tuple |
| */ |
| for (i = 0; i < Natts_pg_statistic; ++i) |
| { |
| nulls[i] = false; |
| replaces[i] = true; |
| } |
| |
| values[Anum_pg_statistic_starelid - 1] = ObjectIdGetDatum(relid); |
| values[Anum_pg_statistic_staattnum - 1] = Int16GetDatum(stats->attr->attnum); |
| values[Anum_pg_statistic_stainherit - 1] = BoolGetDatum(inh); |
| values[Anum_pg_statistic_stanullfrac - 1] = Float4GetDatum(stats->stanullfrac); |
| values[Anum_pg_statistic_stawidth - 1] = Int32GetDatum(stats->stawidth); |
| values[Anum_pg_statistic_stadistinct - 1] = Float4GetDatum(stats->stadistinct); |
| i = Anum_pg_statistic_stakind1 - 1; |
| for (k = 0; k < STATISTIC_NUM_SLOTS; k++) |
| { |
| values[i++] = Int16GetDatum(stats->stakind[k]); /* stakindN */ |
| } |
| i = Anum_pg_statistic_staop1 - 1; |
| for (k = 0; k < STATISTIC_NUM_SLOTS; k++) |
| { |
| values[i++] = ObjectIdGetDatum(stats->staop[k]); /* staopN */ |
| } |
| i = Anum_pg_statistic_stacoll1 - 1; |
| for (k = 0; k < STATISTIC_NUM_SLOTS; k++) |
| { |
| values[i++] = ObjectIdGetDatum(stats->stacoll[k]); /* stacollN */ |
| } |
| i = Anum_pg_statistic_stanumbers1 - 1; |
| for (k = 0; k < STATISTIC_NUM_SLOTS; k++) |
| { |
| int nnum = stats->numnumbers[k]; |
| |
| if (nnum > 0) |
| { |
| Datum *numdatums = (Datum *) palloc(nnum * sizeof(Datum)); |
| ArrayType *arry; |
| |
| for (n = 0; n < nnum; n++) |
| numdatums[n] = Float4GetDatum(stats->stanumbers[k][n]); |
| /* XXX knows more than it should about type float4: */ |
| arry = construct_array(numdatums, nnum, |
| FLOAT4OID, |
| sizeof(float4), true, TYPALIGN_INT); |
| values[i++] = PointerGetDatum(arry); /* stanumbersN */ |
| } |
| else |
| { |
| nulls[i] = true; |
| values[i++] = (Datum) 0; |
| } |
| } |
| i = Anum_pg_statistic_stavalues1 - 1; |
| for (k = 0; k < STATISTIC_NUM_SLOTS; k++) |
| { |
| if (stats->numvalues[k] > 0) |
| { |
| ArrayType *arry; |
| |
| arry = construct_array(stats->stavalues[k], |
| stats->numvalues[k], |
| stats->statypid[k], |
| stats->statyplen[k], |
| stats->statypbyval[k], |
| stats->statypalign[k]); |
| values[i++] = PointerGetDatum(arry); /* stavaluesN */ |
| } |
| else |
| { |
| nulls[i] = true; |
| values[i++] = (Datum) 0; |
| } |
| } |
| |
| /* Is there already a pg_statistic tuple for this attribute? */ |
| oldtup = SearchSysCache3(STATRELATTINH, |
| ObjectIdGetDatum(relid), |
| Int16GetDatum(stats->attr->attnum), |
| BoolGetDatum(inh)); |
| |
| if (HeapTupleIsValid(oldtup)) |
| { |
| /* Yes, replace it */ |
| stup = heap_modify_tuple(oldtup, |
| RelationGetDescr(sd), |
| values, |
| nulls, |
| replaces); |
| ReleaseSysCache(oldtup); |
| CatalogTupleUpdate(sd, &stup->t_self, stup); |
| } |
| else |
| { |
| /* No, insert new tuple */ |
| stup = heap_form_tuple(RelationGetDescr(sd), values, nulls); |
| CatalogTupleInsert(sd, stup); |
| } |
| |
| heap_freetuple(stup); |
| } |
| |
| table_close(sd, RowExclusiveLock); |
| } |
| |
| /* |
| * Standard fetch function for use by compute_stats subroutines. |
| * |
| * This exists to provide some insulation between compute_stats routines |
| * and the actual storage of the sample data. |
| */ |
| static Datum |
| std_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull) |
| { |
| int attnum = stats->tupattnum; |
| HeapTuple tuple = stats->rows[rownum]; |
| TupleDesc tupDesc = stats->tupDesc; |
| |
| return heap_getattr(tuple, attnum, tupDesc, isNull); |
| } |
| |
| /* |
| * Fetch function for analyzing index expressions. |
| * |
| * We have not bothered to construct index tuples, instead the data is |
| * just in Datum arrays. |
| */ |
| static Datum |
| ind_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull) |
| { |
| int i; |
| |
| /* exprvals and exprnulls are already offset for proper column */ |
| i = rownum * stats->rowstride; |
| *isNull = stats->exprnulls[i]; |
| return stats->exprvals[i]; |
| } |
| |
| |
| /*========================================================================== |
| * |
| * Code below this point represents the "standard" type-specific statistics |
| * analysis algorithms. This code can be replaced on a per-data-type basis |
| * by setting a nonzero value in pg_type.typanalyze. |
| * |
| *========================================================================== |
| */ |
| |
| |
| /* |
| * In PostgreSQL, WIDTH_THRESHOLD is here, but we've moved it to vacuum.h in |
| * GPDB. |
| */ |
| |
| #define swapInt(a,b) do {int _tmp; _tmp=a; a=b; b=_tmp;} while(0) |
| #define swapDatum(a,b) do {Datum _tmp; _tmp=a; a=b; b=_tmp;} while(0) |
| |
| /* |
| * Extra information used by the default analysis routines |
| */ |
| typedef struct |
| { |
| int count; /* # of duplicates */ |
| int first; /* values[] index of first occurrence */ |
| } ScalarMCVItem; |
| |
| typedef struct |
| { |
| SortSupport ssup; |
| int *tupnoLink; |
| } CompareScalarsContext; |
| |
| |
| static void compute_trivial_stats(VacAttrStatsP stats, |
| AnalyzeAttrFetchFunc fetchfunc, |
| int samplerows, |
| double totalrows); |
| static void compute_distinct_stats(VacAttrStatsP stats, |
| AnalyzeAttrFetchFunc fetchfunc, |
| int samplerows, |
| double totalrows); |
| static void compute_scalar_stats(VacAttrStatsP stats, |
| AnalyzeAttrFetchFunc fetchfunc, |
| int samplerows, |
| double totalrows); |
| static void merge_leaf_stats(VacAttrStatsP stats, |
| AnalyzeAttrFetchFunc fetchfunc, |
| int samplerows, |
| double totalrows); |
| static int compare_scalars(const void *a, const void *b, void *arg); |
| static int compare_mcvs(const void *a, const void *b); |
| static int analyze_mcv_list(int *mcv_counts, |
| int num_mcv, |
| double stadistinct, |
| double stanullfrac, |
| int samplerows, |
| double totalrows); |
| |
| |
| /* |
| * std_typanalyze -- the default type-specific typanalyze function |
| */ |
| bool |
| std_typanalyze(VacAttrStats *stats) |
| { |
| Form_pg_attribute attr = stats->attr; |
| Oid ltopr; |
| Oid eqopr; |
| StdAnalyzeData *mystats; |
| |
| /* If the attstattarget column is negative, use the default value */ |
| /* NB: it is okay to scribble on stats->attr since it's a copy */ |
| if (attr->attstattarget < 0) |
| attr->attstattarget = default_statistics_target; |
| |
| /* Look for default "<" and "=" operators for column's type */ |
| get_sort_group_operators(stats->attrtypid, |
| false, false, false, |
| <opr, &eqopr, NULL, |
| NULL); |
| |
| /* Save the operator info for compute_stats routines */ |
| mystats = (StdAnalyzeData *) palloc(sizeof(StdAnalyzeData)); |
| mystats->eqopr = eqopr; |
| mystats->eqfunc = OidIsValid(eqopr) ? get_opcode(eqopr) : InvalidOid; |
| mystats->ltopr = ltopr; |
| stats->extra_data = mystats; |
| stats->merge_stats = false; |
| |
| /* |
| * Determine which standard statistics algorithm to use |
| */ |
| List *va_cols = list_make1(makeString(NameStr(stats->attr->attname))); |
| if (get_rel_relkind(attr->attrelid) == RELKIND_PARTITIONED_TABLE && |
| !get_rel_relispartition(attr->attrelid) && |
| leaf_parts_analyzed(stats->attr->attrelid, InvalidOid, va_cols, stats->elevel) && |
| op_hashjoinable(eqopr, stats->attrtypid)) |
| { |
| stats->merge_stats = true; |
| stats->compute_stats = merge_leaf_stats; |
| stats->minrows = 300 * attr->attstattarget; |
| } |
| else |
| if (OidIsValid(eqopr) && OidIsValid(ltopr)) |
| { |
| /* Seems to be a scalar datatype */ |
| stats->compute_stats = compute_scalar_stats; |
| /*-------------------- |
| * The following choice of minrows is based on the paper |
| * "Random sampling for histogram construction: how much is enough?" |
| * by Surajit Chaudhuri, Rajeev Motwani and Vivek Narasayya, in |
| * Proceedings of ACM SIGMOD International Conference on Management |
| * of Data, 1998, Pages 436-447. Their Corollary 1 to Theorem 5 |
| * says that for table size n, histogram size k, maximum relative |
| * error in bin size f, and error probability gamma, the minimum |
| * random sample size is |
| * r = 4 * k * ln(2*n/gamma) / f^2 |
| * Taking f = 0.5, gamma = 0.01, n = 10^6 rows, we obtain |
| * r = 305.82 * k |
| * Note that because of the log function, the dependence on n is |
| * quite weak; even at n = 10^12, a 300*k sample gives <= 0.66 |
| * bin size error with probability 0.99. So there's no real need to |
| * scale for n, which is a good thing because we don't necessarily |
| * know it at this point. |
| *-------------------- |
| */ |
| stats->minrows = 300 * attr->attstattarget; |
| } |
| else if (OidIsValid(eqopr)) |
| { |
| /* We can still recognize distinct values */ |
| stats->compute_stats = compute_distinct_stats; |
| /* Might as well use the same minrows as above */ |
| stats->minrows = 300 * attr->attstattarget; |
| } |
| else |
| { |
| /* Can't do much but the trivial stuff */ |
| stats->compute_stats = compute_trivial_stats; |
| /* Might as well use the same minrows as above */ |
| stats->minrows = 300 * attr->attstattarget; |
| } |
| list_free(va_cols); |
| return true; |
| } |
| |
| |
| /* |
| * compute_trivial_stats() -- compute very basic column statistics |
| * |
| * We use this when we cannot find a hash "=" operator for the datatype. |
| * |
| * We determine the fraction of non-null rows and the average datum width. |
| */ |
| static void |
| compute_trivial_stats(VacAttrStatsP stats, |
| AnalyzeAttrFetchFunc fetchfunc, |
| int samplerows, |
| double totalrows) |
| { |
| int i; |
| int null_cnt = 0; |
| int nonnull_cnt = 0; |
| double total_width = 0; |
| bool is_varlena = (!stats->attrtype->typbyval && |
| stats->attrtype->typlen == -1); |
| bool is_varwidth = (!stats->attrtype->typbyval && |
| stats->attrtype->typlen < 0); |
| |
| for (i = 0; i < samplerows; i++) |
| { |
| Datum value; |
| bool isnull; |
| |
| vacuum_delay_point(); |
| |
| value = fetchfunc(stats, i, &isnull); |
| |
| /* Check for null/nonnull */ |
| if (isnull) |
| { |
| null_cnt++; |
| continue; |
| } |
| nonnull_cnt++; |
| |
| /* |
| * If it's a variable-width field, add up widths for average width |
| * calculation. Note that if the value is toasted, we use the toasted |
| * width. We don't bother with this calculation if it's a fixed-width |
| * type. |
| */ |
| if (is_varlena) |
| { |
| total_width += VARSIZE_ANY(DatumGetPointer(value)); |
| } |
| else if (is_varwidth) |
| { |
| /* must be cstring */ |
| total_width += strlen(DatumGetCString(value)) + 1; |
| } |
| } |
| |
| /* We can only compute average width if we found some non-null values. */ |
| if (nonnull_cnt > 0) |
| { |
| stats->stats_valid = true; |
| /* Do the simple null-frac and width stats */ |
| stats->stanullfrac = (double) null_cnt / (double) samplerows; |
| if (is_varwidth) |
| stats->stawidth = total_width / (double) nonnull_cnt; |
| else |
| stats->stawidth = stats->attrtype->typlen; |
| stats->stadistinct = 0.0; /* "unknown" */ |
| } |
| else if (null_cnt > 0) |
| { |
| /* We found only nulls; assume the column is entirely null */ |
| stats->stats_valid = true; |
| stats->stanullfrac = 1.0; |
| if (is_varwidth) |
| stats->stawidth = 0; /* "unknown" */ |
| else |
| stats->stawidth = stats->attrtype->typlen; |
| stats->stadistinct = 0.0; /* "unknown" */ |
| } |
| } |
| |
| |
| /* |
| * compute_distinct_stats() -- compute column statistics including ndistinct |
| * |
| * We use this when we can find only an "=" operator for the datatype. |
| * |
| * We determine the fraction of non-null rows, the average width, the |
| * most common values, and the (estimated) number of distinct values. |
| * |
| * The most common values are determined by brute force: we keep a list |
| * of previously seen values, ordered by number of times seen, as we scan |
| * the samples. A newly seen value is inserted just after the last |
| * multiply-seen value, causing the bottommost (oldest) singly-seen value |
| * to drop off the list. The accuracy of this method, and also its cost, |
| * depend mainly on the length of the list we are willing to keep. |
| */ |
| static void |
| compute_distinct_stats(VacAttrStatsP stats, |
| AnalyzeAttrFetchFunc fetchfunc, |
| int samplerows, |
| double totalrows) |
| { |
| int i; |
| int null_cnt = 0; |
| int nonnull_cnt = 0; |
| int toowide_cnt = 0; |
| double total_width = 0; |
| bool is_varlena = (!stats->attrtype->typbyval && |
| stats->attrtype->typlen == -1); |
| bool is_varwidth = (!stats->attrtype->typbyval && |
| stats->attrtype->typlen < 0); |
| FmgrInfo f_cmpeq; |
| typedef struct |
| { |
| Datum value; |
| int count; |
| } TrackItem; |
| TrackItem *track; |
| int track_cnt, |
| track_max; |
| int num_mcv = stats->attr->attstattarget; |
| StdAnalyzeData *mystats = (StdAnalyzeData *) stats->extra_data; |
| |
| /* |
| * We track up to 2*n values for an n-element MCV list; but at least 10 |
| */ |
| track_max = 2 * num_mcv; |
| if (track_max < 10) |
| track_max = 10; |
| track = (TrackItem *) palloc(track_max * sizeof(TrackItem)); |
| track_cnt = 0; |
| |
| fmgr_info(mystats->eqfunc, &f_cmpeq); |
| |
| stats->stahll = (bytea *)gp_hyperloglog_init_def(); |
| |
| ereport(DEBUG2, |
| (errmsg("Computing Minimal Stats for column %s", |
| get_attname(stats->attr->attrelid, stats->attr->attnum, false)))); |
| |
| for (i = 0; i < samplerows; i++) |
| { |
| Datum value; |
| bool isnull; |
| bool match; |
| int firstcount1, |
| j; |
| |
| vacuum_delay_point(); |
| |
| value = fetchfunc(stats, i, &isnull); |
| |
| /* Check for null/nonnull */ |
| if (isnull) |
| { |
| null_cnt++; |
| continue; |
| } |
| nonnull_cnt++; |
| |
| stats->stahll = (bytea *)gp_hyperloglog_add_item((GpHLLCounter) stats->stahll, value, stats->attr->attlen, stats->attr->attbyval, stats->attr->attalign); |
| |
| /* |
| * If it's a variable-width field, add up widths for average width |
| * calculation. Note that if the value is toasted, we use the toasted |
| * width. We don't bother with this calculation if it's a fixed-width |
| * type. |
| */ |
| if (is_varlena) |
| { |
| total_width += VARSIZE_ANY(DatumGetPointer(value)); |
| |
| /* |
| * If the value is toasted, we want to detoast it just once to |
| * avoid repeated detoastings and resultant excess memory usage |
| * during the comparisons. Also, check to see if the value is |
| * excessively wide, and if so don't detoast at all --- just |
| * ignore the value. |
| */ |
| if (toast_raw_datum_size(value) > WIDTH_THRESHOLD) |
| { |
| toowide_cnt++; |
| continue; |
| } |
| value = PointerGetDatum(PG_DETOAST_DATUM(value)); |
| } |
| else if (is_varwidth) |
| { |
| /* must be cstring */ |
| total_width += strlen(DatumGetCString(value)) + 1; |
| } |
| |
| /* |
| * See if the value matches anything we're already tracking. |
| */ |
| match = false; |
| firstcount1 = track_cnt; |
| for (j = 0; j < track_cnt; j++) |
| { |
| if (DatumGetBool(FunctionCall2Coll(&f_cmpeq, |
| stats->attrcollid, |
| value, track[j].value))) |
| { |
| match = true; |
| break; |
| } |
| if (j < firstcount1 && track[j].count == 1) |
| firstcount1 = j; |
| } |
| |
| if (match) |
| { |
| /* Found a match */ |
| track[j].count++; |
| /* This value may now need to "bubble up" in the track list */ |
| while (j > 0 && track[j].count > track[j - 1].count) |
| { |
| swapDatum(track[j].value, track[j - 1].value); |
| swapInt(track[j].count, track[j - 1].count); |
| j--; |
| } |
| } |
| else |
| { |
| /* No match. Insert at head of count-1 list */ |
| if (track_cnt < track_max) |
| track_cnt++; |
| for (j = track_cnt - 1; j > firstcount1; j--) |
| { |
| track[j].value = track[j - 1].value; |
| track[j].count = track[j - 1].count; |
| } |
| if (firstcount1 < track_cnt) |
| { |
| track[firstcount1].value = value; |
| track[firstcount1].count = 1; |
| } |
| } |
| } |
| |
| /* We can only compute real stats if we found some non-null values. */ |
| if (nonnull_cnt > 0) |
| { |
| int nmultiple, |
| summultiple; |
| |
| stats->stats_valid = true; |
| /* Do the simple null-frac and width stats */ |
| stats->stanullfrac = (double) null_cnt / (double) samplerows; |
| if (is_varwidth) |
| stats->stawidth = (total_width + stats->totalwidelength) / (double) (nonnull_cnt + stats->widerow_num); |
| else |
| stats->stawidth = stats->attrtype->typlen; |
| |
| /* Count the number of values we found multiple times */ |
| summultiple = 0; |
| for (nmultiple = 0; nmultiple < track_cnt; nmultiple++) |
| { |
| if (track[nmultiple].count == 1) |
| break; |
| summultiple += track[nmultiple].count; |
| } |
| |
| ((GpHLLCounter) (stats->stahll))->nmultiples = nmultiple; |
| ((GpHLLCounter) (stats->stahll))->ndistinct = track_cnt; |
| ((GpHLLCounter) (stats->stahll))->samplerows = samplerows; |
| |
| if (nmultiple == 0) |
| { |
| /* |
| * If we found no repeated non-null values, assume it's a unique |
| * column; but be sure to discount for any nulls we found. |
| */ |
| stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac); |
| } |
| else if (track_cnt < track_max && toowide_cnt == 0 && |
| nmultiple == track_cnt) |
| { |
| /* |
| * Our track list includes every value in the sample, and every |
| * value appeared more than once. Assume the column has just |
| * these values. (This case is meant to address columns with |
| * small, fixed sets of possible values, such as boolean or enum |
| * columns. If there are any values that appear just once in the |
| * sample, including too-wide values, we should assume that that's |
| * not what we're dealing with.) |
| */ |
| stats->stadistinct = track_cnt; |
| } |
| else |
| { |
| /*---------- |
| * Estimate the number of distinct values using the estimator |
| * proposed by Haas and Stokes in IBM Research Report RJ 10025: |
| * n*d / (n - f1 + f1*n/N) |
| * where f1 is the number of distinct values that occurred |
| * exactly once in our sample of n rows (from a total of N), |
| * and d is the total number of distinct values in the sample. |
| * This is their Duj1 estimator; the other estimators they |
| * recommend are considerably more complex, and are numerically |
| * very unstable when n is much smaller than N. |
| * |
| * In this calculation, we consider only non-nulls. We used to |
| * include rows with null values in the n and N counts, but that |
| * leads to inaccurate answers in columns with many nulls, and |
| * it's intuitively bogus anyway considering the desired result is |
| * the number of distinct non-null values. |
| * |
| * We assume (not very reliably!) that all the multiply-occurring |
| * values are reflected in the final track[] list, and the other |
| * nonnull values all appeared but once. (XXX this usually |
| * results in a drastic overestimate of ndistinct. Can we do |
| * any better?) |
| *---------- |
| */ |
| int f1 = nonnull_cnt - summultiple; |
| int d = f1 + nmultiple; |
| double n = samplerows - null_cnt; |
| double N = totalrows * (1.0 - stats->stanullfrac); |
| double stadistinct; |
| |
| /* N == 0 shouldn't happen, but just in case ... */ |
| if (N > 0) |
| stadistinct = (n * d) / ((n - f1) + f1 * n / N); |
| else |
| stadistinct = 0; |
| |
| /* Clamp to sane range in case of roundoff error */ |
| if (stadistinct < d) |
| stadistinct = d; |
| if (stadistinct > N) |
| stadistinct = N; |
| /* And round to integer */ |
| stats->stadistinct = floor(stadistinct + 0.5); |
| } |
| |
| /* |
| * If we estimated the number of distinct values at more than 10% of |
| * the total row count (a very arbitrary limit), then assume that |
| * stadistinct should scale with the row count rather than be a fixed |
| * value. |
| */ |
| if (stats->stadistinct > 0.1 * totalrows) |
| stats->stadistinct = -(stats->stadistinct / totalrows); |
| |
| /* |
| * Decide how many values are worth storing as most-common values. If |
| * we are able to generate a complete MCV list (all the values in the |
| * sample will fit, and we think these are all the ones in the table), |
| * then do so. Otherwise, store only those values that are |
| * significantly more common than the values not in the list. |
| * |
| * Note: the first of these cases is meant to address columns with |
| * small, fixed sets of possible values, such as boolean or enum |
| * columns. If we can *completely* represent the column population by |
| * an MCV list that will fit into the stats target, then we should do |
| * so and thus provide the planner with complete information. But if |
| * the MCV list is not complete, it's generally worth being more |
| * selective, and not just filling it all the way up to the stats |
| * target. |
| */ |
| if (track_cnt < track_max && toowide_cnt == 0 && |
| stats->stadistinct > 0 && |
| track_cnt <= num_mcv) |
| { |
| /* Track list includes all values seen, and all will fit */ |
| num_mcv = track_cnt; |
| } |
| else |
| { |
| int *mcv_counts; |
| |
| /* Incomplete list; decide how many values are worth keeping */ |
| if (num_mcv > track_cnt) |
| num_mcv = track_cnt; |
| |
| if (num_mcv > 0) |
| { |
| mcv_counts = (int *) palloc(num_mcv * sizeof(int)); |
| for (i = 0; i < num_mcv; i++) |
| mcv_counts[i] = track[i].count; |
| |
| num_mcv = analyze_mcv_list(mcv_counts, num_mcv, |
| stats->stadistinct, |
| stats->stanullfrac, |
| samplerows, totalrows); |
| } |
| } |
| |
| /* Generate MCV slot entry */ |
| if (num_mcv > 0) |
| { |
| MemoryContext old_context; |
| Datum *mcv_values; |
| float4 *mcv_freqs; |
| |
| /* Must copy the target values into anl_context */ |
| old_context = MemoryContextSwitchTo(stats->anl_context); |
| mcv_values = (Datum *) palloc(num_mcv * sizeof(Datum)); |
| mcv_freqs = (float4 *) palloc(num_mcv * sizeof(float4)); |
| for (i = 0; i < num_mcv; i++) |
| { |
| mcv_values[i] = datumCopy(track[i].value, |
| stats->attrtype->typbyval, |
| stats->attrtype->typlen); |
| mcv_freqs[i] = (double) track[i].count / (double) samplerows; |
| } |
| MemoryContextSwitchTo(old_context); |
| |
| stats->stakind[0] = STATISTIC_KIND_MCV; |
| stats->staop[0] = mystats->eqopr; |
| stats->stacoll[0] = stats->attrcollid; |
| stats->stanumbers[0] = mcv_freqs; |
| stats->numnumbers[0] = num_mcv; |
| stats->stavalues[0] = mcv_values; |
| stats->numvalues[0] = num_mcv; |
| |
| /* |
| * Accept the defaults for stats->statypid and others. They have |
| * been set before we were called (see vacuum.h) |
| */ |
| } |
| } |
| else if (null_cnt > 0) |
| { |
| /* We found only nulls; assume the column is entirely null */ |
| stats->stats_valid = true; |
| stats->stanullfrac = 1.0; |
| if (is_varwidth) |
| stats->stawidth = 0; /* "unknown" */ |
| else |
| stats->stawidth = stats->attrtype->typlen; |
| stats->stadistinct = 0.0; /* "unknown" */ |
| } |
| |
| /* We don't need to bother cleaning up any of our temporary palloc's */ |
| } |
| |
| |
| /* |
| * compute_scalar_stats() -- compute column statistics |
| * |
| * We use this when we can find "=" and "<" operators for the datatype. |
| * |
| * We determine the fraction of non-null rows, the average width, the |
| * most common values, the (estimated) number of distinct values, the |
| * distribution histogram, and the correlation of physical to logical order. |
| * |
| * The desired stats can be determined fairly easily after sorting the |
| * data values into order. |
| */ |
| static void |
| compute_scalar_stats(VacAttrStatsP stats, |
| AnalyzeAttrFetchFunc fetchfunc, |
| int samplerows, |
| double totalrows) |
| { |
| int i; |
| int null_cnt = 0; |
| int nonnull_cnt = 0; |
| int toowide_cnt = 0; |
| double total_width = 0; |
| bool is_varlena = (!stats->attrtype->typbyval && |
| stats->attrtype->typlen == -1); |
| bool is_varwidth = (!stats->attrtype->typbyval && |
| stats->attrtype->typlen < 0); |
| double corr_xysum; |
| SortSupportData ssup; |
| ScalarItem *values; |
| int values_cnt = 0; |
| int *tupnoLink; |
| ScalarMCVItem *track; |
| int track_cnt = 0; |
| int num_mcv = stats->attr->attstattarget; |
| int num_bins = stats->attr->attstattarget; |
| StdAnalyzeData *mystats = (StdAnalyzeData *) stats->extra_data; |
| |
| values = (ScalarItem *) palloc(samplerows * sizeof(ScalarItem)); |
| tupnoLink = (int *) palloc(samplerows * sizeof(int)); |
| track = (ScalarMCVItem *) palloc(num_mcv * sizeof(ScalarMCVItem)); |
| |
| memset(&ssup, 0, sizeof(ssup)); |
| ssup.ssup_cxt = CurrentMemoryContext; |
| ssup.ssup_collation = stats->attrcollid; |
| ssup.ssup_nulls_first = false; |
| |
| /* |
| * For now, don't perform abbreviated key conversion, because full values |
| * are required for MCV slot generation. Supporting that optimization |
| * would necessitate teaching compare_scalars() to call a tie-breaker. |
| */ |
| ssup.abbreviate = false; |
| |
| PrepareSortSupportFromOrderingOp(mystats->ltopr, &ssup); |
| |
| /* Initialize HLL counter to be stored in stats */ |
| stats->stahll = (bytea *)gp_hyperloglog_init_def(); |
| |
| ereport(DEBUG2, |
| (errmsg("Computing Scalar Stats for column %s", |
| get_attname(stats->attr->attrelid, stats->attr->attnum, false)))); |
| |
| /* Initial scan to find sortable values */ |
| for (i = 0; i < samplerows; i++) |
| { |
| Datum value; |
| bool isnull; |
| |
| vacuum_delay_point(); |
| |
| value = fetchfunc(stats, i, &isnull); |
| |
| /* Check for null/nonnull */ |
| if (isnull) |
| { |
| null_cnt++; |
| continue; |
| } |
| nonnull_cnt++; |
| |
| stats->stahll = (bytea *)gp_hyperloglog_add_item((GpHLLCounter) stats->stahll, value, stats->attr->attlen, stats->attr->attbyval, stats->attr->attalign); |
| |
| /* |
| * If it's a variable-width field, add up widths for average width |
| * calculation. Note that if the value is toasted, we use the toasted |
| * width. We don't bother with this calculation if it's a fixed-width |
| * type. |
| */ |
| if (is_varlena) |
| { |
| total_width += VARSIZE_ANY(DatumGetPointer(value)); |
| |
| /* |
| * If the value is toasted, we want to detoast it just once to |
| * avoid repeated detoastings and resultant excess memory usage |
| * during the comparisons. Also, check to see if the value is |
| * excessively wide, and if so don't detoast at all --- just |
| * ignore the value. |
| */ |
| if (toast_raw_datum_size(value) > WIDTH_THRESHOLD) |
| { |
| toowide_cnt++; |
| continue; |
| } |
| value = PointerGetDatum(PG_DETOAST_DATUM(value)); |
| } |
| else if (is_varwidth) |
| { |
| /* must be cstring */ |
| total_width += strlen(DatumGetCString(value)) + 1; |
| } |
| |
| /* Add it to the list to be sorted */ |
| values[values_cnt].value = value; |
| values[values_cnt].tupno = values_cnt; |
| tupnoLink[values_cnt] = values_cnt; |
| values_cnt++; |
| } |
| |
| /* We can only compute real stats if we found some sortable values. */ |
| if (values_cnt > 0) |
| { |
| int ndistinct, /* # distinct values in sample */ |
| nmultiple, /* # that appear multiple times */ |
| num_hist, |
| dups_cnt; |
| int slot_idx = 0; |
| CompareScalarsContext cxt; |
| |
| /* Sort the collected values */ |
| cxt.ssup = &ssup; |
| cxt.tupnoLink = tupnoLink; |
| qsort_arg((void *) values, values_cnt, sizeof(ScalarItem), |
| compare_scalars, (void *) &cxt); |
| |
| /* |
| * Now scan the values in order, find the most common ones, and also |
| * accumulate ordering-correlation statistics. |
| * |
| * To determine which are most common, we first have to count the |
| * number of duplicates of each value. The duplicates are adjacent in |
| * the sorted list, so a brute-force approach is to compare successive |
| * datum values until we find two that are not equal. However, that |
| * requires N-1 invocations of the datum comparison routine, which are |
| * completely redundant with work that was done during the sort. (The |
| * sort algorithm must at some point have compared each pair of items |
| * that are adjacent in the sorted order; otherwise it could not know |
| * that it's ordered the pair correctly.) We exploit this by having |
| * compare_scalars remember the highest tupno index that each |
| * ScalarItem has been found equal to. At the end of the sort, a |
| * ScalarItem's tupnoLink will still point to itself if and only if it |
| * is the last item of its group of duplicates (since the group will |
| * be ordered by tupno). |
| */ |
| corr_xysum = 0; |
| ndistinct = 0; |
| nmultiple = 0; |
| dups_cnt = 0; |
| for (i = 0; i < values_cnt; i++) |
| { |
| int tupno = values[i].tupno; |
| |
| corr_xysum += ((double) i) * ((double) tupno); |
| dups_cnt++; |
| if (tupnoLink[tupno] == tupno) |
| { |
| /* Reached end of duplicates of this value */ |
| ndistinct++; |
| if (dups_cnt > 1) |
| { |
| nmultiple++; |
| if (track_cnt < num_mcv || |
| dups_cnt > track[track_cnt - 1].count) |
| { |
| /* |
| * Found a new item for the mcv list; find its |
| * position, bubbling down old items if needed. Loop |
| * invariant is that j points at an empty/ replaceable |
| * slot. |
| */ |
| int j; |
| |
| if (track_cnt < num_mcv) |
| track_cnt++; |
| for (j = track_cnt - 1; j > 0; j--) |
| { |
| if (dups_cnt <= track[j - 1].count) |
| break; |
| track[j].count = track[j - 1].count; |
| track[j].first = track[j - 1].first; |
| } |
| track[j].count = dups_cnt; |
| track[j].first = i + 1 - dups_cnt; |
| } |
| } |
| dups_cnt = 0; |
| } |
| } |
| |
| stats->stats_valid = true; |
| /* Do the simple null-frac and width stats */ |
| stats->stanullfrac = (double) null_cnt / (double) samplerows; |
| if (is_varwidth) |
| stats->stawidth = (total_width + stats->totalwidelength) / (double) (nonnull_cnt + stats->widerow_num); |
| else |
| stats->stawidth = stats->attrtype->typlen; |
| |
| // interpolate NDV calculation based on the hll distinct count |
| // for each column in leaf partitions which will be used later |
| // to merge root stats |
| ((GpHLLCounter) (stats->stahll))->nmultiples = nmultiple; |
| ((GpHLLCounter) (stats->stahll))->ndistinct = ndistinct; |
| ((GpHLLCounter) (stats->stahll))->samplerows = samplerows; |
| |
| if (nmultiple == 0) |
| { |
| /* |
| * If we found no repeated non-null values, assume it's a unique |
| * column; but be sure to discount for any nulls we found. |
| */ |
| stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac); |
| } |
| else if (toowide_cnt == 0 && nmultiple == ndistinct) |
| { |
| /* |
| * Every value in the sample appeared more than once. Assume the |
| * column has just these values. (This case is meant to address |
| * columns with small, fixed sets of possible values, such as |
| * boolean or enum columns. If there are any values that appear |
| * just once in the sample, including too-wide values, we should |
| * assume that that's not what we're dealing with.) |
| */ |
| stats->stadistinct = ndistinct; |
| } |
| else |
| { |
| /*---------- |
| * Estimate the number of distinct values using the estimator |
| * proposed by Haas and Stokes in IBM Research Report RJ 10025: |
| * n*d / (n - f1 + f1*n/N) |
| * where f1 is the number of distinct values that occurred |
| * exactly once in our sample of n rows (from a total of N), |
| * and d is the total number of distinct values in the sample. |
| * This is their Duj1 estimator; the other estimators they |
| * recommend are considerably more complex, and are numerically |
| * very unstable when n is much smaller than N. |
| * |
| * In this calculation, we consider only non-nulls. We used to |
| * include rows with null values in the n and N counts, but that |
| * leads to inaccurate answers in columns with many nulls, and |
| * it's intuitively bogus anyway considering the desired result is |
| * the number of distinct non-null values. |
| * |
| * Overwidth values are assumed to have been distinct. |
| *---------- |
| */ |
| int f1 = ndistinct - nmultiple + toowide_cnt; |
| int d = f1 + nmultiple; |
| double n = samplerows - null_cnt; |
| double N = totalrows * (1.0 - stats->stanullfrac); |
| double stadistinct; |
| |
| /* N == 0 shouldn't happen, but just in case ... */ |
| if (N > 0) |
| stadistinct = (n * d) / ((n - f1) + f1 * n / N); |
| else |
| stadistinct = 0; |
| |
| /* Clamp to sane range in case of roundoff error */ |
| if (stadistinct < d) |
| stadistinct = d; |
| if (stadistinct > N) |
| stadistinct = N; |
| /* And round to integer */ |
| stats->stadistinct = floor(stadistinct + 0.5); |
| } |
| |
| /* |
| * For FULLSCAN HLL, get ndistinct from the GpHLLCounter |
| * instead of computing it |
| */ |
| if (stats->stahll_full != NULL) |
| { |
| GpHLLCounter hLLFull = (GpHLLCounter) DatumGetByteaP(stats->stahll_full); |
| GpHLLCounter hllFull_copy = gp_hll_copy(hLLFull); |
| stats->stadistinct = round(gp_hyperloglog_estimate(hllFull_copy)); |
| pfree(hllFull_copy); |
| if ((fabs(totalrows - stats->stadistinct) / (float) totalrows) < 0.05) |
| { |
| stats->stadistinct = -1; |
| } |
| |
| } |
| /* |
| * If we estimated the number of distinct values at more than 10% of |
| * the total row count (a very arbitrary limit), then assume that |
| * stadistinct should scale with the row count rather than be a fixed |
| * value. |
| */ |
| if (stats->stadistinct > 0.1 * totalrows) |
| stats->stadistinct = -(stats->stadistinct / totalrows); |
| |
| /* |
| * Decide how many values are worth storing as most-common values. If |
| * we are able to generate a complete MCV list (all the values in the |
| * sample will fit, and we think these are all the ones in the table), |
| * then do so. Otherwise, store only those values that are |
| * significantly more common than the values not in the list. |
| * |
| * Note: the first of these cases is meant to address columns with |
| * small, fixed sets of possible values, such as boolean or enum |
| * columns. If we can *completely* represent the column population by |
| * an MCV list that will fit into the stats target, then we should do |
| * so and thus provide the planner with complete information. But if |
| * the MCV list is not complete, it's generally worth being more |
| * selective, and not just filling it all the way up to the stats |
| * target. |
| */ |
| if (track_cnt == ndistinct && toowide_cnt == 0 && |
| stats->stadistinct > 0 && |
| track_cnt <= num_mcv) |
| { |
| /* Track list includes all values seen, and all will fit */ |
| num_mcv = track_cnt; |
| } |
| else |
| { |
| int *mcv_counts; |
| |
| /* Incomplete list; decide how many values are worth keeping */ |
| if (num_mcv > track_cnt) |
| num_mcv = track_cnt; |
| |
| if (num_mcv > 0) |
| { |
| mcv_counts = (int *) palloc(num_mcv * sizeof(int)); |
| for (i = 0; i < num_mcv; i++) |
| mcv_counts[i] = track[i].count; |
| |
| num_mcv = analyze_mcv_list(mcv_counts, num_mcv, |
| stats->stadistinct, |
| stats->stanullfrac, |
| samplerows, totalrows); |
| } |
| } |
| |
| /* Generate MCV slot entry */ |
| if (num_mcv > 0) |
| { |
| MemoryContext old_context; |
| Datum *mcv_values; |
| float4 *mcv_freqs; |
| |
| /* Must copy the target values into anl_context */ |
| old_context = MemoryContextSwitchTo(stats->anl_context); |
| mcv_values = (Datum *) palloc(num_mcv * sizeof(Datum)); |
| mcv_freqs = (float4 *) palloc(num_mcv * sizeof(float4)); |
| for (i = 0; i < num_mcv; i++) |
| { |
| mcv_values[i] = datumCopy(values[track[i].first].value, |
| stats->attrtype->typbyval, |
| stats->attrtype->typlen); |
| mcv_freqs[i] = (double) track[i].count / (double) samplerows; |
| } |
| MemoryContextSwitchTo(old_context); |
| |
| stats->stakind[slot_idx] = STATISTIC_KIND_MCV; |
| stats->staop[slot_idx] = mystats->eqopr; |
| stats->stacoll[slot_idx] = stats->attrcollid; |
| stats->stanumbers[slot_idx] = mcv_freqs; |
| stats->numnumbers[slot_idx] = num_mcv; |
| stats->stavalues[slot_idx] = mcv_values; |
| stats->numvalues[slot_idx] = num_mcv; |
| |
| /* |
| * Accept the defaults for stats->statypid and others. They have |
| * been set before we were called (see vacuum.h) |
| */ |
| slot_idx++; |
| } |
| |
| /* |
| * Generate a histogram slot entry if there are at least two distinct |
| * values not accounted for in the MCV list. (This ensures the |
| * histogram won't collapse to empty or a singleton.) |
| */ |
| num_hist = ndistinct - num_mcv; |
| if (num_hist > num_bins) |
| num_hist = num_bins + 1; |
| if (num_hist >= 2) |
| { |
| MemoryContext old_context; |
| Datum *hist_values; |
| int nvals; |
| int pos, |
| posfrac, |
| delta, |
| deltafrac; |
| |
| /* Sort the MCV items into position order to speed next loop */ |
| qsort((void *) track, num_mcv, |
| sizeof(ScalarMCVItem), compare_mcvs); |
| |
| /* |
| * Collapse out the MCV items from the values[] array. |
| * |
| * Note we destroy the values[] array here... but we don't need it |
| * for anything more. We do, however, still need values_cnt. |
| * nvals will be the number of remaining entries in values[]. |
| */ |
| if (num_mcv > 0) |
| { |
| int src, |
| dest; |
| int j; |
| |
| src = dest = 0; |
| j = 0; /* index of next interesting MCV item */ |
| while (src < values_cnt) |
| { |
| int ncopy; |
| |
| if (j < num_mcv) |
| { |
| int first = track[j].first; |
| |
| if (src >= first) |
| { |
| /* advance past this MCV item */ |
| src = first + track[j].count; |
| j++; |
| continue; |
| } |
| ncopy = first - src; |
| } |
| else |
| ncopy = values_cnt - src; |
| memmove(&values[dest], &values[src], |
| ncopy * sizeof(ScalarItem)); |
| src += ncopy; |
| dest += ncopy; |
| } |
| nvals = dest; |
| } |
| else |
| nvals = values_cnt; |
| Assert(nvals >= num_hist); |
| |
| /* Must copy the target values into anl_context */ |
| old_context = MemoryContextSwitchTo(stats->anl_context); |
| hist_values = (Datum *) palloc(num_hist * sizeof(Datum)); |
| |
| /* |
| * The object of this loop is to copy the first and last values[] |
| * entries along with evenly-spaced values in between. So the |
| * i'th value is values[(i * (nvals - 1)) / (num_hist - 1)]. But |
| * computing that subscript directly risks integer overflow when |
| * the stats target is more than a couple thousand. Instead we |
| * add (nvals - 1) / (num_hist - 1) to pos at each step, tracking |
| * the integral and fractional parts of the sum separately. |
| */ |
| delta = (nvals - 1) / (num_hist - 1); |
| deltafrac = (nvals - 1) % (num_hist - 1); |
| pos = posfrac = 0; |
| |
| for (i = 0; i < num_hist; i++) |
| { |
| hist_values[i] = datumCopy(values[pos].value, |
| stats->attrtype->typbyval, |
| stats->attrtype->typlen); |
| pos += delta; |
| posfrac += deltafrac; |
| if (posfrac >= (num_hist - 1)) |
| { |
| /* fractional part exceeds 1, carry to integer part */ |
| pos++; |
| posfrac -= (num_hist - 1); |
| } |
| } |
| |
| MemoryContextSwitchTo(old_context); |
| |
| stats->stakind[slot_idx] = STATISTIC_KIND_HISTOGRAM; |
| stats->staop[slot_idx] = mystats->ltopr; |
| stats->stacoll[slot_idx] = stats->attrcollid; |
| stats->stavalues[slot_idx] = hist_values; |
| stats->numvalues[slot_idx] = num_hist; |
| |
| /* |
| * Accept the defaults for stats->statypid and others. They have |
| * been set before we were called (see vacuum.h) |
| */ |
| slot_idx++; |
| } |
| |
| /* Generate a correlation entry if there are multiple values */ |
| if (values_cnt > 1) |
| { |
| MemoryContext old_context; |
| float4 *corrs; |
| /* Must copy the target values into anl_context */ |
| old_context = MemoryContextSwitchTo(stats->anl_context); |
| corrs = (float4 *) palloc(sizeof(float4)); |
| MemoryContextSwitchTo(old_context); |
| if (stats->partitiontbl_qd ) |
| { |
| if (!stats->corrnull) |
| { |
| corrs[0] = stats->corrval; |
| stats->stakind[slot_idx] = STATISTIC_KIND_CORRELATION; |
| stats->staop[slot_idx] = mystats->ltopr; |
| stats->stacoll[slot_idx] = stats->attrcollid; |
| stats->stanumbers[slot_idx] = corrs; |
| stats->numnumbers[slot_idx] = 1; |
| slot_idx++; |
| } |
| } |
| else /* this is on QE */ |
| { |
| double corr_xsum, |
| corr_x2sum; |
| |
| /*---------- |
| * Since we know the x and y value sets are both |
| * 0, 1, ..., values_cnt-1 |
| * we have sum(x) = sum(y) = |
| * (values_cnt-1)*values_cnt / 2 |
| * and sum(x^2) = sum(y^2) = |
| * (values_cnt-1)*values_cnt*(2*values_cnt-1) / 6. |
| *---------- |
| */ |
| corr_xsum = ((double) (values_cnt - 1)) * |
| ((double) values_cnt) / 2.0; |
| corr_x2sum = ((double) (values_cnt - 1)) * |
| ((double) values_cnt) * (double) (2 * values_cnt - 1) / 6.0; |
| /* And the correlation coefficient reduces to */ |
| corrs[0] = (values_cnt * corr_xysum - corr_xsum * corr_xsum) / |
| (values_cnt * corr_x2sum - corr_xsum * corr_xsum); |
| |
| stats->stakind[slot_idx] = STATISTIC_KIND_CORRELATION; |
| stats->staop[slot_idx] = mystats->ltopr; |
| stats->stacoll[slot_idx] = stats->attrcollid; |
| stats->stanumbers[slot_idx] = corrs; |
| stats->numnumbers[slot_idx] = 1; |
| slot_idx++; |
| } |
| } |
| |
| /* set the ndv from segments */ |
| if (stats->stadistinctbyseg != 0 && slot_idx < STATISTIC_NUM_SLOTS - 1 |
| && !OidIsValid(stats->attrtype->typanalyze)) // should not be the array |
| { |
| MemoryContext old_context; |
| Datum *ndvbs; |
| |
| /* Must copy the target values into anl_context */ |
| old_context = MemoryContextSwitchTo(stats->anl_context); |
| ndvbs = (Datum *) palloc(sizeof(Datum)); |
| ndvbs[0] = Float8GetDatum(stats->stadistinctbyseg); |
| MemoryContextSwitchTo(old_context); |
| |
| stats->stakind[slot_idx] = STATISTIC_KIND_NDV_BY_SEGMENTS; |
| stats->staop[slot_idx] = mystats->ltopr; |
| stats->stacoll[slot_idx] = stats->attrcollid; |
| |
| stats->statypid[slot_idx] = FLOAT8OID; |
| stats->statyplen[slot_idx] = 8; |
| stats->statypbyval[slot_idx] = true; |
| stats->statypalign[slot_idx] = TYPALIGN_DOUBLE; |
| |
| stats->stavalues[slot_idx] = ndvbs; |
| stats->numvalues[slot_idx] = 1; |
| slot_idx++; |
| } |
| } |
| else if (nonnull_cnt > 0) |
| { |
| /* We found some non-null values, but they were all too wide */ |
| Assert(nonnull_cnt == toowide_cnt); |
| stats->stats_valid = true; |
| /* Do the simple null-frac and width stats */ |
| stats->stanullfrac = (double) null_cnt / (double) samplerows; |
| if (is_varwidth) |
| stats->stawidth = (total_width + stats->totalwidelength) / (double) (nonnull_cnt + stats->widerow_num); |
| else |
| stats->stawidth = stats->attrtype->typlen; |
| /* Assume all too-wide values are distinct, so it's a unique column */ |
| stats->stadistinct = -1.0 * (1.0 - stats->stanullfrac); |
| } |
| else if (null_cnt > 0) |
| { |
| /* We found only nulls; assume the column is entirely null */ |
| stats->stats_valid = true; |
| stats->stanullfrac = 1.0; |
| if (is_varwidth) |
| stats->stawidth = 0; /* "unknown" */ |
| else |
| stats->stawidth = stats->attrtype->typlen; |
| stats->stadistinct = 0.0; /* "unknown" */ |
| } |
| else |
| { |
| /* |
| * ORCA complains if a column has no statistics whatsoever, so store |
| * either the best we can figure out given what we have, or zero in |
| * case we don't have enough. |
| */ |
| stats->stats_valid = true; |
| if (samplerows) |
| stats->stanullfrac = (double) null_cnt / (double) samplerows; |
| else |
| stats->stanullfrac = 0.0; |
| if (is_varwidth) |
| stats->stawidth = 0; /* "unknown" */ |
| else |
| stats->stawidth = stats->attrtype->typlen; |
| stats->stadistinct = 0.0; /* "unknown" */ |
| } |
| |
| /* We don't need to bother cleaning up any of our temporary palloc's */ |
| } |
| |
| /* |
| * merge_leaf_stats() -- merge leaf stats for the root |
| * |
| * We use this when we can find "=" and "<" operators for the datatype. |
| * |
| * This is only used when the relation is the root partition and merges |
| * the statistics available in pg_statistic for the leaf partitions. |
| * |
| * We determine the fraction of non-null rows, the average width, the |
| * most common values, the (estimated) number of distinct values, the |
| * distribution histogram. |
| */ |
| static void |
| merge_leaf_stats(VacAttrStatsP stats, |
| AnalyzeAttrFetchFunc fetchfunc, |
| int samplerows, |
| double totalrows) |
| { |
| List *all_children_list; |
| List *oid_list; |
| StdAnalyzeData *mystats = (StdAnalyzeData *) stats->extra_data; |
| int numPartitions; |
| |
| ListCell *lc; |
| float *relTuples; |
| float *nDistincts; |
| float *nMultiples; |
| int relNum; |
| float totalTuples = 0; |
| float nmultiple = 0; // number of values that appeared more than once |
| bool allDistinct = false; |
| int slot_idx = 0; |
| int sampleCount = 0; |
| Oid ltopr = mystats->ltopr; |
| Oid eqopr = mystats->eqopr; |
| |
| ereport(DEBUG2, |
| (errmsg("Merging leaf partition stats to calculate root partition stats : column %s", |
| get_attname(stats->attr->attrelid, stats->attr->attnum, false)))); |
| |
| /* |
| * Since we have acquired ShareUpdateExclusiveLock on the parent table when |
| * ANALYZE'ing it, we don't need extra lock to guard against concurrent DROP |
| * of either the parent or the child (which requries AccessExclusiveLock on |
| * the parent). |
| * Concurrent UPDATE is possible but because we are not updating the table |
| * ourselves, NoLock is sufficient here. |
| */ |
| all_children_list = find_all_inheritors(stats->attr->attrelid, NoLock, NULL); |
| SIMPLE_FAULT_INJECTOR("merge_leaf_stats_after_find_children"); |
| |
| oid_list = NIL; |
| foreach (lc, all_children_list) |
| { |
| Oid pkrelid = lfirst_oid(lc); |
| |
| /* skip intermediate partitions, we're only interested in leaves */ |
| if (get_rel_relkind(pkrelid) != RELKIND_RELATION) |
| continue; |
| |
| oid_list = lappend_oid(oid_list, pkrelid); |
| } |
| numPartitions = list_length(oid_list); |
| |
| relTuples = (float *) palloc0(sizeof(float) * numPartitions); |
| nDistincts = (float *) palloc0(sizeof(float) * numPartitions); |
| nMultiples = (float *) palloc0(sizeof(float) * numPartitions); |
| |
| relNum = 0; |
| foreach (lc, oid_list) |
| { |
| Oid pkrelid = lfirst_oid(lc); |
| |
| relTuples[relNum] = get_rel_reltuples(pkrelid); |
| totalTuples = totalTuples + relTuples[relNum]; |
| relNum++; |
| } |
| |
| if (totalTuples == 0.0) |
| return; |
| |
| MemoryContext old_context; |
| |
| HeapTuple *heaptupleStats = |
| (HeapTuple *) palloc(numPartitions * sizeof(HeapTuple)); |
| |
| // NDV calculations |
| float4 colAvgWidth = 0; |
| float4 nullCount = 0; |
| GpHLLCounter *hllcounters = (GpHLLCounter *) palloc0(numPartitions * sizeof(GpHLLCounter)); |
| GpHLLCounter *hllcounters_fullscan = (GpHLLCounter *) palloc0(numPartitions * sizeof(GpHLLCounter)); |
| GpHLLCounter *hllcounters_copy = (GpHLLCounter *) palloc0(numPartitions * sizeof(GpHLLCounter)); |
| |
| GpHLLCounter finalHLL = NULL; |
| GpHLLCounter finalHLLFull = NULL; |
| int i = 0; |
| double ndistinct = 0.0; |
| int fullhll_count = 0; |
| int samplehll_count = 0; |
| int totalhll_count = 0; |
| foreach (lc, oid_list) |
| { |
| Oid leaf_relid = lfirst_oid(lc); |
| int32 stawidth = 0; |
| float4 stanullfrac = 0.0; |
| |
| const char *attname = get_attname(stats->attr->attrelid, stats->attr->attnum, false); |
| |
| /* |
| * fetch_leaf_attnum and fetch_leaf_att_stats retrieve leaf partition |
| * table's pg_attribute tuple and pg_statistic tuple through index scan |
| * instead of system catalog cache. Since if using system catalog cache, |
| * the total tuple entries insert into the cache will up to: |
| * (number_of_leaf_tables * number_of_column_in_this_table) pg_attribute tuples |
| * + |
| * (number_of_leaf_tables * number_of_column_in_this_table) pg_statistic tuples |
| * which could use extremely large memroy in CacheMemoryContext. |
| * This happens when all of the leaf tables are analyzed. And the current function |
| * will execute for all columns. |
| * |
| * fetch_leaf_att_stats copy the original tuple, so remember to free it. |
| * |
| * As a side-effect, ANALYZE same root table serveral times in same session is much |
| * more slower than before since we don't rely on system catalog cache. |
| * |
| * But we still using the tuple descriptor in system catalog cache to retrieve |
| * attribute in fetched tuples. See get_attstatsslot. |
| */ |
| AttrNumber child_attno = fetch_leaf_attnum(leaf_relid, attname); |
| heaptupleStats[i] = fetch_leaf_att_stats(leaf_relid, child_attno); |
| |
| // if there is no colstats, we can skip this partition's stats |
| if (!HeapTupleIsValid(heaptupleStats[i])) |
| { |
| i++; |
| continue; |
| } |
| |
| stawidth = ((Form_pg_statistic) GETSTRUCT(heaptupleStats[i]))->stawidth; |
| stanullfrac = ((Form_pg_statistic) GETSTRUCT(heaptupleStats[i]))->stanullfrac; |
| colAvgWidth = colAvgWidth + (stawidth > 0 ? stawidth : 0) * relTuples[i]; |
| nullCount = nullCount + (stanullfrac > 0.0 ? stanullfrac : 0.0) * relTuples[i]; |
| |
| AttStatsSlot hllSlot; |
| |
| (void) get_attstatsslot(&hllSlot, heaptupleStats[i], STATISTIC_KIND_FULLHLL, |
| InvalidOid, ATTSTATSSLOT_VALUES); |
| |
| if (hllSlot.nvalues > 0) |
| { |
| hllcounters_fullscan[i] = (GpHLLCounter) DatumGetByteaP(hllSlot.values[0]); |
| GpHLLCounter finalHLLFull_intermediate = finalHLLFull; |
| finalHLLFull = gp_hyperloglog_merge_counters(finalHLLFull_intermediate, hllcounters_fullscan[i]); |
| if (NULL != finalHLLFull_intermediate) |
| { |
| pfree(finalHLLFull_intermediate); |
| } |
| free_attstatsslot(&hllSlot); |
| fullhll_count++; |
| totalhll_count++; |
| } |
| |
| (void) get_attstatsslot(&hllSlot, heaptupleStats[i], STATISTIC_KIND_HLL, |
| InvalidOid, ATTSTATSSLOT_VALUES); |
| |
| if (hllSlot.nvalues > 0) |
| { |
| hllcounters[i] = (GpHLLCounter) DatumGetByteaP(hllSlot.values[0]); |
| nDistincts[i] = (float) hllcounters[i]->ndistinct; |
| nMultiples[i] = (float) hllcounters[i]->nmultiples; |
| sampleCount += hllcounters[i]->samplerows; |
| hllcounters_copy[i] = gp_hll_copy(hllcounters[i]); |
| GpHLLCounter finalHLL_intermediate = finalHLL; |
| finalHLL = gp_hyperloglog_merge_counters(finalHLL_intermediate, hllcounters[i]); |
| if (NULL != finalHLL_intermediate) |
| { |
| pfree(finalHLL_intermediate); |
| } |
| free_attstatsslot(&hllSlot); |
| samplehll_count++; |
| totalhll_count++; |
| } |
| i++; |
| } |
| |
| if (totalhll_count == 0) |
| { |
| /* |
| * If neither HLL nor HLL Full scan stats are available, |
| * continue merging stats based on the defaults, instead |
| * of reading them from HLL counter. |
| */ |
| } |
| else |
| { |
| /* |
| * If all partitions have HLL full scan counters, |
| * merge root NDV's based on leaf partition HLL full scan |
| * counter |
| */ |
| if (fullhll_count == totalhll_count) |
| { |
| ndistinct = gp_hyperloglog_estimate(finalHLLFull); |
| pfree(finalHLLFull); |
| /* |
| * For fullscan the ndistinct is calculated based on the entire table scan |
| * so if it's within the marginal error, we consider everything as distinct, |
| * else the ndistinct value will provide the actual value and we do not , |
| * need to do any additional calculation for the nmultiple |
| */ |
| if ((fabs(totalTuples - ndistinct) / (float) totalTuples) < GP_HLL_ERROR_MARGIN) |
| { |
| allDistinct = true; |
| } |
| nmultiple = ndistinct; |
| } |
| /* |
| * Else if all partitions have HLL counter based on sampled data, |
| * merge root NDV's based on leaf partition HLL counter on |
| * sampled data |
| */ |
| else if (finalHLL != NULL && samplehll_count == totalhll_count) |
| { |
| ndistinct = gp_hyperloglog_estimate(finalHLL); |
| pfree(finalHLL); |
| /* |
| * For sampled HLL counter, the ndistinct calculated is based on the |
| * sampled data. We consider everything distinct if the ndistinct |
| * calculated is within marginal error, else we need to calculate |
| * the number of distinct values for the table based on the estimator |
| * proposed by Haas and Stokes, used later in the code. |
| */ |
| if ((fabs(sampleCount - ndistinct) / (float) sampleCount) < GP_HLL_ERROR_MARGIN) |
| { |
| allDistinct = true; |
| } |
| else |
| { |
| /* |
| * The gp_hyperloglog_estimate() utility merges the number of |
| * distnct values accurately, but for the NDV estimator used later |
| * in the code, we also need additional information for nmultiples, |
| * i.e., the number of values that appeared more than once. |
| * At this point we have the information for nmultiples for each |
| * partition, but the nmultiples in one partition can be accounted as |
| * a distinct value in some other partition. In order to merge the |
| * approximate nmultiples better, we extract unique values in each |
| * partition as follows, |
| * P1 -> ndistinct1 , nmultiple1 |
| * P2 -> ndistinct2 , nmultiple2 |
| * P3 -> ndistinct3 , nmultiple3 |
| * Root -> ndistinct(Root) (using gp_hyperloglog_estimate) |
| * nunique1 = ndistinct(Root) - gp_hyperloglog_estimate(P2 & P3) |
| * nunique2 = ndistinct(Root) - gp_hyperloglog_estimate(P1 & P3) |
| * nunique3 = ndistinct(Root) - gp_hyperloglog_estimate(P2 & P1) |
| * And finally once we have unique values in individual partitions, |
| * we can get the nmultiples on the ROOT as seen below, |
| * nmultiple(Root) = ndistinct(Root) - (sum of uniques in each partition) |
| */ |
| /* |
| * hllcounters_left array stores the merged hll result of all the |
| * hll counters towards the left of index i and excluding the hll |
| * counter at index i |
| */ |
| GpHLLCounter *hllcounters_left = (GpHLLCounter *) palloc0(numPartitions * sizeof(GpHLLCounter)); |
| |
| /* |
| * hllcounters_right array stores the merged hll result of all the |
| * hll counters towards the right of index i and excluding the hll |
| * counter at index i |
| */ |
| GpHLLCounter *hllcounters_right = (GpHLLCounter *) palloc0(numPartitions * sizeof(GpHLLCounter)); |
| |
| hllcounters_left[0] = gp_hyperloglog_init_def(); |
| hllcounters_right[numPartitions - 1] = gp_hyperloglog_init_def(); |
| |
| /* |
| * The following loop populates the left and right array by accumulating the merged |
| * result of all the hll counters towards the left/right of the given index i excluding |
| * the counter at index i. |
| * Note that there might be empty values for some partitions, in which case the |
| * corresponding element in the left/right arrays will simply be the value |
| * of its neighbor. |
| * For E.g If the hllcounters_copy array is 1, null, 2, 3, null, 4 |
| * the left and right arrays will be as follows: |
| * hllcounters_left: default, 1, 1, (1,2), (1,2,3), (1,2,3) |
| * hllcounters_right: (2,3,4), (2,3,4), (3,4), 4, 4, default |
| */ |
| /* |
| * The first and the last element in the left and right arrays |
| * are default values since there is no element towards |
| * the left or right of them |
| */ |
| for (i = 1; i < numPartitions; i++) |
| { |
| /* populate left array */ |
| if (nDistincts[i - 1] == 0) |
| { |
| hllcounters_left[i] = gp_hll_copy(hllcounters_left[i - 1]); |
| } |
| else |
| { |
| GpHLLCounter hllcounter_temp1 = gp_hll_copy(hllcounters_copy[i - 1]); |
| GpHLLCounter hllcounter_temp2 = gp_hll_copy(hllcounters_left[i - 1]); |
| hllcounters_left[i] = gp_hyperloglog_merge_counters(hllcounter_temp1, hllcounter_temp2); |
| pfree(hllcounter_temp1); |
| pfree(hllcounter_temp2); |
| } |
| |
| /* populate right array */ |
| if (nDistincts[numPartitions - i] == 0) |
| { |
| hllcounters_right[numPartitions - i - 1] = gp_hll_copy(hllcounters_right[numPartitions - i]); |
| } |
| else |
| { |
| GpHLLCounter hllcounter_temp1 = gp_hll_copy(hllcounters_copy[numPartitions - i]); |
| GpHLLCounter hllcounter_temp2 = gp_hll_copy(hllcounters_right[numPartitions - i]); |
| hllcounters_right[numPartitions - i - 1] = gp_hyperloglog_merge_counters(hllcounter_temp1, hllcounter_temp2); |
| pfree(hllcounter_temp1); |
| pfree(hllcounter_temp2); |
| } |
| } |
| |
| int nUnique = 0; |
| for (i = 0; i < numPartitions; i++) |
| { |
| /* Skip if statistics are missing for the partition */ |
| if (nDistincts[i] == 0) |
| continue; |
| |
| GpHLLCounter hllcounter_temp1 = gp_hll_copy(hllcounters_left[i]); |
| GpHLLCounter hllcounter_temp2 = gp_hll_copy(hllcounters_right[i]); |
| GpHLLCounter final = NULL; |
| final = gp_hyperloglog_merge_counters(hllcounter_temp1, hllcounter_temp2); |
| |
| pfree(hllcounter_temp1); |
| pfree(hllcounter_temp2); |
| |
| if (final != NULL) |
| { |
| float nUniques = ndistinct - gp_hyperloglog_estimate(final); |
| nUnique += nUniques; |
| nmultiple += nMultiples[i] * (nUniques / nDistincts[i]); |
| pfree(final); |
| } |
| else |
| { |
| nUnique = ndistinct; |
| break; |
| } |
| } |
| |
| // nmultiples for the ROOT |
| nmultiple += ndistinct - nUnique; |
| |
| if (nmultiple < 0) |
| nmultiple = 0; |
| |
| pfree(hllcounters_left); |
| pfree(hllcounters_right); |
| } |
| } |
| else |
| { |
| /* Else error out due to incompatible leaf HLL counter merge */ |
| pfree(hllcounters); |
| pfree(hllcounters_fullscan); |
| pfree(hllcounters_copy); |
| pfree(nDistincts); |
| pfree(nMultiples); |
| |
| ereport(ERROR, |
| (errmsg("ANALYZE cannot merge since not all non-empty leaf partitions have consistent hyperloglog statistics for merge"), |
| errhint("Re-run ANALYZE or ANALYZE FULLSCAN"))); |
| } |
| } |
| pfree(hllcounters); |
| pfree(hllcounters_fullscan); |
| pfree(hllcounters_copy); |
| pfree(nDistincts); |
| pfree(nMultiples); |
| |
| if (allDistinct || (!OidIsValid(eqopr) && !OidIsValid(ltopr))) |
| { |
| /* If we found no repeated values, assume it's a unique column */ |
| ndistinct = -1.0; |
| } |
| else if ((int) nmultiple >= (int) ndistinct) |
| { |
| /* |
| * Every value in the sample appeared more than once. Assume the |
| * column has just these values. |
| */ |
| } |
| else |
| { |
| /*---------- |
| * Estimate the number of distinct values using the estimator |
| * proposed by Haas and Stokes in IBM Research Report RJ 10025: |
| * n*d / (n - f1 + f1*n/N) |
| * where f1 is the number of distinct values that occurred |
| * exactly once in our sample of n rows (from a total of N), |
| * and d is the total number of distinct values in the sample. |
| * This is their Duj1 estimator; the other estimators they |
| * recommend are considerably more complex, and are numerically |
| * very unstable when n is much smaller than N. |
| * |
| * Overwidth values are assumed to have been distinct. |
| *---------- |
| */ |
| int f1 = ndistinct - nmultiple; |
| int d = f1 + nmultiple; |
| double numer, denom, stadistinct; |
| |
| numer = (double) sampleCount * (double) d; |
| |
| denom = (double) (sampleCount - f1) + |
| (double) f1 * (double) sampleCount / totalTuples; |
| |
| stadistinct = numer / denom; |
| /* Clamp to sane range in case of roundoff error */ |
| if (stadistinct < (double) d) |
| stadistinct = (double) d; |
| if (stadistinct > totalTuples) |
| stadistinct = totalTuples; |
| ndistinct = floor(stadistinct + 0.5); |
| } |
| |
| ndistinct = round(ndistinct); |
| if (ndistinct > 0.1 * totalTuples) |
| ndistinct = -(ndistinct / totalTuples); |
| |
| // finalize NDV calculation |
| stats->stadistinct = ndistinct; |
| stats->stats_valid = true; |
| stats->stawidth = colAvgWidth / totalTuples; |
| stats->stanullfrac = (float4) nullCount / (float4) totalTuples; |
| |
| // MCV calculations |
| MCVFreqPair **mcvpairArray = NULL; |
| int rem_mcv = 0; |
| int num_mcv = 0; |
| if (ndistinct > -1 && OidIsValid(eqopr)) |
| { |
| if (ndistinct < 0) |
| { |
| ndistinct = -ndistinct * totalTuples; |
| } |
| |
| old_context = MemoryContextSwitchTo(stats->anl_context); |
| |
| void *resultMCV[2]; |
| |
| mcvpairArray = aggregate_leaf_partition_MCVs( |
| stats->attr->attrelid, stats->attr->attnum, |
| numPartitions, heaptupleStats, relTuples, |
| stats->attr->attstattarget, ndistinct, &num_mcv, &rem_mcv, |
| resultMCV); |
| MemoryContextSwitchTo(old_context); |
| |
| if (num_mcv > 0) |
| { |
| stats->stakind[slot_idx] = STATISTIC_KIND_MCV; |
| stats->staop[slot_idx] = mystats->eqopr; |
| stats->stavalues[slot_idx] = (Datum *) resultMCV[0]; |
| stats->numvalues[slot_idx] = num_mcv; |
| stats->stanumbers[slot_idx] = (float4 *) resultMCV[1]; |
| stats->numnumbers[slot_idx] = num_mcv; |
| slot_idx++; |
| } |
| } |
| |
| // Histogram calculation |
| if (OidIsValid(eqopr) && OidIsValid(ltopr)) |
| { |
| old_context = MemoryContextSwitchTo(stats->anl_context); |
| |
| void *resultHistogram[1]; |
| int num_hist = aggregate_leaf_partition_histograms( |
| stats->attr->attrelid, stats->attr->attnum, |
| numPartitions, heaptupleStats, relTuples, |
| stats->attr->attstattarget, mcvpairArray + num_mcv, |
| rem_mcv, resultHistogram); |
| MemoryContextSwitchTo(old_context); |
| if (num_hist > 0) |
| { |
| stats->stakind[slot_idx] = STATISTIC_KIND_HISTOGRAM; |
| stats->staop[slot_idx] = mystats->ltopr; |
| stats->stavalues[slot_idx] = (Datum *) resultHistogram[0]; |
| stats->numvalues[slot_idx] = num_hist; |
| slot_idx++; |
| } |
| } |
| |
| // ndistinct by segments calculation |
| { |
| old_context = MemoryContextSwitchTo(stats->anl_context); |
| bool valid; |
| double ndinstinct_by_segs = 0; |
| Datum *ndvbs; |
| |
| valid = aggregate_leaf_partition_ndvbs( |
| numPartitions, heaptupleStats, relTuples, &ndinstinct_by_segs); |
| |
| if (valid) |
| { |
| ndvbs = (Datum *) palloc(sizeof(Datum)); |
| ndvbs[0] = Float8GetDatum(ndinstinct_by_segs); |
| |
| stats->stakind[slot_idx] = STATISTIC_KIND_NDV_BY_SEGMENTS; |
| stats->staop[slot_idx] = mystats->ltopr; |
| |
| stats->statypid[slot_idx] = FLOAT8OID; |
| stats->statyplen[slot_idx] = 8; |
| stats->statypbyval[slot_idx] = true; |
| stats->statypalign[slot_idx] = TYPALIGN_DOUBLE; |
| |
| stats->stavalues[slot_idx] = ndvbs; |
| stats->numvalues[slot_idx] = 1; |
| slot_idx++; |
| } |
| MemoryContextSwitchTo(old_context); |
| } |
| |
| for (i = 0; i < numPartitions; i++) |
| { |
| if (HeapTupleIsValid(heaptupleStats[i])) |
| heap_freetuple(heaptupleStats[i]); |
| } |
| if (num_mcv > 0) |
| pfree(mcvpairArray); |
| pfree(heaptupleStats); |
| pfree(relTuples); |
| } |
| |
| /* |
| * qsort_arg comparator for sorting ScalarItems |
| * |
| * Aside from sorting the items, we update the tupnoLink[] array |
| * whenever two ScalarItems are found to contain equal datums. The array |
| * is indexed by tupno; for each ScalarItem, it contains the highest |
| * tupno that that item's datum has been found to be equal to. This allows |
| * us to avoid additional comparisons in compute_scalar_stats(). |
| */ |
| static int |
| compare_scalars(const void *a, const void *b, void *arg) |
| { |
| Datum da = ((const ScalarItem *) a)->value; |
| int ta = ((const ScalarItem *) a)->tupno; |
| Datum db = ((const ScalarItem *) b)->value; |
| int tb = ((const ScalarItem *) b)->tupno; |
| CompareScalarsContext *cxt = (CompareScalarsContext *) arg; |
| int compare; |
| |
| compare = ApplySortComparator(da, false, db, false, cxt->ssup); |
| if (compare != 0) |
| return compare; |
| |
| /* |
| * The two datums are equal, so update cxt->tupnoLink[]. |
| */ |
| if (cxt->tupnoLink[ta] < tb) |
| cxt->tupnoLink[ta] = tb; |
| if (cxt->tupnoLink[tb] < ta) |
| cxt->tupnoLink[tb] = ta; |
| |
| /* |
| * For equal datums, sort by tupno |
| */ |
| return ta - tb; |
| } |
| |
| /* |
| * qsort comparator for sorting ScalarMCVItems by position |
| */ |
| static int |
| compare_mcvs(const void *a, const void *b) |
| { |
| int da = ((const ScalarMCVItem *) a)->first; |
| int db = ((const ScalarMCVItem *) b)->first; |
| |
| return da - db; |
| } |
| |
| /* |
| * Analyze the list of common values in the sample and decide how many are |
| * worth storing in the table's MCV list. |
| * |
| * mcv_counts is assumed to be a list of the counts of the most common values |
| * seen in the sample, starting with the most common. The return value is the |
| * number that are significantly more common than the values not in the list, |
| * and which are therefore deemed worth storing in the table's MCV list. |
| */ |
| static int |
| analyze_mcv_list(int *mcv_counts, |
| int num_mcv, |
| double stadistinct, |
| double stanullfrac, |
| int samplerows, |
| double totalrows) |
| { |
| double ndistinct_table; |
| double sumcount; |
| int i; |
| |
| /* |
| * If the entire table was sampled, keep the whole list. This also |
| * protects us against division by zero in the code below. |
| */ |
| if (samplerows == totalrows || totalrows <= 1.0) |
| return num_mcv; |
| |
| /* Re-extract the estimated number of distinct nonnull values in table */ |
| ndistinct_table = stadistinct; |
| if (ndistinct_table < 0) |
| ndistinct_table = -ndistinct_table * totalrows; |
| |
| /* |
| * Exclude the least common values from the MCV list, if they are not |
| * significantly more common than the estimated selectivity they would |
| * have if they weren't in the list. All non-MCV values are assumed to be |
| * equally common, after taking into account the frequencies of all the |
| * values in the MCV list and the number of nulls (c.f. eqsel()). |
| * |
| * Here sumcount tracks the total count of all but the last (least common) |
| * value in the MCV list, allowing us to determine the effect of excluding |
| * that value from the list. |
| * |
| * Note that we deliberately do this by removing values from the full |
| * list, rather than starting with an empty list and adding values, |
| * because the latter approach can fail to add any values if all the most |
| * common values have around the same frequency and make up the majority |
| * of the table, so that the overall average frequency of all values is |
| * roughly the same as that of the common values. This would lead to any |
| * uncommon values being significantly overestimated. |
| */ |
| sumcount = 0.0; |
| for (i = 0; i < num_mcv - 1; i++) |
| sumcount += mcv_counts[i]; |
| |
| while (num_mcv > 0) |
| { |
| double selec, |
| otherdistinct, |
| N, |
| n, |
| K, |
| variance, |
| stddev; |
| |
| /* |
| * Estimated selectivity the least common value would have if it |
| * wasn't in the MCV list (c.f. eqsel()). |
| */ |
| selec = 1.0 - sumcount / samplerows - stanullfrac; |
| if (selec < 0.0) |
| selec = 0.0; |
| if (selec > 1.0) |
| selec = 1.0; |
| otherdistinct = ndistinct_table - (num_mcv - 1); |
| if (otherdistinct > 1) |
| selec /= otherdistinct; |
| |
| /* |
| * If the value is kept in the MCV list, its population frequency is |
| * assumed to equal its sample frequency. We use the lower end of a |
| * textbook continuity-corrected Wald-type confidence interval to |
| * determine if that is significantly more common than the non-MCV |
| * frequency --- specifically we assume the population frequency is |
| * highly likely to be within around 2 standard errors of the sample |
| * frequency, which equates to an interval of 2 standard deviations |
| * either side of the sample count, plus an additional 0.5 for the |
| * continuity correction. Since we are sampling without replacement, |
| * this is a hypergeometric distribution. |
| * |
| * XXX: Empirically, this approach seems to work quite well, but it |
| * may be worth considering more advanced techniques for estimating |
| * the confidence interval of the hypergeometric distribution. |
| */ |
| N = totalrows; |
| n = samplerows; |
| K = N * mcv_counts[num_mcv - 1] / n; |
| variance = n * K * (N - K) * (N - n) / (N * N * (N - 1)); |
| stddev = sqrt(variance); |
| |
| if (mcv_counts[num_mcv - 1] > selec * samplerows + 2 * stddev + 0.5) |
| { |
| /* |
| * The value is significantly more common than the non-MCV |
| * selectivity would suggest. Keep it, and all the other more |
| * common values in the list. |
| */ |
| break; |
| } |
| else |
| { |
| /* Discard this value and consider the next least common value */ |
| num_mcv--; |
| if (num_mcv == 0) |
| break; |
| sumcount -= mcv_counts[num_mcv - 1]; |
| } |
| } |
| return num_mcv; |
| } |
| |
| /* |
| * calculate correlations use the weighted mean algorithm. |
| * |
| * the formula for calculating the weighted mean is: |
| * sum(correlationOnSeg[i] * (totalRowsOnSeg[i] / totalRows)) |
| * i is from 0 to N. N is the number of segments. |
| */ |
| static void |
| calculate_correlation_use_weighted_mean(CdbPgResults *cdb_pgresults, |
| Relation onerel, |
| TupleDesc tupleDesc, |
| float4 *correlations, |
| bool *correlationsIsNull, |
| int live_natts) |
| { |
| TupleDesc relDesc = RelationGetDescr(onerel); |
| int attNum = relDesc->natts; |
| int segmentNum = cdb_pgresults->numResults; |
| |
| /* |
| * totalRowsOnSeg, correlationOnSeg and correlationIsNullOnSeg are One-Dimensional arrays |
| * to store infos of the following results. |
| * |
| * | seg0 | seg1 | seg2 |
| * -------------------------------- |
| * att0| 0*3+0 | 0*3+1 | 0*3+2 |
| * att1| 1*3+0 | 1*3+1 | 1*3+2 |
| * att2| 2*3+0 | 2*3+1 | 2*3+2 |
| * |
| * We can use attno * segmentNum + segno as index to access the |
| * info of column attno on segment segno. |
| * index from attno * segmentNum to attno * segmentNum + segmentNum -1 store infos |
| * of column attno. |
| */ |
| int *totalRowsOnSeg = (int *)palloc0(sizeof(int) * segmentNum * attNum); |
| float4 *correlationOnSeg = (float4 *)palloc0(sizeof(float4) * segmentNum * attNum); |
| bool *correlationIsNullOnSeg = (bool *)palloc0(sizeof(bool) * segmentNum * attNum); |
| int *totalRows = (int *)palloc0(sizeof(int) * attNum); |
| char **funcRetValues = (char **) palloc0(tupleDesc->natts * sizeof(char *)); |
| bool *funcRetNulls = (bool *) palloc(tupleDesc->natts * sizeof(bool)); |
| |
| for (int segno = 0; segno < segmentNum; segno++) |
| { |
| int rows, nfields; |
| float4 correlationValue; |
| int attno; |
| struct pg_result *pgresult = cdb_pgresults->pg_results[segno]; |
| int index; |
| ExecStatusType status = PQresultStatus(pgresult); |
| if (status != PGRES_TUPLES_OK) |
| { |
| cdbdisp_clearCdbPgResults(cdb_pgresults); |
| ereport(ERROR, |
| (errmsg("unexpected result from segment: %d", |
| status))); |
| } |
| /* |
| * gp_acquire_correlations returns a result for each alive columns. |
| */ |
| rows = PQntuples(pgresult); |
| nfields = PQnfields(pgresult); |
| if (rows != live_natts || nfields != 1) |
| { |
| cdbdisp_clearCdbPgResults(cdb_pgresults); |
| ereport(ERROR, |
| (errmsg("unexpected shape of result from segment (%d rows, %d cols)", |
| rows, nfields))); |
| } |
| for (int j = 0; j < rows; j++) |
| { |
| char *rowStr = PQgetvalue(pgresult, j, 0); |
| if (rowStr == NULL) |
| elog(ERROR, "got NULL pointer from return value of gp_acquire_correlations"); |
| |
| parse_record_to_string(rowStr, tupleDesc, funcRetValues, funcRetNulls); |
| |
| /* get the first column: attno, and it is not NULL */ |
| attno = DatumGetInt32(DirectFunctionCall1(int4in, |
| CStringGetDatum(funcRetValues[0]))); |
| index = attno * segmentNum + segno; |
| correlationIsNullOnSeg[index] = true; |
| /* get the second column: correlation, and it maybe NULl */ |
| if (!funcRetNulls[1]) |
| { |
| correlationValue = DatumGetFloat4(DirectFunctionCall1(float4in, |
| CStringGetDatum(funcRetValues[1]))); |
| correlationsIsNull[attno] = false; |
| |
| correlationOnSeg[index] = correlationValue; |
| correlationIsNullOnSeg[index] = false; |
| } |
| /* get the third column: totalrow on segment i, this is not NULL. |
| * if correlation for this column is NULL, the totalrow is 0. |
| */ |
| totalRowsOnSeg[index] = DatumGetInt32(DirectFunctionCall1(int4in, |
| CStringGetDatum(funcRetValues[2]))); |
| totalRows[attno] += totalRowsOnSeg[index]; |
| } |
| } |
| |
| /* |
| * Calculate overall correlation from correlation on each segments |
| * we use weighted average algorithm to calculate correlation to |
| * better handle skewed data between segments. |
| */ |
| for (int attno = 0; attno < attNum; attno++) |
| { |
| Form_pg_attribute relatt = TupleDescAttr(relDesc, attno); |
| |
| if (relatt->attisdropped) |
| continue; |
| float4 weight; |
| for (int segno = 0; segno < segmentNum; segno++) |
| { |
| int index = attno * segmentNum + segno; |
| if (!correlationIsNullOnSeg[index]) |
| { |
| weight = (float4)totalRowsOnSeg[index] / totalRows[attno]; |
| correlations[attno] += correlationOnSeg[index] * weight; |
| } |
| } |
| } |
| |
| pfree(totalRowsOnSeg); |
| pfree(correlationOnSeg); |
| pfree(correlationIsNullOnSeg); |
| pfree(totalRows); |
| for (int i = 0; i < tupleDesc->natts; i++) |
| { |
| if (funcRetValues[i]) |
| pfree(funcRetValues[i]); |
| } |
| pfree(funcRetValues); |
| pfree(funcRetNulls); |
| } |
| |
| /* |
| * calculate correlations use the mean algorithm. |
| * |
| * In some situations, we may not be able to obtain reltuples of a table, |
| * such as none-leaf part of partitioned table or the parent table of the |
| * inherited table. So we can not use the weighted mean algorithm. |
| * |
| * the formula for calculating the mean is: |
| * sum(correlationOnSeg) / count(corrNotNullSeg) |
| */ |
| static void |
| calculate_correlation_use_mean(CdbPgResults *cdb_pgresults, |
| Relation onerel, |
| TupleDesc tupleDesc, |
| float4 *correlations, |
| bool *correlationsIsNull, |
| int live_natts) |
| { |
| TupleDesc relDesc = RelationGetDescr(onerel); |
| int attNum = relDesc->natts; |
| int segmentNum = cdb_pgresults->numResults; |
| /* |
| * corrNotNullSegNum[attno] stores the number of segments for which |
| * the correlation of column attno is not null. |
| */ |
| int *corrNotNullSegNum = (int *)palloc0(sizeof(int) * attNum); |
| char **funcRetValues = (char **) palloc0(tupleDesc->natts * sizeof(char *)); |
| bool *funcRetNulls = (bool *) palloc(tupleDesc->natts * sizeof(bool)); |
| |
| for (int segno = 0; segno < segmentNum; segno++) |
| { |
| int ntuples, nfields; |
| float4 correlationValue; |
| int attno; |
| struct pg_result *pgresult = cdb_pgresults->pg_results[segno]; |
| ExecStatusType status = PQresultStatus(pgresult); |
| if (status != PGRES_TUPLES_OK) |
| { |
| cdbdisp_clearCdbPgResults(cdb_pgresults); |
| ereport(ERROR, |
| (errmsg("unexpected result from segment: %d", |
| status))); |
| } |
| /* |
| * gp_acquire_correlations returns a result for each alive columns. |
| */ |
| ntuples = PQntuples(pgresult); |
| nfields = PQnfields(pgresult); |
| if (ntuples != live_natts || nfields != 1) |
| { |
| cdbdisp_clearCdbPgResults(cdb_pgresults); |
| ereport(ERROR, |
| (errmsg("unexpected shape of result from segment (%d rows, %d cols)", |
| ntuples, nfields))); |
| } |
| for (int j = 0; j < ntuples; j++) |
| { |
| char *rowStr = PQgetvalue(pgresult, j, 0); |
| if (rowStr == NULL) |
| elog(ERROR, "got NULL pointer from return value of gp_acquire_correlations"); |
| |
| parse_record_to_string(rowStr, tupleDesc, funcRetValues, funcRetNulls); |
| /* get the first column: attno, and it is not NULL */ |
| attno = DatumGetInt32(DirectFunctionCall1(int4in, |
| CStringGetDatum(funcRetValues[0]))); |
| /* get the second column: correlation, and it maybe NULl */ |
| if (!funcRetNulls[1]) |
| { |
| correlationValue = DatumGetFloat4(DirectFunctionCall1(float4in, |
| CStringGetDatum(funcRetValues[1]))); |
| correlationsIsNull[attno] = false; |
| correlations[attno] += correlationValue; |
| corrNotNullSegNum[attno]++; |
| } |
| } |
| } |
| |
| /* |
| * Calculate overall correlation from correlation on each segments |
| * we use the mean algorithm to calculate correlation. |
| */ |
| for (int attno = 0; attno < attNum; attno++) |
| { |
| Form_pg_attribute relatt = TupleDescAttr(relDesc, attno); |
| |
| if (relatt->attisdropped) |
| continue; |
| if (!correlationsIsNull[attno]) |
| { |
| correlations[attno] = correlations[attno] / corrNotNullSegNum[attno]; |
| } |
| } |
| |
| pfree(corrNotNullSegNum); |
| for (int i = 0; i < tupleDesc->natts; i++) |
| { |
| if (funcRetValues[i]) |
| pfree(funcRetValues[i]); |
| } |
| pfree(funcRetValues); |
| pfree(funcRetNulls); |
| } |
| |
| /* |
| * We cannot use the same method as PostgreSQL does to calculate the |
| * correlation in QD. When we collect data from segments to QD, this |
| * will change the physical order of the data. such as in segment 1, |
| * the data is 1,3,5,7,9. And in segment 2, the data is 2,4,6,8,10. |
| * In each segment the data is ordered, and correlation is 1 in each segment. |
| * But after we collect the data to QD, it may be 1,3,5,2,4,7,9,6,8,10. |
| * And the correlation is 0.3 or something else and it is not stable. |
| * And this will increase the cost of index scan which is shouldn't be done. |
| * |
| * So get correlations from segments and summarize them. |
| */ |
| static void |
| gp_acquire_correlations_dispatcher(Oid relOid, bool inh, float4 *correlations, bool *correlationsIsNull) |
| { |
| CdbPgResults cdb_pgresults = {NULL, 0}; |
| char *sql; |
| Relation onerel; |
| int attNum; |
| int live_natts; |
| TupleDesc relDesc; |
| /* |
| * For child table of inherited tables and leaf table of partitioned table, |
| * We use the weighted mean average to calculate the correlation collected |
| * from the segments. |
| * the formula for calculating the weighted mean is: |
| * sum(correlationOnSeg[i] * (totalRowsOnSeg[i] / totalRows)) |
| * i is from 0 to N. N is the number of segments. |
| * |
| * However, since reltuples of none-leaf part of partitioned table and the |
| * parent table of the inherited table is 0, we can only use the average |
| * to calculate correlation. |
| */ |
| bool useWeightedMean = !inh; |
| sql = psprintf("select pg_catalog.gp_acquire_correlations(%d, '%s');", relOid, inh ? "t" : "f"); |
| |
| Assert(Gp_role == GP_ROLE_DISPATCH); |
| onerel = table_open(relOid, AccessShareLock); |
| relDesc = RelationGetDescr(onerel); |
| attNum = relDesc->natts; |
| |
| /* dispatch sql to segments */ |
| CdbDispatchCommand(sql, DF_WITH_SNAPSHOT, &cdb_pgresults); |
| |
| /* Count the number of non-dropped cols */ |
| live_natts = 0; |
| for (int attno = 0; attno < attNum; attno++) |
| { |
| Form_pg_attribute relatt = TupleDescAttr(relDesc, attno); |
| correlationsIsNull[attno] = true; |
| |
| if (relatt->attisdropped) |
| continue; |
| live_natts++; |
| } |
| |
| /* construct tupleDesc for RECORD results of gp_acquire_correlations */ |
| TupleDesc tupleDesc = CreateTemplateTupleDesc(3); |
| TupleDescInitEntry(tupleDesc, |
| 1, |
| "attnum", |
| INT4OID, |
| -1, |
| 0); |
| TupleDescInitEntry(tupleDesc, |
| 2, |
| "correlation", |
| FLOAT4OID, |
| -1, |
| 0); |
| TupleDescInitEntry(tupleDesc, |
| 3, |
| "totalrows", |
| INT4OID, |
| -1, |
| 0); |
| |
| /* For RECORD results, make sure a typmod has been assigned */ |
| Assert(tupleDesc->tdtypeid == RECORDOID && tupleDesc->tdtypmod < 0); |
| assign_record_type_typmod(tupleDesc); |
| |
| /* calculate correlations by cdb_pgresults */ |
| if (useWeightedMean) |
| calculate_correlation_use_weighted_mean(&cdb_pgresults, |
| onerel, |
| tupleDesc, |
| correlations, |
| correlationsIsNull, |
| live_natts); |
| else |
| calculate_correlation_use_mean(&cdb_pgresults, |
| onerel, |
| tupleDesc, |
| correlations, |
| correlationsIsNull, |
| live_natts); |
| |
| cdbdisp_clearCdbPgResults(&cdb_pgresults); |
| table_close(onerel, AccessShareLock); |
| } |