| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| /*------------------------------------------------------------------------- |
| * |
| * analyze.c |
| * the Postgres statistics generator |
| * |
| * Portions Copyright (c) 2005-2010, Greenplum inc |
| * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * The new ANALYZE implementation uses routine query processing |
| * capabilities of GPDB to compute statistics on relations. It |
| * differs from old ANALYZE functionally in that it computes global |
| * statistics which improves cardinalities of join/group-by |
| * operations. |
| * It uses SPI under the covers to issue queries. The main point of |
| * entry is the function analyzeStatement. This function does the |
| * heavy lifting of setting up transactions and getting locks. |
| * |
| * IDENTIFICATION |
| * $PostgreSQL: pgsql/src/backend/commands/analyze.c,v 1.101 2006/11/05 22:42:08 tgl Exp $ |
| * |
| *------------------------------------------------------------------------- |
| */ |
| #include "postgres.h" |
| |
| #include <math.h> |
| |
| #include "access/pxfuriparser.h" |
| #include "access/heapam.h" |
| #include "access/hd_work_mgr.h" |
| #include "access/pxfanalyze.h" |
| #include "catalog/catquery.h" |
| #include "catalog/heap.h" |
| #include "access/transam.h" |
| #include "access/tuptoaster.h" |
| #include "access/aosegfiles.h" |
| #include "access/parquetsegfiles.h" |
| #include "access/hash.h" |
| #include "access/xact.h" |
| #include "catalog/index.h" |
| #include "catalog/indexing.h" |
| #include "catalog/namespace.h" |
| #include "catalog/catalog.h" |
| #include "catalog/pg_exttable.h" |
| #include "cdb/cdbappendonlyam.h" |
| #include "cdb/cdbpartition.h" |
| #include "cdb/cdbheap.h" |
| #include "cdb/cdbhash.h" |
| #include "commands/vacuum.h" |
| #include "commands/dbcommands.h" |
| #include "executor/executor.h" |
| #include "lib/stringinfo.h" |
| #include "libpq/pqformat.h" /* pq_beginmessage() etc. */ |
| #include "miscadmin.h" |
| #include "parser/parse_expr.h" |
| #include "parser/parse_oper.h" |
| #include "parser/parse_relation.h" |
| #include "postmaster/identity.h" |
| #include "pgstat.h" |
| #include "utils/acl.h" |
| #include "utils/datum.h" |
| #include "utils/faultinjector.h" |
| #include "utils/lsyscache.h" |
| #include "utils/memutils.h" |
| #include "utils/syscache.h" |
| #include "utils/tuplesort.h" |
| #include "utils/palloc.h" |
| #include "utils/pg_locale.h" |
| #include "utils/builtins.h" |
| #include "utils/inval.h" |
| #include "utils/uri.h" |
| #include "cdb/cdbvars.h" |
| #include "cdb/cdbanalyze.h" |
| #include "cdb/cdbrelsize.h" |
| #include "cdb/cdbdatalocality.h" |
| #include "utils/fmgroids.h" |
| #include "storage/backendid.h" |
| #include "executor/spi.h" |
| #include "postmaster/autovacuum.h" |
| #include "catalog/pg_namespace.h" |
| #include "utils/debugbreak.h" |
| #include "nodes/makefuncs.h" |
| #include "nodes/nodes.h" |
| #include "nodes/parsenodes.h" |
| |
| #include "commands/analyzeutils.h" |
| |
| /** |
| * Statistics related parameters. |
| */ |
| |
| int default_statistics_target = 25; |
| double analyze_relative_error = 0.25; |
| bool gp_statistics_pullup_from_child_partition = FALSE; |
| bool gp_statistics_use_fkeys = FALSE; |
| int gp_statistics_blocks_target = 25; |
| double gp_statistics_ndistinct_scaling_ratio_threshold = 0.10; |
| double gp_statistics_sampling_threshold = 10000; |
| const int gp_external_table_default_number_of_pages = 1000; |
| const int gp_external_table_default_number_of_tuples = 1000000; |
| |
| /** |
| * This struct contains statistics produced during ANALYZE |
| * on a column. |
| */ |
| typedef struct AttributeStatistics |
| { |
| float4 ndistinct; /* number of distinct values. If less than 0, represents fraction of values that are distinct */ |
| float4 nullFraction; /* fraction of values that are null */ |
| float4 avgWidth; /* average width of values */ |
| ArrayType *mcv; /* most common values */ |
| ArrayType *freq; /* frequencies of most common values */ |
| ArrayType *hist; /* equi-depth histogram bounds */ |
| } AttributeStatistics; |
| |
| /** |
| * Logging level. |
| */ |
| static int elevel = -1; |
| |
| /* |
| * in dbsize.c |
| */ |
| extern int64 calculate_relation_size(Relation rel); |
| |
| /* Top level functions */ |
| static void analyzeRelation(Relation relation, List *lAttributeNames, bool rootonly); |
| static void analyzeComputeAttributeStatistics(Oid relationOid, |
| const char *attributeName, |
| float4 relTuples, |
| Oid sampleTableOid, |
| float4 sampleTableRelTuples, |
| bool mergeStats, |
| AttributeStatistics *stats); |
| static List* analyzableRelations(bool rootonly, List** fullRelOids); |
| static bool analyzePermitted(Oid relationOid); |
| static List *analyzableAttributes(Relation candidateRelation); |
| static int calculate_virtual_segment_number(List* candidateRelations); |
| static List *buildExplicitAttributeNames(Oid relationOid, VacuumStmt *stmt); |
| |
| /* Reltuples/relpages estimation functions */ |
| static void gp_statistics_estimate_reltuples_relpages_heap(Relation rel, float4 *reltuples, float4 *relpages); |
| static void gp_statistics_estimate_reltuples_relpages_ao_rows(Relation rel, float4 *reltuples, float4 *relpages); |
| static void gp_statistics_estimate_reltuples_relpages_parquet(Relation rel, float4 *reltuples, float4 *relpages); |
| static void gp_statistics_estimate_reltuples_relpages_external(Relation rel, float4 *relTuples, float4 *relPages); |
| static void analyzeEstimateReltuplesRelpages(Oid relationOid, float4 *relTuples, float4 *relPages, bool rootonly); |
| static void analyzeEstimateIndexpages(Oid relationOid, Oid indexOid, float4 *indexPages); |
| |
| static void getExternalRelTuples(Oid relationOid, float4 *relTuples); |
| static void getExternalRelPages(Oid relationOid, float4 *relPages , Relation rel); |
| static float4 getExtrelPagesHDFS(Uri *uri); |
| static bool isExternalHDFSProtocol(Oid relOid); |
| |
| /* Attribute-type related functions */ |
| static bool isOrderedAndHashable(Oid relationOid, const char *attributeName); |
| static bool isBoolType(Oid relationOid, const char *attributeName); |
| static bool isNotNull(Oid relationOid, const char *attributeName); |
| static bool isFixedWidth(Oid relationOid, const char *attributeName, float4 *width); |
| static bool hasMaxDefined(Oid relationOid, const char *attributeName); |
| |
| /* Sampling related */ |
| static float4 estimateSampleSize(Oid relationOid, const char *attributeName, float4 relTuples); |
| static Oid buildSampleTable(Oid relationOid, |
| char* sampleTableName, |
| List *lAttributeNames, |
| float4 relTuples, |
| float4 requestedSampleSize, |
| float4 *sampleTableRelTuples); |
| |
| /* Attribute statistics computation */ |
| static int4 numberOfMCVEntries(Oid relationOid, const char *attributeName); |
| static int4 numberOfHistogramEntries(Oid relationOid, const char *attributeName); |
| static void analyzeComputeNDistinctBySample(Oid relationOid, |
| float4 relTuples, |
| Oid sampleTableOid, |
| float4 sampleTableRelTuples, |
| const char *attributeName, |
| bool *computeMCV, |
| bool *computeHist, |
| AttributeStatistics *stats |
| ); |
| static float4 analyzeComputeNDistinctAbsolute(Oid sampleTableOid, |
| float4 sampleTableRelTuples, |
| const char *attributeName); |
| static float4 analyzeComputeNRepeating(Oid relationOid, |
| const char *attributeName); |
| static float4 analyzeNullCount(Oid sampleTableOid, Oid relationOid, const char *attributeName, bool mergeStats); |
| static float4 analyzeComputeAverageWidth(Oid sampleTableOid, |
| Oid relationOid, |
| const char *attributeName, |
| float4 relTuples, |
| bool mergeStats); |
| static void analyzeComputeMCV(Oid relationOid, |
| Oid sampleTableOid, |
| const char *attributeName, |
| float4 relTuples, |
| unsigned int nEntries, |
| bool mergeStats, |
| ArrayType **mcv, |
| ArrayType **freq); |
| static void analyzeComputeHistogram(Oid relationOid, |
| Oid sampleTableOid, |
| const char *attributeName, |
| float4 relTuples, |
| unsigned int nEntries, |
| bool mergeStats, |
| ArrayType *mcv, |
| ArrayType **hist); |
| static float4 analyzeComputeNDistinctByLargestPartition(Oid relationOid, |
| float4 relTuples, |
| const char *attributeName); |
| static StringInfo getStringLeafPartitionOids(Oid relationOid); |
| static Oid get_largest_leaf_partition(Oid rootOid); |
| |
| /* Catalog related */ |
| static void updateAttributeStatisticsInCatalog(Oid relationOid, const char *attributeName, |
| AttributeStatistics *stats); |
| static void updateReltuplesRelpagesInCatalog(Oid relationOid, float4 relTuples, float4 relPages); |
| |
| /** |
| * Extern stuff. |
| */ |
| |
| extern List *find_all_inheritors(Oid parentrel); |
| extern BlockNumber RelationGuessNumberOfBlocks(double totalbytes); |
| extern Oid LookupFuncName(List *funcname, int nargs, const Oid *argtypes, bool noError); |
| extern bool optimizer; /* status of the new optimizer */ |
| extern bool optimizer_analyze_root_partition; /* GUC for stats collection on root partitions */ |
| /* |
| * To avoid consuming too much memory during analysis and/or too much space |
| * in the resulting pg_statistic rows, we ignore varlena datums that are wider |
| * than WIDTH_THRESHOLD (after detoasting!). This is legitimate for MCV |
| * and distinct-value calculations since a wide value is unlikely to be |
| * duplicated at all, much less be a most-common value. For the same reason, |
| * ignoring wide values will not affect our estimates of histogram bin |
| * boundaries very much. |
| */ |
| #define COLUMN_WIDTH_THRESHOLD 1024 |
| |
| |
| /** |
| * This is the main entry point for analyze execution. Three possible ways of calling this method. |
| * 1. Full database ANALYZE. No relations are explicitly specified. |
| * 2. List of relations is specified (Usually by autovacuum). |
| * 3. One relation is specified (optionally, a list of columns). |
| * This method can only be called in DISPATCH or UTILITY roles. |
| * Input: |
| * vacstmt - Vacuum statement. |
| * relids - Usually NULL except when called by autovacuum. |
| */ |
| void analyzeStatement(VacuumStmt *stmt, List *relids, int preferred_seg_num) |
| { |
| /* MPP-14608: Analyze may create temp tables. |
| * Disable autostats so that analyze is not called during their creation. */ |
| |
| GpAutoStatsModeValue autostatvalBackup = gp_autostats_mode; |
| GpAutoStatsModeValue autostatInFunctionsvalBackup = gp_autostats_mode_in_functions; |
| bool optimizerBackup = optimizer; |
| |
| gp_autostats_mode = GP_AUTOSTATS_NONE; |
| gp_autostats_mode_in_functions = GP_AUTOSTATS_NONE; |
| optimizer = false; |
| |
| PG_TRY(); |
| { |
| analyzeStmt(stmt, relids, preferred_seg_num); |
| gp_autostats_mode = autostatvalBackup; |
| gp_autostats_mode_in_functions = autostatInFunctionsvalBackup; |
| optimizer = optimizerBackup; |
| } |
| |
| /* Clean up in case of error. */ |
| PG_CATCH(); |
| { |
| gp_autostats_mode = autostatvalBackup; |
| gp_autostats_mode_in_functions = autostatInFunctionsvalBackup; |
| optimizer = optimizerBackup; |
| |
| /* Carry on with error handling. */ |
| PG_RE_THROW(); |
| } |
| PG_END_TRY(); |
| |
| Assert(gp_autostats_mode == autostatvalBackup); |
| Assert(gp_autostats_mode_in_functions == autostatInFunctionsvalBackup); |
| Assert(optimizer == optimizerBackup); |
| } |
| |
| /** |
| * This method can only be called in DISPATCH or UTILITY roles. |
| * Input: |
| * vacstmt - Vacuum statement. |
| * relids - Usually NULL except when called by autovacuum. |
| */ |
| void analyzeStmt(VacuumStmt *stmt, List *relids, int preferred_seg_num) |
| { |
| List *lRelOids = NIL; |
| List *lFullRelOids = NIL; |
| MemoryContext callerContext = NULL; |
| MemoryContext analyzeStatementContext = NULL; |
| MemoryContext analyzeRelationContext = NULL; |
| bool bUseOwnXacts = false; |
| ListCell *le1 = NULL; |
| int successCount = 0, failCount = 0; |
| StringInfoData failNames; |
| ResourceOwner asOwner, oldOwner1; /* asOwner for analyzeStmt resource owner */ |
| |
| /** |
| * Ensure that an ANALYZE is requested. |
| */ |
| Assert(stmt->analyze); |
| |
| /** |
| * Ensure that vacuum was not requested. |
| */ |
| Assert(!stmt->vacuum); |
| |
| /** |
| * Both relids and stmt->relation cannot be non-null. |
| */ |
| Assert(!(relids != NIL && stmt->relation != NULL)); |
| |
| /** |
| * Works only in DISPATCH and UTILITY mode. |
| */ |
| Assert(Gp_role == GP_ROLE_DISPATCH || Gp_role == GP_ROLE_UTILITY); |
| |
| /** |
| * Only works in normal processing mode - should not be called in bootstrapping or |
| * init mode. |
| */ |
| Assert(IsNormalProcessingMode()); |
| |
| /* If running in diagnostic mode, simply return */ |
| if (Gp_interconnect_type == INTERCONNECT_TYPE_NIL) |
| { |
| return; |
| } |
| |
| if (stmt->verbose) |
| elevel = INFO; |
| else |
| elevel = DEBUG2; |
| |
| callerContext = CurrentMemoryContext; |
| |
| /* |
| * This is the statement-level context. This will be cleaned up when we exit this |
| * function. |
| */ |
| analyzeStatementContext = AllocSetContextCreate(PortalContext, |
| "Analyze", |
| ALLOCSET_DEFAULT_MINSIZE, |
| ALLOCSET_DEFAULT_INITSIZE, |
| ALLOCSET_DEFAULT_MAXSIZE); |
| |
| MemoryContextSwitchTo(analyzeStatementContext); |
| |
| initStringInfo(&failNames); |
| |
| /* |
| * This is a per relation context. |
| */ |
| analyzeRelationContext = AllocSetContextCreate(analyzeStatementContext, |
| "AnalyzeRel", |
| ALLOCSET_DEFAULT_MINSIZE, |
| ALLOCSET_DEFAULT_INITSIZE, |
| ALLOCSET_DEFAULT_MAXSIZE); |
| |
| /** |
| * What relations need to be ANALYZED. |
| */ |
| if (relids == NIL && stmt->relation == NULL) |
| { |
| /** |
| * ANALYZE entire DB. |
| */ |
| lRelOids = analyzableRelations(stmt->rootonly, &lFullRelOids); |
| if (stmt->rootonly && NIL == lRelOids) |
| { |
| ereport(WARNING, |
| (errmsg("there are no partitioned tables in database to ANALYZE ROOTPARTITION"))); |
| } |
| } |
| else if (relids != NIL) |
| { |
| /** |
| * ANALYZE called by autovacuum. |
| */ |
| lRelOids = relids; |
| lFullRelOids = relids; |
| } |
| else |
| { |
| /** |
| * ANALYZE one relation (optionally, a list of columns). |
| */ |
| Oid relationOid = InvalidOid; |
| Assert(relids == NIL); |
| Assert(stmt->relation != NULL); |
| relationOid = RangeVarGetRelid(stmt->relation, false, false /*allowHcatalog*/); |
| PartStatus ps = rel_part_status(relationOid); |
| |
| if (ps != PART_STATUS_ROOT && stmt->rootonly) |
| { |
| ereport(WARNING, |
| (errmsg("skipping \"%s\" --- cannot analyze a non-root partition using ANALYZE ROOTPARTITION", |
| get_rel_name(relationOid)))); |
| } |
| else if (ps == PART_STATUS_ROOT) |
| { |
| PartitionNode *pn = get_parts(relationOid, 0 /*level*/ , |
| 0 /*parent*/, false /* inctemplate */, CurrentMemoryContext, true /*includesubparts*/); |
| Assert(pn); |
| if (!stmt->rootonly) |
| { |
| lRelOids = all_leaf_partition_relids(pn); /* all leaves */ |
| } |
| lFullRelOids = all_leaf_partition_relids(pn); |
| lRelOids = lappend_oid(lRelOids, relationOid); /* root partition */ |
| lFullRelOids = lappend_oid(lFullRelOids, relationOid); /* root partition */ |
| if (optimizer_analyze_midlevel_partition) |
| { |
| lRelOids = list_concat(lRelOids, all_interior_partition_relids(pn)); /* interior partitions */ |
| } |
| lFullRelOids = list_concat(lFullRelOids, all_interior_partition_relids(pn)); /* interior partitions */ |
| } |
| else if (ps == PART_STATUS_INTERIOR) /* analyze an interior partition directly */ |
| { |
| /* disable analyzing mid-level partitions directly since the users are encouraged |
| * to work with the root partition only. To gather stats on mid-level partitions |
| * (for Orca's use), the user should run ANALYZE or ANALYZE ROOTPARTITION on the |
| * root level. |
| */ |
| ereport(WARNING, |
| (errmsg("skipping \"%s\" --- cannot analyze a mid-level partition. " |
| "Please run ANALYZE on the root partition table.", |
| get_rel_name(relationOid)))); |
| } |
| else if (!isExternalHDFSProtocol(relationOid)) |
| { |
| /* |
| * Support analyze for external table. |
| * For now, HDFS protocol external table is supported. |
| */ |
| ereport(WARNING, |
| (errmsg("skipping \"%s\" --- cannot analyze external table with non-HDFS or non-MAGMA protocol. " |
| "Please run ANALYZE on external table with HDFS or MAGMA protocol.", |
| get_rel_name(relationOid)))); |
| } |
| else |
| { |
| lRelOids = list_make1_oid(relationOid); |
| lFullRelOids = list_make1_oid(relationOid); |
| } |
| } |
| |
| /* |
| * Decide whether we need to start/commit our own transactions. |
| * The scenarios in which we can start/commit our own transactions are: |
| * 1. We are not in a transaction block and there are multiple relations specified (some of them may be implicit) |
| * 2. We are in autovacuum mode |
| */ |
| |
| if ((!IsInTransactionChain((void *) stmt) && list_length(lRelOids) > 1) |
| || IsAutoVacuumProcess()) |
| bUseOwnXacts = true; |
| |
| elog(DEBUG2, "ANALYZE start/commit own transaction = %s", bUseOwnXacts ? "true" : "false"); |
| |
| /** |
| * Iterate through all relids in the list and issue analyze on all columns on each relation. |
| */ |
| |
| if (bUseOwnXacts) |
| { |
| /* |
| * We commit the transaction started in PostgresMain() here, and start |
| * another one before exiting to match the commit waiting for us back in |
| * PostgresMain(). |
| */ |
| CommitTransactionCommand(); |
| MemoryContextSwitchTo(analyzeStatementContext); |
| } |
| |
| /** |
| * Create a resource owner to keep track of our resources even not in trasaction block |
| */ |
| asOwner = ResourceOwnerCreate(CurrentResourceOwner, "analyzeStmt"); |
| oldOwner1 = CurrentResourceOwner; |
| CurrentResourceOwner = asOwner; |
| |
| /** |
| * we use preferred_seg_num as default and |
| * compute target_seg_num based on data size and distributed type |
| * if there is no preferred_seg_num. |
| */ |
| int target_seg_num = preferred_seg_num; |
| if (target_seg_num <= 0) { |
| target_seg_num = calculate_virtual_segment_number(lFullRelOids); |
| } |
| elog(LOG, "virtual segment number of analyze is: %d\n", target_seg_num); |
| |
| /** |
| * We open relations with appropreciate locks |
| */ |
| List *candidateRelations = NIL; |
| |
| foreach (le1, lRelOids) |
| { |
| Oid candidateOid = InvalidOid; |
| Relation candidateRelation = NULL; |
| |
| Assert(analyzeStatementContext == CurrentMemoryContext); |
| |
| candidateOid = lfirst_oid(le1); |
| candidateRelation = |
| try_relation_open(candidateOid, ShareUpdateExclusiveLock, false); |
| |
| if (candidateRelation) |
| { |
| candidateRelations = lappend(candidateRelations, candidateRelation); |
| } |
| else |
| { |
| elog(ERROR, "Cannot open and lock relation %s for analyze", |
| RelationGetRelationName(candidateRelation)); |
| } |
| } |
| |
| /** |
| * We allocate query resource for analyze |
| */ |
| QueryResource *resource = AllocateResource(QRL_ONCE, 1, 0, target_seg_num, target_seg_num, NULL, 0); |
| QueryResource *savedResource = NULL; |
| |
| savedResource = GetActiveQueryResource(); |
| SetActiveQueryResource(resource); |
| |
| /** |
| * We do actual analyze |
| */ |
| PG_TRY(); |
| { |
| foreach (le1, candidateRelations) |
| { |
| Relation candidateRelation = NULL; |
| bool bTemp = false; |
| |
| Assert(analyzeStatementContext == CurrentMemoryContext); |
| |
| if (bUseOwnXacts) |
| { |
| /** |
| * We use a different transaction per relation so that we |
| * may release locks on relations as soon as possible. |
| */ |
| StartTransactionCommand(); |
| ActiveSnapshot = CopySnapshot(GetTransactionSnapshot()); |
| MemoryContextSwitchTo(analyzeStatementContext); |
| } |
| |
| candidateRelation = (Relation)lfirst(le1); |
| |
| if (candidateRelation) |
| { |
| /** |
| * We got a lock on the relation. Good! |
| */ |
| if (analyzePermitted(RelationGetRelid(candidateRelation))) |
| { |
| StringInfoData ext_uri; |
| |
| /* |
| * We have permission to ANALYZE. |
| */ |
| |
| /* MPP-7576: don't track internal namespace tables */ |
| switch (candidateRelation->rd_rel->relnamespace) |
| { |
| case PG_CATALOG_NAMESPACE: |
| /* MPP-7773: don't track objects in system namespace |
| * if modifying system tables (eg during upgrade) |
| */ |
| if (allowSystemTableModsDDL) |
| { |
| bTemp = true; |
| } |
| break; |
| |
| case PG_TOAST_NAMESPACE: |
| case PG_BITMAPINDEX_NAMESPACE: |
| case PG_AOSEGMENT_NAMESPACE: |
| bTemp = true; |
| break; |
| |
| default: |
| break; |
| } |
| |
| /* MPP-7572: Don't track metadata if table in any |
| * temporary namespace |
| */ |
| if (!bTemp) |
| { |
| bTemp = isAnyTempNamespace( |
| candidateRelation->rd_rel->relnamespace); |
| } |
| |
| initStringInfo(&ext_uri); |
| |
| if (candidateRelation->rd_rel->relkind != RELKIND_RELATION) |
| { |
| /** |
| * Is the relation the right kind? |
| */ |
| ereport(WARNING, |
| (errmsg("skipping \"%s\" --- cannot analyze indexes, views, external tables or special system tables", |
| RelationGetRelationName(candidateRelation)))); |
| |
| relation_close_at_resource_owner(candidateRelation, ShareUpdateExclusiveLock, asOwner); |
| } |
| else if (isOtherTempNamespace(RelationGetNamespace(candidateRelation))) |
| { |
| /* Silently ignore tables that are temp tables of other backends. */ |
| relation_close_at_resource_owner(candidateRelation, ShareUpdateExclusiveLock, asOwner); |
| } |
| else if (RelationIsExternalPxfReadOnly(candidateRelation, &ext_uri) && |
| (!pxf_enable_stat_collection)) |
| { |
| /* PXF supports ANALYZE, but only when the GUC is on */ |
| ereport(WARNING, |
| (errmsg("skipping \"%s\" --- analyze for PXF tables is turned off by 'pxf_enable_stat_collection'", |
| RelationGetRelationName(candidateRelation)))); |
| |
| relation_close_at_resource_owner(candidateRelation, ShareUpdateExclusiveLock, asOwner); |
| } |
| else |
| { |
| List *lAttNames = NIL; |
| |
| /* Switch to per relation context */ |
| MemoryContextSwitchTo(analyzeRelationContext); |
| |
| if (stmt->va_cols) |
| { |
| /** |
| * Column names have been provided. Should have specified relation name as well. |
| */ |
| Assert(stmt->relation && "Column names specified but not relation name"); |
| lAttNames = buildExplicitAttributeNames(RelationGetRelid(candidateRelation), stmt); |
| } |
| else |
| { |
| lAttNames = analyzableAttributes(candidateRelation); |
| } |
| |
| /* Start a sub-transaction for each analyzed table */ |
| MemoryContext oldcontext = CurrentMemoryContext; |
| ResourceOwner oldowner = CurrentResourceOwner; |
| BeginInternalSubTransaction(NULL); |
| MemoryContextSwitchTo(oldcontext); |
| |
| PG_TRY(); |
| { |
| analyzeRelation(candidateRelation, lAttNames, stmt->rootonly); |
| |
| #ifdef FAULT_INJECTOR |
| FaultInjector_InjectFaultIfSet( |
| AnalyzeSubxactError, |
| DDLNotSpecified, |
| "", /* databaseName */ |
| ""); /* tableName */ |
| #endif /* FAULT_INJECTOR */ |
| |
| ReleaseCurrentSubTransaction(); |
| MemoryContextSwitchTo(oldcontext); |
| CurrentResourceOwner = oldowner; |
| successCount += 1; |
| } |
| PG_CATCH(); |
| { |
| ErrorData *edata; |
| |
| /* Save error info */ |
| MemoryContextSwitchTo(oldcontext); |
| edata = CopyErrorData(); |
| FlushErrorState(); |
| |
| elog(WARNING, "skipping \"%s\" --- error returned: %s", |
| RelationGetRelationName(candidateRelation), |
| edata->message); |
| failCount += 1; |
| |
| /* failNames uses memory from statement level context */ |
| MemoryContextSwitchTo(analyzeStatementContext); |
| appendStringInfo(&failNames, "%s", failCount == 1 ? "(" : ", "); |
| appendStringInfo(&failNames, "%s", RelationGetRelationName(candidateRelation)); |
| MemoryContextSwitchTo(oldcontext); |
| |
| /* rollback this table's sub-transaction */ |
| RollbackAndReleaseCurrentSubTransaction(); |
| MemoryContextSwitchTo(oldcontext); |
| CurrentResourceOwner = oldowner; |
| |
| /* Cancel from user should result in canceling ANALYZE, not just this table */ |
| if (edata->sqlerrcode == ERRCODE_QUERY_CANCELED) |
| { |
| ReThrowError(edata); |
| } |
| else |
| { |
| /* release error state */ |
| FreeErrorData(edata); |
| } |
| } |
| PG_END_TRY(); |
| |
| /* Switch back to statement context and reset relation context */ |
| MemoryContextSwitchTo(analyzeStatementContext); |
| MemoryContextResetAndDeleteChildren(analyzeRelationContext); |
| |
| /* |
| * Close source relation now, but keep lock so |
| * that no one deletes it before we commit. (If |
| * someone did, they'd fail to clean up the |
| * entries we made in pg_statistic. Also, |
| * releasing the lock before commit would expose |
| * us to concurrent-update failures.) |
| */ |
| relation_close_at_resource_owner(candidateRelation, NoLock, asOwner); |
| |
| /* MPP-6929: metadata tracking */ |
| if (!bTemp && (Gp_role == GP_ROLE_DISPATCH)) |
| { |
| char *asubtype = ""; |
| |
| if (IsAutoVacuumProcess()) |
| { |
| asubtype = "AUTO"; |
| } |
| |
| MetaTrackUpdObject(RelationRelationId, |
| RelationGetRelid(candidateRelation), |
| GetUserId(), |
| "ANALYZE", |
| asubtype |
| ); |
| } |
| } |
| } |
| else |
| { |
| /** |
| * We don't have permissions to ANALYZE the relation. Print warning and move on |
| * to the next relation. |
| */ |
| ereport(WARNING, |
| (errmsg("Skipping \"%s\" --- only table or database owner can analyze it", |
| RelationGetRelationName(candidateRelation)))); |
| |
| relation_close_at_resource_owner(candidateRelation, ShareUpdateExclusiveLock, asOwner); |
| } /* if (analyzePermitted(RelationGetRelid(candidateRelation))) */ |
| } |
| else |
| { |
| /* |
| * Relation may have been dropped out from under us. |
| * TODO: should we print a warning here? Do we print it during |
| * ANALYZE DB or AutoVacuum? |
| */ |
| } /* if (candidateRelation) */ |
| |
| if (bUseOwnXacts) |
| { |
| /** |
| * We commit the transaction so that locks on the relation may be released. |
| */ |
| CommitTransactionCommand(); |
| MemoryContextSwitchTo(analyzeStatementContext); |
| } |
| } |
| } /* End of PG_TRY() */ |
| PG_CATCH(); |
| { |
| FreeResource(resource); |
| resource = NULL; |
| UnsetActiveQueryResource(); |
| SetActiveQueryResource(savedResource); |
| |
| /* Carry on with error handling. */ |
| PG_RE_THROW(); |
| } |
| PG_END_TRY(); |
| |
| /** |
| * We now free query resource |
| */ |
| FreeResource(resource); |
| resource = NULL; |
| UnsetActiveQueryResource(); |
| SetActiveQueryResource(savedResource); |
| |
| ResourceOwnerRelease(asOwner, |
| RESOURCE_RELEASE_BEFORE_LOCKS, |
| false, false); |
| CurrentResourceOwner = oldOwner1; |
| ResourceOwnerDelete(asOwner); |
| |
| if (bUseOwnXacts) |
| { |
| /** |
| * We start a new transaction command to match the one in PostgresMain(). |
| */ |
| /* |
| * in hawq, there is no distributed transaction |
| */ |
| /*setupRegularDtxContext();*/ |
| StartTransactionCommand(); |
| ActiveSnapshot = CopySnapshot(GetTransactionSnapshot()); |
| MemoryContextSwitchTo(analyzeStatementContext); |
| } |
| |
| if (failCount > 0) |
| { |
| appendStringInfo(&failNames, ")"); |
| } |
| elog(failCount > 0 ? INFO : elevel, "ANALYZE completed. Success: %d, Failure: %d %s", successCount, failCount, failNames.data); |
| pfree(failNames.data); |
| |
| Assert(analyzeStatementContext == CurrentMemoryContext); |
| MemoryContextSwitchTo(callerContext); |
| MemoryContextDelete(analyzeStatementContext); |
| } |
| |
| /* |
| * calculate virtual segment number for analyze statement. |
| * if there is hash distributed relations exist, use the max bucket number. |
| * if all relation are random, use the data size to determine vseg number. |
| */ |
| static int calculate_virtual_segment_number(List* candidateOids) { |
| ListCell* le1; |
| int vsegNumber = 1; |
| int64 totalDataSize = 0; |
| bool isHashRelationExist = false; |
| int maxHashBucketNumber = 0; |
| bool isPartitionTableExist = false; |
| foreach (le1, candidateOids) |
| { |
| Oid candidateOid = InvalidOid; |
| candidateOid = lfirst_oid(le1); |
| |
| PartStatus ps = rel_part_status(candidateOid); |
| if(ps != PART_STATUS_NONE){ |
| isPartitionTableExist = true; |
| } |
| |
| Relation rel = relation_open(candidateOid, ShareUpdateExclusiveLock); |
| if (candidateOid > 0 ) { |
| GpPolicy *targetPolicy = GpPolicyFetch(CurrentMemoryContext, |
| candidateOid); |
| if(targetPolicy == NULL){ |
| return GetAnalyzeVSegNumLimit(isPartitionTableExist); |
| } |
| if (targetPolicy->nattrs > 0) { |
| isHashRelationExist = true; |
| if(maxHashBucketNumber < targetPolicy->bucketnum){ |
| maxHashBucketNumber = targetPolicy->bucketnum; |
| } |
| } |
| /* |
| * if no hash relation, we calculate the data size of all the relations. |
| */ |
| if (!isHashRelationExist) { |
| totalDataSize += calculate_relation_size(rel); |
| } |
| } |
| relation_close(rel, NoLock); |
| } |
| |
| if (isHashRelationExist) { |
| vsegNumber = maxHashBucketNumber; |
| } else { |
| /*we allocate one virtual segment for each 128M data */ |
| totalDataSize >>= 27; |
| vsegNumber = totalDataSize + 1; |
| } |
| Assert(vsegNumber > 0); |
| /*vsegNumber should be less than GetUtilPartitionNum*/ |
| if(vsegNumber > GetAnalyzeVSegNumLimit(isPartitionTableExist)){ |
| vsegNumber = GetAnalyzeVSegNumLimit(isPartitionTableExist); |
| } |
| |
| return vsegNumber; |
| } |
| |
| /* |
| * This method extracts the explicit attributes listed in a vacuum statement. It must |
| * be called only when it is known that explicit columns have been specified in the vacuum |
| * statement. It checks if attribute exists and also if attribute has been dropped. It |
| * also silently drops attributes that have stats target set to 0. |
| * Input: |
| * vacstmt - vacuum statement |
| * Output: |
| * list of attribute names that the vacuum statement requests |
| */ |
| static List* buildExplicitAttributeNames(Oid relationOid, VacuumStmt *stmt) |
| { |
| List *lExplicitAttNames = NIL; |
| ListCell *le = NULL; |
| Assert(stmt->va_cols != NULL); |
| /** |
| * va_col contains list of attributes that need to be analyzed. |
| */ |
| foreach (le, stmt->va_cols) |
| { |
| HeapTuple attributeTuple; |
| cqContext *pcqCtx; |
| char *attributeName = strVal(lfirst(le)); |
| Assert(attributeName); |
| |
| pcqCtx = caql_getattname_scan(NULL, relationOid, attributeName); |
| |
| attributeTuple = caql_get_current(pcqCtx); |
| |
| /* Ensure that we can actually analyze the attribute. */ |
| if (HeapTupleIsValid(attributeTuple)) |
| { |
| Form_pg_attribute att_tup = (Form_pg_attribute) GETSTRUCT(attributeTuple); |
| /* If attribute is dropped, print an error message */ |
| if (att_tup->attisdropped) |
| ereport(ERROR, |
| (errcode(ERRCODE_UNDEFINED_COLUMN), |
| errmsg("Attribute %s has been dropped in relation %s.", |
| attributeName, get_rel_name(relationOid)))); |
| |
| /* If statstarget=0 or type is "unknown", silently skip it. */ |
| if (att_tup->attstattarget != 0 |
| && att_tup->atttypid != UNKNOWNOID) |
| { |
| lExplicitAttNames = lappend(lExplicitAttNames, attributeName); |
| } |
| } |
| else |
| { |
| /* Attribute does not exist in relation. Error out */ |
| ereport(ERROR, |
| (errcode(ERRCODE_UNDEFINED_COLUMN), |
| errmsg("Relation %s does not have an attribute named %s.", |
| get_rel_name(relationOid),attributeName))); |
| } |
| |
| caql_endscan(pcqCtx); |
| |
| } |
| return lExplicitAttNames; |
| } |
| |
| /** |
| * This function determines if a table can be analyzed or not. |
| * Input: |
| * relationOid |
| * Output: |
| * true or false |
| */ |
| static bool analyzePermitted(Oid relationOid) |
| { |
| return (pg_class_ownercheck(relationOid, GetUserId()) |
| || pg_database_ownercheck(MyDatabaseId, GetUserId())); |
| } |
| |
| /** |
| * If ANALYZE is requested with no relations specified, this method is called to build |
| * the implicit list of relations from pg_class. Only those with relkind == RELKIND_RELATION |
| * are considered. |
| * If rootonly is true, we only analyze root partition table. |
| * |
| * Input: |
| * None |
| * Output: |
| * List of relids |
| */ |
| static List* analyzableRelations(bool rootonly, List **fullRelOids) |
| { |
| List *lRelOids = NIL; |
| cqContext *pcqCtx; |
| HeapTuple tuple = NULL; |
| |
| pcqCtx = caql_beginscan( |
| NULL, |
| cql("SELECT * FROM pg_class " |
| " WHERE relkind = :1 ", |
| CharGetDatum(RELKIND_RELATION))); |
| |
| while (HeapTupleIsValid(tuple = caql_getnext(pcqCtx))) |
| { |
| Oid candidateOid = HeapTupleGetOid(tuple); |
| bool isExternalHDFS = isExternalHDFSProtocol(candidateOid); |
| if (analyzePermitted(candidateOid) && |
| candidateOid != StatisticRelationId && |
| isExternalHDFS) |
| { |
| *fullRelOids = lappend_oid(*fullRelOids, candidateOid); |
| } |
| if (rootonly && !rel_is_partitioned(candidateOid)) |
| { |
| continue; |
| } |
| if (analyzePermitted(candidateOid) && |
| candidateOid != StatisticRelationId && |
| isExternalHDFS) |
| { |
| lRelOids = lappend_oid(lRelOids, candidateOid); |
| } |
| } |
| |
| caql_endscan(pcqCtx); |
| |
| return lRelOids; |
| } |
| |
| /** |
| * Given a relation's Oid, generate the list of attribute names that may be analyzed. |
| * This ignores columns that have been dropped or if stattarget is set to 0 by user. |
| * Input: |
| * relation |
| * Output: |
| * List of attribute names. This will need to be free'd by the caller. |
| */ |
| static List *analyzableAttributes(Relation candidateRelation) |
| { |
| int i = 0; |
| List *lAttNames = NIL; |
| Assert(candidateRelation != NULL); |
| for (i = 0; i < candidateRelation->rd_att->natts; i++) |
| { |
| Form_pg_attribute attr = candidateRelation->rd_att->attrs[i]; |
| Assert(attr); |
| /* |
| * Skip if one of these conditions is true: |
| * 1. Attribute is dropped |
| * 2. Stats target for attribute is 0 |
| * 3. Attribute has "unknown" type. |
| */ |
| if (!(attr->attisdropped |
| || attr->attstattarget == 0 |
| || attr->atttypid == UNKNOWNOID)) |
| { |
| char *attName = NULL; |
| attName = pstrdup(NameStr(attr->attname)); /* needs to be pfree'd by caller */ |
| Assert(attName); |
| lAttNames = lappend(lAttNames, (void *) attName); |
| } |
| } |
| return lAttNames; |
| } |
| |
| /** |
| * This method is called once all the transactions and snapshots have been set up. |
| * At a very high-level, it performs two functions: |
| * 1. Compute and update reltuples, relages for the relation. |
| * 2. Compute and update statistics on requested attributes. |
| * If the input relation is too large, it may create a sampled version of the relation |
| * and compute statistics. |
| * TODO: Check with Daria/Eric about messages. |
| * Input: |
| * relation - relation |
| * lAttributeNames - list of attribute names. |
| * Output: |
| * None |
| */ |
| static void analyzeRelation(Relation relation, List *lAttributeNames, bool rootonly) |
| { |
| Oid sampleTableOid = InvalidOid; |
| float4 minSampleTableSize = 0; |
| bool sampleTableRequired = true; |
| bool isExternalPxfReadOnly = false; |
| ListCell *le = NULL; |
| Oid relationOid = InvalidOid; |
| float4 estimatedRelTuples = 0.0; |
| float4 estimatedRelPages = 0.0; |
| float4 sampleTableRelTuples = 0.0; |
| List *indexOidList = NIL; |
| ListCell *lc = NULL; |
| StringInfoData location; |
| |
| initStringInfo(&location); |
| relationOid = RelationGetRelid(relation); |
| isExternalPxfReadOnly = RelationIsExternalPxfReadOnly(relation, &location); |
| |
| /* Step 1: estimate reltuples, relpages for the relation */ |
| if (!isExternalPxfReadOnly) |
| { |
| analyzeEstimateReltuplesRelpages(relationOid, &estimatedRelTuples, &estimatedRelPages, rootonly); |
| } |
| else |
| { |
| analyzePxfEstimateReltuplesRelpages(relation, &location, &estimatedRelTuples, &estimatedRelPages); |
| } |
| pfree(location.data); |
| |
| elog(elevel, "ANALYZE estimated reltuples=%f, relpages=%f for table %s", estimatedRelTuples, estimatedRelPages, RelationGetRelationName(relation)); |
| |
| /* Step 2: update the pg_class entry. */ |
| updateReltuplesRelpagesInCatalog(relationOid, estimatedRelTuples, estimatedRelPages); |
| |
| /* Find relpages of each index and update these as well */ |
| indexOidList = RelationGetIndexList(relation); |
| foreach (lc, indexOidList) |
| { |
| float4 estimatedIndexTuples = estimatedRelTuples; |
| float4 estimatedIndexPages = 0; |
| |
| Oid indexOid = lfirst_oid(lc); |
| Assert(indexOid != InvalidOid); |
| |
| if (estimatedRelTuples < 1.0) |
| { |
| /** |
| * If there are no rows in the relation, no point trying to estimate |
| * number of pages in the index. |
| */ |
| elog(elevel, "ANALYZE skipping index %s since relation %s has no rows.", get_rel_name(indexOid), get_rel_name(relationOid)); |
| } |
| else |
| { |
| /** |
| * NOTE: we don't attempt to estimate the number of tuples in an index. |
| * We will assume it to be equal to the estimated number of tuples in the relation. |
| * This does not hold for partial indexes. The number of tuples matching will be |
| * derived in selfuncs.c using the base table statistics. |
| */ |
| analyzeEstimateIndexpages(relationOid, indexOid, &estimatedIndexPages); |
| elog(elevel, "ANALYZE estimated reltuples=%f, relpages=%f for index %s", estimatedIndexTuples, estimatedIndexPages, get_rel_name(indexOid)); |
| } |
| |
| updateReltuplesRelpagesInCatalog(indexOid, estimatedIndexTuples, estimatedIndexPages); |
| } |
| |
| /* report results to the stats collector, too */ |
| pgstat_report_analyze(relation, estimatedRelTuples, 0 /*totaldeadrows*/); |
| |
| /** |
| * Does the relation have any rows. If not, no point analyzing columns. |
| */ |
| if (estimatedRelTuples < 1.0) |
| { |
| elog(elevel, "ANALYZE skipping computing statistics on table %s because it has no rows.", RelationGetRelationName(relation)); |
| return; |
| } |
| |
| /* Cannot compute statistics on pg_statistic due to locking issues */ |
| if (relationOid == StatisticRelationId) |
| { |
| elog(elevel, "ANALYZE skipping computing statistics on pg_statistic."); |
| return; |
| } |
| |
| /** |
| * For an interior (mid-level) partition, we merge the stats from the leaf children under it. |
| * The stats at mid-level will be used by the new query optimizer when querying mid-level partitions |
| * directly. The merge of stats has to be triggered from the root level. |
| */ |
| if (rel_part_status(relationOid) == PART_STATUS_INTERIOR) |
| { |
| foreach (le, lAttributeNames) |
| { |
| AttributeStatistics stats; |
| const char *lAttributeName = (const char *) lfirst(le); |
| elog(elevel, "ANALYZE merging statistics on attribute %s", lAttributeName); |
| analyzeComputeAttributeStatistics(relationOid, lAttributeName, estimatedRelTuples, relationOid, estimatedRelTuples, true /*mergeStats*/, &stats); |
| updateAttributeStatisticsInCatalog(relationOid, lAttributeName, &stats); |
| } |
| |
| return; |
| } |
| |
| /** |
| * Determine how many rows need to be sampled. |
| */ |
| foreach (le, lAttributeNames) |
| { |
| const char *attributeName = (const char *) lfirst(le); |
| float4 minRowsForAttribute = estimateSampleSize(relationOid, attributeName, estimatedRelTuples); |
| minSampleTableSize = Max(minSampleTableSize, minRowsForAttribute); |
| } |
| |
| /** |
| * If no statistics are needed on any attribute, then fall through quickly. |
| */ |
| if (minSampleTableSize == 0) |
| { |
| elog(elevel, "ANALYZE skipping computing statistics on table %s because no attribute needs it.", RelationGetRelationName(relation)); |
| return; |
| } |
| |
| /** |
| * Determine if a sample table needs to be created. If reltuples is very small, |
| * then, we'd rather work off the entire table. Also, if the sample required is |
| * the size of the table, then we'd rather work off the entire table. |
| * |
| * In case of PXF table, we always need a sample table because the various calculations |
| * should be done locally in HAWQ and not by retrieving the data again and again. |
| */ |
| if (!isExternalPxfReadOnly && |
| (estimatedRelTuples <= gp_statistics_sampling_threshold |
| || minSampleTableSize >= estimatedRelTuples)) /* maybe this should be K% of reltuples or something? */ |
| { |
| sampleTableRequired = false; |
| } |
| |
| /** |
| * Step 3: If required, create a sample table |
| */ |
| if (sampleTableRequired) |
| { |
| char * sampleTableName = temporarySampleTableName(relationOid, "pg_analyze"); /* must be pfreed */ |
| |
| elog(elevel, "ANALYZE building sample table of size %.0f on table %s because %s.", |
| minSampleTableSize, RelationGetRelationName(relation), |
| isExternalPxfReadOnly ? "it's a PXF table" : "it has too many rows"); |
| |
| if (isExternalPxfReadOnly) |
| { |
| sampleTableOid = buildPxfSampleTable(relationOid, sampleTableName, lAttributeNames, |
| estimatedRelTuples, estimatedRelPages, minSampleTableSize, |
| &sampleTableRelTuples); |
| } |
| else |
| { |
| sampleTableOid = buildSampleTable(relationOid, sampleTableName, lAttributeNames, |
| estimatedRelTuples, minSampleTableSize, &sampleTableRelTuples); |
| } |
| /* |
| * Update the sample table's reltuples, relpages. Without these, the queries to the sample table would call cdbRelsize which can be an expensive call. |
| * We know the number of tuples in the sample table, but don't have the information about the number of pages. We set it to 2 arbitrarily. |
| */ |
| updateReltuplesRelpagesInCatalog(sampleTableOid, sampleTableRelTuples, 2); |
| |
| /* We must have a non-empty sample table */ |
| Assert(sampleTableRelTuples > 0.0); |
| |
| pfree((void *) sampleTableName); |
| } |
| |
| /** |
| * Step 4: ANALYZE attributes, one at a time. |
| */ |
| foreach (le, lAttributeNames) |
| { |
| AttributeStatistics stats; |
| const char *lAttributeName = (const char *) lfirst(le); |
| elog(elevel, "ANALYZE computing statistics on attribute %s", lAttributeName); |
| if (sampleTableRequired) |
| analyzeComputeAttributeStatistics(relationOid, lAttributeName, estimatedRelTuples, sampleTableOid, sampleTableRelTuples, false /*mergeStats*/, &stats); |
| else |
| analyzeComputeAttributeStatistics(relationOid, lAttributeName, estimatedRelTuples, relationOid, estimatedRelTuples, false /*mergeStats*/, &stats); |
| updateAttributeStatisticsInCatalog(relationOid, lAttributeName, &stats); |
| } |
| |
| /** |
| * Step 5: Cleanup. Drop the sample table. |
| */ |
| if (sampleTableRequired) |
| { |
| elog(elevel, "ANALYZE dropping sample table"); |
| dropSampleTable(sampleTableOid, false); |
| } |
| |
| return; |
| } |
| |
| /** |
| * Generates a table name for the auxiliary sample table that may be created during ANALYZE. |
| * This is not super random. However, this should be sufficient for our purpose. |
| * Input: |
| * relationOid - relation |
| * prefix - sample name prefix |
| * Output: |
| * sample table name. This must be pfree'd by the caller. |
| */ |
| char* temporarySampleTableName(Oid relationOid, char* prefix) |
| { |
| char tmpname[NAMEDATALEN]; |
| snprintf(tmpname, NAMEDATALEN, "%s_%u_%i", prefix, relationOid, MyBackendId); |
| return pstrdup(tmpname); |
| } |
| |
| /** |
| * This method determines the number of mcv entries to be computed for a particular attribute. |
| * Input: |
| * relationOid - Oid of relation |
| * attributeName - Name of attribute |
| * Output: |
| * number of mcv entries. |
| */ |
| static int4 numberOfMCVEntries(Oid relationOid, const char *attributeName) |
| { |
| HeapTuple attributeTuple = NULL; |
| Form_pg_attribute attribute = NULL; |
| int4 nMCVEntries = 0; |
| cqContext *pcqCtx; |
| |
| pcqCtx = caql_getattname_scan(NULL, relationOid, attributeName); |
| |
| attributeTuple = caql_get_current(pcqCtx); |
| Assert(HeapTupleIsValid(attributeTuple)); |
| attribute = (Form_pg_attribute) GETSTRUCT(attributeTuple); |
| nMCVEntries = attribute->attstattarget; |
| |
| caql_endscan(pcqCtx); |
| |
| if (nMCVEntries < 0) |
| nMCVEntries = (int4) default_statistics_target; |
| return nMCVEntries; |
| } |
| |
| /** |
| * This method determines the number of histogram entries to be computed for a particular attribute. |
| * Input: |
| * relationOid - relation's Oid |
| * attributeName - Name of attribute |
| * Output: |
| * number of histogram entries. Currently, this is the same as the number of MCV entries. |
| */ |
| static int4 numberOfHistogramEntries(Oid relationOid, const char *attributeName) |
| { |
| /** for now, the number of histogram entries is same as mcv_entries */ |
| return numberOfMCVEntries(relationOid, attributeName); |
| } |
| |
| /** |
| * Determine the number of tuples from relation that will need to be sampled |
| * to compute statistics on the specific attribute. This method looks up the |
| * stats target for the said attribute (if one is not found, the default_statistics_target |
| * is used) and employs a formula (explained below) to compute the number of rows |
| * needed. |
| * Input: |
| * relationOid - relation whose sample size must be determined |
| * attributeName - attribute |
| * relTuples - number of tuples in relation. |
| * Output: |
| * sample size |
| * |
| */ |
| static float4 estimateSampleSize(Oid relationOid, const char *attributeName, float4 relTuples) |
| { |
| float4 sampleSize = 0.0; |
| int4 statsTarget = 0.0; |
| |
| statsTarget = Max(numberOfHistogramEntries(relationOid, attributeName), |
| numberOfMCVEntries(relationOid, attributeName)); |
| Assert(statsTarget >= 0); |
| /*-------------------- |
| * The following choice of minrows is based on the paper |
| * "Random sampling for histogram construction: how much is enough?" |
| * by Surajit Chaudhuri, Rajeev Motwani and Vivek Narasayya, in |
| * Proceedings of ACM SIGMOD International Conference on Management |
| * of Data, 1998, Pages 436-447. Their Corollary 1 to Theorem 5 |
| * says that for table size n, histogram size k, maximum relative |
| * error fraction, and error probability gamma, the minimum |
| * random sample size is |
| * r = 4 * k * ln(2*n/gamma) / error_fraction^2 |
| * We use gamma=0.01 in our calculations. |
| *-------------------- |
| */ |
| Assert(relTuples > 0.0); |
| if (analyze_relative_error > 0.0) |
| { |
| sampleSize = (float4) 4 * statsTarget * log(2 * relTuples / 0.01) / (analyze_relative_error * analyze_relative_error); |
| } |
| else |
| { |
| sampleSize = relTuples; |
| } |
| |
| sampleSize = Max(sampleSize, 0.0); /* Sanity check for low relTuples */ |
| sampleSize = Min(sampleSize, relTuples); /* Bound sample size to table size */ |
| |
| return sampleSize; |
| } |
| |
| /** |
| * This is a helper method that executes a SQL statement using the SPI interface. |
| * It optionally calls a callback function with result pointer. |
| * Input: |
| * src - SQL string |
| * read_only - is it a read-only call? |
| * tcount - execution tuple-count limit, or 0 for none |
| * callbackFn - callback function to be executed once SPI is done. |
| * clientData - argument to call back function (usually pointer to data-structure |
| * that the callback function populates). |
| * |
| */ |
| void spiExecuteWithCallback( |
| const char *src, |
| bool read_only, |
| long tcount, |
| spiCallback callbackFn, |
| void *clientData) |
| { |
| volatile bool connected = false; /* needs to be volatile when accessed by PG_CATCH */ |
| int ret = 0; |
| |
| PG_TRY(); |
| { |
| if (SPI_OK_CONNECT != SPI_connect()) |
| { |
| ereport(ERROR, (errcode(ERRCODE_CDB_INTERNAL_ERROR), |
| errmsg("Unable to connect to execute internal query: %s.", src))); |
| } |
| connected = true; |
| |
| elog(DEBUG2, "Executing SQL: %s", src); |
| |
| /* Do the query. */ |
| ret = SPI_execute(src, read_only, tcount); |
| Assert(ret > 0); |
| |
| if (callbackFn) |
| { |
| callbackFn(clientData); |
| } |
| connected = false; |
| int res = SPI_finish(); |
| elog(DEBUG5, "finish SPI %s, res %d, ret %d", src, res, ret); |
| } |
| /* Clean up in case of error. */ |
| PG_CATCH(); |
| { |
| if (connected) |
| { |
| int res = SPI_finish(); |
| elog(DEBUG5, "finish SPI %s after error, res %d, ret %d", src, res, ret); |
| } |
| |
| /* Carry on with error handling. */ |
| PG_RE_THROW(); |
| } |
| PG_END_TRY(); |
| } |
| |
| /** |
| * A callback function for use with spiExecuteWithCallback. Asserts that exactly one row was returned. |
| * Gets the row's first column as a float, using 0.0 if the value is null |
| */ |
| void spiCallback_getSingleResultRowColumnAsFloat4(void *clientData) |
| { |
| Datum datum_f; |
| bool isnull = false; |
| float4 *out = (float4*) clientData; |
| |
| Assert(SPI_tuptable != NULL); /* must have result */ |
| Assert(SPI_processed == 1); /* we expect only one tuple. */ |
| Assert(SPI_tuptable->tupdesc->attrs[0]->atttypid == FLOAT4OID); /* must be float4 */ |
| |
| datum_f = heap_getattr(SPI_tuptable->vals[0], 1, SPI_tuptable->tupdesc, &isnull); |
| |
| if (isnull) |
| { |
| *out = 0.0; |
| } |
| else |
| { |
| *out = DatumGetFloat4(datum_f); |
| } |
| } |
| |
| /** |
| * A callback function for use with spiExecuteWithCallback. Copies the SPI_processed value into |
| * *clientDataOut, treating it as a float4 pointer. |
| */ |
| void spiCallback_getProcessedAsFloat4(void *clientData) |
| { |
| float4 *out = (float4*) clientData; |
| *out = (float4)SPI_processed; |
| } |
| |
| /** |
| * A callback function for use with spiExecuteWithCallback. Reads each column of output into an array |
| * The number of arrays, the memory context for them, and the output location are determined by |
| * treating *clientData as a EachResultColumnAsArraySpec and using the values there |
| */ |
| void spiCallback_getEachResultColumnAsArray(void *clientData) |
| { |
| EachResultColumnAsArraySpec * spec = (EachResultColumnAsArraySpec*) clientData; |
| int i; |
| ArrayType ** out = spec->output; |
| |
| Assert(SPI_tuptable != NULL); |
| Assert(SPI_tuptable->tupdesc); |
| |
| for ( i = 1; i <= spec->numColumns; i++ ) |
| { |
| *out = SPIResultToArray(i, spec->memoryContext); |
| Assert(*out); |
| out++; |
| } |
| } |
| |
| /** |
| * This method builds a sampled version of the given relation. The sampled |
| * version is created in a temp namespace. Note that ANALYZE can be |
| * executed only by the database owner. It is safe to assume that the database |
| * owner has permissions to create temp tables. The sampling is done by |
| * using the random() built-in function. |
| * |
| * TODO: Once tablesample becomes available, rewrite this function to utilize tablesample. |
| * |
| * Input: |
| * relationOid - relation to be sampled |
| * sampleTableName - sample table name, moderately unique |
| * lAttributeNames - attributes to be included in the sample |
| * relTuples - estimated size of relation |
| * requestedSampleSize - as determined by attribute statistics requirements. |
| * sampleTableRelTuples - limit on size of the sample. |
| * Output: |
| * sampleTableRelTuples - number of tuples in the sample table created. |
| */ |
| static Oid buildSampleTable(Oid relationOid, |
| char* sampleTableName, |
| List *lAttributeNames, |
| float4 relTuples, |
| float4 requestedSampleSize, |
| float4 *sampleTableRelTuples) |
| { |
| int nAttributes = -1; |
| StringInfoData str; |
| int i = 0; |
| ListCell *le = NULL; |
| const char *schemaName = NULL; |
| const char *tableName = NULL; |
| char *sampleSchemaName = pstrdup("pg_temp"); |
| Oid sampleTableOid = InvalidOid; |
| float4 randomThreshold = 0.0; |
| RangeVar *rangeVar = NULL; |
| |
| Assert(requestedSampleSize > 0.0); |
| Assert(relTuples > 0.0); |
| |
| randomThreshold = requestedSampleSize / relTuples; |
| |
| schemaName = get_namespace_name(get_rel_namespace(relationOid)); /* must be pfreed */ |
| tableName = get_rel_name(relationOid); /* must be pfreed */ |
| |
| initStringInfo(&str); |
| |
| appendStringInfo(&str, "create table %s.%s as (select ", |
| quote_identifier(sampleSchemaName), quote_identifier(sampleTableName)); |
| |
| nAttributes = list_length(lAttributeNames); |
| |
| foreach_with_count(le, lAttributeNames, i) |
| { |
| appendStringInfo(&str, "Ta.%s", quote_identifier((const char *) lfirst(le))); |
| if (i < nAttributes - 1) |
| { |
| appendStringInfo(&str, ", "); |
| } |
| else |
| { |
| appendStringInfo(&str, " "); |
| } |
| } |
| |
| /* if table is partitioned, we create a sample over all parts */ |
| appendStringInfo(&str, "from %s.%s as Ta where random() < %.38f limit %lu) distributed randomly", |
| quote_identifier(schemaName), |
| quote_identifier(tableName), randomThreshold, (unsigned long) requestedSampleSize); |
| |
| spiExecuteWithCallback(str.data, false /*readonly*/, 0 /*tcount */, |
| spiCallback_getProcessedAsFloat4, sampleTableRelTuples); |
| |
| pfree(str.data); |
| |
| elog(elevel, "Created sample table %s.%s with nrows=%.0f", |
| quote_identifier(sampleSchemaName), |
| quote_identifier(sampleTableName), |
| *sampleTableRelTuples); |
| |
| rangeVar = makeRangeVar(NULL /*catalogname*/, sampleSchemaName, sampleTableName, -1); |
| sampleTableOid = RangeVarGetRelid(rangeVar, true, false /*allowHcatalog*/); |
| |
| Assert(sampleTableOid != InvalidOid); |
| |
| /** |
| * MPP-10723: Very rarely, we may be unlucky and generate an empty sample table. We error out in this case rather than |
| * generate bad statistics. |
| */ |
| |
| if (*sampleTableRelTuples < 1.0) |
| { |
| elog(ERROR, "ANALYZE unable to generate accurate statistics on table %s.%s. Try lowering gp_analyze_relative_error", |
| quote_identifier(schemaName), |
| quote_identifier(tableName)); |
| } |
| |
| pfree((void *) rangeVar); |
| pfree((void *) tableName); |
| pfree((void *) schemaName); |
| pfree((void *) sampleSchemaName); |
| return sampleTableOid; |
| } |
| |
| /** |
| * Drops the sample table created during ANALYZE. |
| */ |
| void dropSampleTable(Oid sampleTableOid, bool isExternal) |
| { |
| StringInfoData str; |
| const char *sampleSchemaName = NULL; |
| const char *sampleTableName = NULL; |
| |
| sampleSchemaName = get_namespace_name(get_rel_namespace(sampleTableOid)); /* must be pfreed */ |
| sampleTableName = get_rel_name(sampleTableOid); /* must be pfreed */ |
| |
| initStringInfo(&str); |
| appendStringInfo(&str, "drop %stable %s.%s", |
| isExternal ? "external " : "", |
| quote_identifier(sampleSchemaName), |
| quote_identifier(sampleTableName)); |
| |
| spiExecuteWithCallback(str.data, false /*readonly*/, 0 /*tcount */, |
| NULL, NULL); |
| |
| pfree(str.data); |
| pfree((void *)sampleSchemaName); |
| pfree((void *)sampleTableName); |
| } |
| |
| /** |
| * This method determines the number of pages corresponding to an index. |
| * Input: |
| * relationOid - relation being analyzed |
| * indexOid - index whose size is to be determined |
| * Output: |
| * indexPages - number of pages in the index |
| */ |
| static void analyzeEstimateIndexpages(Oid relationOid, Oid indexOid, float4 *indexPages) |
| { |
| *indexPages = 0; |
| |
| Relation rel = try_relation_open(indexOid, AccessShareLock, false); |
| |
| if (rel != NULL) |
| { |
| Assert(rel->rd_rel->relkind == RELKIND_INDEX); |
| |
| *indexPages = RelationGetNumberOfBlocks(rel); |
| |
| relation_close(rel, AccessShareLock); |
| } |
| return; |
| } |
| |
| /** |
| * This method estimates reltuples/relpages for a relation. To do this, it employs |
| * the built-in function 'gp_statistics_estimate_reltuples_relpages'. If the table to be |
| * analyzed is a system table, then it calculates statistics only using the master. |
| * Input: |
| * relationOid - relation's Oid |
| * Output: |
| * relTuples - estimated number of tuples |
| * relPages - estimated number of pages |
| */ |
| static void analyzeEstimateReltuplesRelpages(Oid relationOid, float4 *relTuples, float4 *relPages, bool rootonly) |
| { |
| *relPages = 0.0; |
| *relTuples = 0.0; |
| |
| List *allRelOids = NIL; |
| |
| /* if GUC optimizer_analyze_root_partition is off, we do not analyze root partitions, unless |
| * using the 'ANALYZE ROOTPARITION tablename' command. |
| * This is done by estimating the reltuples to be 0 and thus bypass the actual analyze. |
| * See MPP-21427. |
| * For mid-level parititions, we aggregate the reltuples and relpages from all leaf children beneath. |
| */ |
| if (rel_part_status(relationOid) == PART_STATUS_INTERIOR || |
| (rel_is_partitioned(relationOid) && (optimizer_analyze_root_partition || rootonly))) |
| { |
| allRelOids = rel_get_leaf_children_relids(relationOid); |
| } |
| else |
| { |
| allRelOids = list_make1_oid(relationOid); |
| } |
| |
| /* iterate over all parts and add up estimates */ |
| ListCell *lc = NULL; |
| float4 partRelPages = 0.0; |
| float4 partRelTuples = 0.0; |
| foreach (lc, allRelOids) |
| { |
| Oid singleOid = lfirst_oid(lc); |
| Relation rel = try_relation_open(singleOid, AccessShareLock, false); |
| |
| if (rel != NULL ) { |
| Assert(rel->rd_rel->relkind == RELKIND_RELATION); |
| if (RelationIsHeap(rel)) |
| { |
| /* |
| * in gpsql, all heap table should not be distributed. |
| */ |
| gp_statistics_estimate_reltuples_relpages_heap(rel, &partRelTuples, |
| &partRelPages); |
| } |
| else if (RelationIsAoRows(rel)) |
| { |
| gp_statistics_estimate_reltuples_relpages_ao_rows(rel, &partRelTuples, |
| &partRelPages); |
| } |
| else if (RelationIsParquet(rel)){ |
| gp_statistics_estimate_reltuples_relpages_parquet(rel, &partRelTuples, |
| &partRelPages); |
| } |
| else if( RelationIsExternal(rel)) |
| { |
| gp_statistics_estimate_reltuples_relpages_external(rel, &partRelTuples, |
| &partRelPages); |
| } |
| |
| relation_close(rel, AccessShareLock); |
| *relPages += partRelPages; |
| *relTuples += partRelTuples; |
| } |
| } |
| |
| } |
| |
| /** |
| * This method updates reltuples, relpages in the pg_class entry |
| * for the specified table. |
| * Input: |
| * relationOid - relation whose entry must be changed |
| * reltuples - number of tuples as estimated during ANALYZE |
| * relpages - number of pages as estimated during ANALYZE |
| */ |
| static void updateReltuplesRelpagesInCatalog(Oid relationOid, float4 relTuples, float4 relPages) |
| { |
| Relation pgclass = NULL; |
| HeapTuple tuple = NULL; |
| Form_pg_class pgcform = NULL; |
| bool dirty = false; |
| cqContext cqc; |
| cqContext *pcqCtx; |
| |
| Assert(relationOid != InvalidOid); |
| Assert(relTuples > -1.0); |
| Assert(relPages > -1.0); |
| |
| /* |
| * We need a way to distinguish these 2 cases: |
| * a) ANALYZEd table is empty |
| * b) Table has never been ANALYZEd |
| * To do this, in case (a), we set relPages = 1. For case (b), relPages = 0. |
| */ |
| if (relPages < 1.0) |
| { |
| Assert(relTuples < 1.0); |
| relPages = 1.0; |
| } |
| |
| /* |
| * update number of tuples and number of pages in pg_class |
| */ |
| pgclass = heap_open(RelationRelationId, RowExclusiveLock); |
| |
| pcqCtx = caql_addrel(cqclr(&cqc), pgclass); |
| |
| /* Fetch a copy of the tuple to scribble on */ |
| tuple = caql_getfirst( |
| pcqCtx, |
| cql("SELECT * FROM pg_class " |
| " WHERE oid = :1 " |
| " FOR UPDATE ", |
| ObjectIdGetDatum(relationOid))); |
| |
| /* We have locked the relation. We should not have trouble finding pg_class tuple */ |
| Assert(HeapTupleIsValid(tuple)); |
| pgcform = (Form_pg_class) GETSTRUCT(tuple); |
| |
| /* Apply required updates, if any, to copied tuple */ |
| |
| if (pgcform->relpages != (int32) relPages) |
| { |
| pgcform->relpages = (int32) relPages; |
| dirty = true; |
| } |
| if (pgcform->reltuples != (float4) relTuples) |
| { |
| pgcform->reltuples = (float4) relTuples; |
| dirty = true; |
| } |
| |
| elog(DEBUG3, "ANALYZE oid=%u pages=%d tuples=%f", |
| relationOid, pgcform->relpages, pgcform->reltuples); |
| |
| /* |
| * If anything changed, write out the tuple. |
| */ |
| if (dirty) |
| { |
| heap_inplace_update(pgclass, tuple); |
| /* the above sends a cache inval message */ |
| } |
| |
| heap_close(pgclass, RowExclusiveLock); |
| } |
| |
| /** |
| * Does this attribute have a total ordering defined i.e. does it have "<", "=" |
| * and is the attribute hashable? |
| * Input: |
| * relationOid - relation's Oid |
| * attributeName - attribute in question |
| * Output: |
| * true if operators are defined and the attribute type is hashable, false otherwise. |
| */ |
| static bool isOrderedAndHashable(Oid relationOid, const char *attributeName) |
| { |
| HeapTuple attributeTuple = NULL; |
| Form_pg_attribute attribute = NULL; |
| Operator equalityOperator = NULL; |
| Operator ltOperator = NULL; |
| bool result = true; |
| cqContext *pcqCtx; |
| |
| pcqCtx = caql_getattname_scan(NULL, relationOid, attributeName); |
| |
| attributeTuple = caql_get_current(pcqCtx); |
| |
| Assert(HeapTupleIsValid(attributeTuple)); |
| attribute = (Form_pg_attribute) GETSTRUCT(attributeTuple); |
| |
| /* Does type have "=" operator */ |
| equalityOperator = equality_oper(attribute->atttypid, true); |
| if (!equalityOperator) |
| result = false; |
| else |
| ReleaseOperator(equalityOperator); |
| |
| /* Does type have "<" operator */ |
| ltOperator = ordering_oper(attribute->atttypid, true); |
| if (!ltOperator) |
| result = false; |
| else |
| ReleaseOperator(ltOperator); |
| |
| /* Is the attribute hashable?*/ |
| if (!isGreenplumDbHashable(attribute->atttypid)) |
| result = false; |
| |
| caql_endscan(pcqCtx); |
| |
| return result; |
| } |
| /** |
| * Does attribute have "max" aggregate function defined? We can compute histogram |
| * only if this function is defined. |
| * Input: |
| * relationOid - relation's Oid |
| * attributeName - attribute in question |
| * Output: |
| * true if "max" function is defined, false otherwise. |
| */ |
| static bool hasMaxDefined(Oid relationOid, const char *attributeName) |
| { |
| HeapTuple attributeTuple = NULL; |
| Form_pg_attribute attribute = NULL; |
| Oid maxAggregateFunction = InvalidOid; |
| bool result = true; |
| List *funcNames = NIL; |
| cqContext *pcqCtx; |
| |
| pcqCtx = caql_getattname_scan(NULL, relationOid, attributeName); |
| |
| attributeTuple = caql_get_current(pcqCtx); |
| |
| Assert(HeapTupleIsValid(attributeTuple)); |
| attribute = (Form_pg_attribute) GETSTRUCT(attributeTuple); |
| |
| /* Does type have "max" operator */ |
| funcNames = list_make1(makeString("max")); |
| maxAggregateFunction = LookupFuncName(funcNames, 1 /* nargs to function */, |
| &attribute->atttypid, true); |
| if (!OidIsValid(maxAggregateFunction)) |
| result = false; |
| |
| caql_endscan(pcqCtx); |
| return result; |
| } |
| |
| |
| /** |
| * Is the attribute of type boolean? |
| * Input: |
| * relationOid - relation's Oid |
| * attributeName - attribute in question |
| * Output: |
| * true or false |
| */ |
| static bool isBoolType(Oid relationOid, const char *attributeName) |
| { |
| HeapTuple attributeTuple = NULL; |
| Form_pg_attribute attribute = NULL; |
| bool isBool = false; |
| cqContext *pcqCtx; |
| |
| pcqCtx = caql_getattname_scan(NULL, relationOid, attributeName); |
| |
| attributeTuple = caql_get_current(pcqCtx); |
| |
| Assert(HeapTupleIsValid(attributeTuple)); |
| attribute = (Form_pg_attribute) GETSTRUCT(attributeTuple); |
| isBool = (attribute->atttypid == BOOLOID); |
| |
| caql_endscan(pcqCtx); |
| return isBool; |
| } |
| |
| /** |
| * Compute absolute number of distinct values in the sample table. This method constructs |
| * a SQL string using the sampleTableOid and issues an SPI query. |
| * Input: |
| * sampleTableOid - sample table's oid |
| * sampleTableRelTuples - number of tuples in the sample table (this may be estimated or actual) |
| * attributeName - attribute in question |
| * Output: |
| * number of distinct values of attribute |
| */ |
| static float4 analyzeComputeNDistinctAbsolute(Oid sampleTableOid, |
| float4 sampleTableRelTuples, |
| const char *attributeName) |
| { |
| |
| StringInfoData str; |
| float4 ndistinct = -1.0; |
| |
| const char *sampleSchemaName = NULL; |
| const char *sampleTableName = NULL; |
| |
| sampleSchemaName = get_namespace_name(get_rel_namespace(sampleTableOid)); /* must be pfreed */ |
| sampleTableName = get_rel_name(sampleTableOid); /* must be pfreed */ |
| |
| initStringInfo(&str); |
| appendStringInfo(&str, "select count(*)::float4 from (select Ta.%s from %s.%s as Ta group by Ta.%s) as Tb", |
| quote_identifier(attributeName), |
| quote_identifier(sampleSchemaName), |
| quote_identifier(sampleTableName), |
| quote_identifier(attributeName)); |
| |
| spiExecuteWithCallback(str.data, false /*readonly*/, 0 /*tcount */, |
| spiCallback_getSingleResultRowColumnAsFloat4, &ndistinct); |
| |
| pfree(str.data); |
| pfree((void *) sampleTableName); |
| pfree((void *) sampleSchemaName); |
| |
| elog(elevel, "count(ndistinct()) gives %f values.", ndistinct); |
| |
| return ndistinct; |
| } |
| |
| /** |
| * Compute the number of repeating values in a relation. |
| * Input: |
| * relationOid - relation's oid |
| * attributeName - attribute |
| * Output: |
| * number of values that repeat i.e. their frequency is greater than 1. |
| * |
| * TODO: this query is very similar to MCV query. Can we do some optimization here? |
| */ |
| static float4 analyzeComputeNRepeating(Oid relationOid, |
| const char *attributeName) |
| { |
| StringInfoData str; |
| float4 nRepeating = -1.0; |
| |
| const char *sampleSchemaName = NULL; |
| const char *sampleTableName = NULL; |
| |
| sampleSchemaName = get_namespace_name(get_rel_namespace(relationOid)); /* must be pfreed */ |
| sampleTableName = get_rel_name(relationOid); /* must be pfreed */ |
| |
| initStringInfo(&str); |
| appendStringInfo(&str, "select count(v)::float4 from (select Ta.%s as v, count(Ta.%s) as f from %s.%s as Ta group by Ta.%s) as foo where f > 1", |
| quote_identifier(attributeName), |
| quote_identifier(attributeName), |
| quote_identifier(sampleSchemaName), |
| quote_identifier(sampleTableName), |
| quote_identifier(attributeName)); |
| |
| spiExecuteWithCallback(str.data, false /*readonly*/, 0 /*tcount */, |
| spiCallback_getSingleResultRowColumnAsFloat4, &nRepeating); |
| |
| pfree(str.data); |
| pfree((void *) sampleTableName); |
| pfree((void *) sampleSchemaName); |
| |
| return nRepeating; |
| } |
| |
| /** |
| * This routine creates an array from one of the result attributes after an SPI call. |
| * It allocates this array in the specified context. Note that, in general, the allocation |
| * context must not the SPI context because that is likely to get cleaned out soon. |
| * |
| * Input: |
| * resultAttributeNumber - attribute to flatten into an array (1 based) |
| * Output: |
| * array of attribute type |
| */ |
| ArrayType * SPIResultToArray(int resultAttributeNumber, MemoryContext allocationContext) |
| { |
| ArrayType *result = NULL; |
| int i = 0; |
| Oid typOutput = InvalidOid; |
| bool isVarLena = false; |
| MemoryContext callerContext; |
| Form_pg_attribute attribute = SPI_tuptable->tupdesc->attrs[resultAttributeNumber - 1]; |
| |
| Assert(attribute); |
| |
| callerContext = MemoryContextSwitchTo(allocationContext); |
| result = construct_empty_array(attribute->atttypid); |
| |
| /** |
| * We should not need to detoast the type. |
| */ |
| getTypeOutputInfo(attribute->atttypid, &typOutput, &isVarLena); |
| |
| for (i=0;i<SPI_processed;i++) |
| { |
| Datum dValue = 0; |
| bool isnull = false; |
| int index = 0; |
| Datum deToastedValue = 0; |
| |
| dValue = heap_getattr(SPI_tuptable->vals[i], resultAttributeNumber, SPI_tuptable->tupdesc, &isnull); |
| |
| if (!isnull) |
| { |
| if (isVarLena) |
| { |
| deToastedValue = PointerGetDatum(PG_DETOAST_DATUM(dValue)); |
| } |
| else |
| { |
| deToastedValue = dValue; |
| } |
| |
| /** |
| * Add this value to the result array. |
| */ |
| index = i+1; /* array indices are 1 based */ |
| result = array_set(result, 1 /* nSubscripts */, &index, deToastedValue, isnull, |
| -1 /* arraytyplen */, |
| attribute->attlen, |
| attribute->attbyval, |
| attribute->attalign); |
| |
| if (deToastedValue != dValue) |
| { |
| pfree(DatumGetPointer(deToastedValue)); |
| } |
| } |
| } |
| MemoryContextSwitchTo(callerContext); |
| return result; |
| } |
| |
| /** |
| * This method determines if a particular attribute has the "not null" property. |
| * Input: |
| * relationOid - relation's oid |
| * attributeName |
| * Output: |
| * true if attribute is not-null. false otherwise. |
| */ |
| |
| static bool isNotNull(Oid relationOid, const char *attributeName) |
| { |
| HeapTuple attributeTuple = NULL; |
| Form_pg_attribute attribute = NULL; |
| bool nonNull = false; |
| cqContext *pcqCtx; |
| |
| pcqCtx = caql_getattname_scan(NULL, relationOid, attributeName); |
| |
| attributeTuple = caql_get_current(pcqCtx); |
| |
| Assert(HeapTupleIsValid(attributeTuple)); |
| attribute = (Form_pg_attribute) GETSTRUCT(attributeTuple); |
| nonNull = attribute->attnotnull; |
| |
| caql_endscan(pcqCtx); |
| |
| return (nonNull); |
| } |
| |
| /** |
| * Computes the absolute number of NULLs in the sample table. |
| * Input: |
| * sampleTableOid - sample table to query on |
| * relationOid - original table |
| * attributeName - attribute in question |
| * mergeStats - for root or interior partition, whether to merge stats from leaf children partitions |
| * Output: |
| * Number of NULLs |
| */ |
| static float4 analyzeNullCount(Oid sampleTableOid, Oid relationOid, const char *attributeName, bool mergeStats) |
| { |
| StringInfoData str; |
| float4 nullcount = 0.0; |
| |
| if (mergeStats) |
| { |
| /* To merge stats, we compute the null fraction of this column |
| * by taking the average of null fractions of the leaf children partitions. |
| */ |
| StringInfo str2 = getStringLeafPartitionOids(relationOid); |
| initStringInfo(&str); |
| appendStringInfo(&str, "select sum(stanullfrac*reltuples)::float4 from pg_statistic, pg_class where " |
| "pg_class.oid = pg_statistic.starelid and starelid %s and staattnum = %d", |
| str2->data, get_attnum(relationOid, attributeName)); |
| |
| spiExecuteWithCallback(str.data, false /*readonly*/, 0 /*tcount */, |
| spiCallback_getSingleResultRowColumnAsFloat4, &nullcount); |
| pfree(str2->data); |
| } |
| |
| else |
| { |
| const char *schemaName = NULL; |
| const char *tableName = NULL; |
| |
| schemaName = get_namespace_name(get_rel_namespace(sampleTableOid)); /* must be pfreed */ |
| tableName = get_rel_name(sampleTableOid); /* must be pfreed */ |
| |
| initStringInfo(&str); |
| appendStringInfo(&str, "select count(*)::float4 from %s.%s as Ta where Ta.%s is null", |
| quote_identifier(schemaName), |
| quote_identifier(tableName), |
| quote_identifier(attributeName)); |
| |
| spiExecuteWithCallback(str.data, false /*readonly*/, 0 /*tcount */, |
| spiCallback_getSingleResultRowColumnAsFloat4, &nullcount); |
| pfree((void *) tableName); |
| pfree((void *) schemaName); |
| } |
| |
| pfree(str.data); |
| |
| return nullcount; |
| } |
| |
| /** |
| * Is the attribute fixed-width? If so, this method extracts the width of the attribute. |
| * Input: |
| * relationOid - relation's oid |
| * attributeName - name of attribute |
| * Output: |
| * returns true if fixed width |
| * width - must be non-null. sets width if it returns true. |
| */ |
| |
| static bool isFixedWidth(Oid relationOid, const char *attributeName, float4 *width) |
| { |
| HeapTuple attributeTuple = NULL; |
| Form_pg_attribute attribute = NULL; |
| float4 avgwidth = 0.0; |
| cqContext *pcqCtx; |
| |
| Assert(width); |
| |
| pcqCtx = caql_getattname_scan(NULL, relationOid, attributeName); |
| |
| attributeTuple = caql_get_current(pcqCtx); |
| |
| Assert(HeapTupleIsValid(attributeTuple)); |
| attribute = (Form_pg_attribute) GETSTRUCT(attributeTuple); |
| avgwidth = get_typlen(attribute->atttypid); |
| |
| caql_endscan(pcqCtx); |
| |
| if (avgwidth > 0) |
| { |
| *width = avgwidth; |
| return true; |
| } |
| else |
| { |
| return false; |
| } |
| } |
| |
| /** |
| * Computes average width of an attribute. This uses the built-in pg_column_size. |
| * Input: |
| * sampleTableOid - relation on which to compute average width. |
| * relationOid - original table |
| * attributeName - name of attribute |
| * sampleTableRelTuples - size of relation |
| * mergeStats - for root or interior partition, whether to merge stats from leaf children partitions |
| * Output: |
| * Average width of attribute. |
| */ |
| static float4 analyzeComputeAverageWidth(Oid sampleTableOid, |
| Oid relationOid, |
| const char *attributeName, |
| float4 relTuples, |
| bool mergeStats) |
| { |
| StringInfoData str; |
| float4 avgwidth = 0.0; |
| |
| if (mergeStats) |
| { |
| /* To merge stats, we compute the average width of this column |
| * by taking the average of average width of the leaf children partitions. |
| */ |
| Assert(0.0 < relTuples); |
| StringInfo str2 = getStringLeafPartitionOids(relationOid); |
| initStringInfo(&str); |
| appendStringInfo(&str, "select (sum(stawidth*reltuples)/%f)::float4 from pg_statistic, pg_class where " |
| "pg_class.oid = pg_statistic.starelid and starelid %s and staattnum = %d", |
| relTuples, str2->data, get_attnum(relationOid, attributeName)); |
| |
| spiExecuteWithCallback(str.data, false /*readonly*/, 0 /*tcount*/, |
| spiCallback_getSingleResultRowColumnAsFloat4, &avgwidth); |
| pfree(str2->data); |
| } |
| |
| else |
| { |
| const char *sampleSchemaName = NULL; |
| const char *sampleTableName = NULL; |
| |
| sampleSchemaName = get_namespace_name(get_rel_namespace(sampleTableOid)); /* must be pfreed */ |
| sampleTableName = get_rel_name(sampleTableOid); /* must be pfreed */ |
| |
| initStringInfo(&str); |
| appendStringInfo(&str, "select avg(pg_column_size(Ta.%s))::float4 from %s.%s as Ta where Ta.%s is not null", |
| quote_identifier(attributeName), |
| quote_identifier(sampleSchemaName), |
| quote_identifier(sampleTableName), |
| quote_identifier(attributeName)); |
| |
| spiExecuteWithCallback(str.data, false /*readonly*/, 0 /*tcount */, |
| spiCallback_getSingleResultRowColumnAsFloat4, &avgwidth); |
| |
| pfree((void *) sampleTableName); |
| pfree((void *) sampleSchemaName); |
| } |
| |
| pfree(str.data); |
| |
| return avgwidth; |
| } |
| |
| /** |
| * Computes the most common values and their frequencies for relation. |
| * Input: |
| * relationOid - relation's oid |
| * sampleTableOid - oid of sample table |
| * attributeName - attribute in question |
| * relTuples - number of rows in relation |
| * nEntries - number of MCV entries |
| * mergeStats - for root or interior partition, whether to merge stats from leaf children partitions |
| * Output: |
| * mcv - array of most common values (type is the same as attribute type) |
| * freq - array of float4's |
| */ |
| static void analyzeComputeMCV(Oid relationOid, |
| Oid sampleTableOid, |
| const char *attributeName, |
| float4 relTuples, |
| unsigned int nEntries, |
| bool mergeStats, |
| ArrayType **mcv, |
| ArrayType **freq) |
| { |
| if (mergeStats) |
| { |
| /* for partitioned table, we aggregate the MCVs/Freqs of the leaf partitions |
| to compute MCVs/Freqs of the interior or root partition */ |
| ArrayType *result[2]; |
| AttrNumber attnum = get_attnum(relationOid, attributeName); |
| aggregate_leaf_partition_MCVs(relationOid, attnum, nEntries, result); |
| |
| *mcv = result[0]; |
| *freq = result[1]; |
| } |
| else |
| { |
| StringInfoData str; |
| const char *sampleSchemaName = NULL; |
| const char *sampleTableName = NULL; |
| ArrayType *spiResult[2]; |
| EachResultColumnAsArraySpec spec; |
| |
| Assert(relTuples > 0.0); |
| Assert(nEntries > 0); |
| |
| sampleSchemaName = get_namespace_name(get_rel_namespace(sampleTableOid)); /* must be pfreed */ |
| sampleTableName = get_rel_name(sampleTableOid); /* must be pfreed */ |
| |
| initStringInfo(&str); |
| appendStringInfo(&str, "select Ta.%s as v, count(Ta.%s)::float4/%f::float4 as f from %s.%s as Ta " |
| "where Ta.%s is not null group by (Ta.%s) order by f desc limit %u", |
| quote_identifier(attributeName), |
| quote_identifier(attributeName), |
| relTuples, |
| quote_identifier(sampleSchemaName), |
| quote_identifier(sampleTableName), |
| quote_identifier(attributeName), |
| quote_identifier(attributeName), |
| nEntries); |
| |
| |
| spec.numColumns = 2; |
| spec.output = spiResult; |
| spec.memoryContext = CurrentMemoryContext; |
| spiExecuteWithCallback(str.data, false /*readonly*/, 0 /*tcount */, |
| spiCallback_getEachResultColumnAsArray, &spec); |
| |
| *mcv = spiResult[0]; /* first attribute of result are mcvs */ |
| *freq = spiResult[1]; /* second attribute of result are freqs */ |
| |
| pfree(str.data); |
| pfree((void *) sampleTableName); |
| pfree((void *) sampleSchemaName); |
| } |
| |
| return; |
| } |
| |
| /** |
| * Compute histogram entries for relation. |
| * Input: |
| * relationOid - relation |
| * sampleTableOid - oid of sample table |
| * attributeName - attribute |
| * relTuples - estimated number of tuples in relation |
| * nEntries - number of histogram entries requested |
| * mergeStats - for root or interior partition, whether to merge stats from leaf children partitions |
| * mcv - most common values calculated. these must be excluded before calculating |
| * histogram. this may be null. |
| * Output: |
| * hist - array of values representing a histogram. This histogram contains no repeating values. |
| */ |
| static void analyzeComputeHistogram(Oid relationOid, |
| Oid sampleTableOid, |
| const char *attributeName, |
| float4 relTuples, |
| unsigned int nEntries, |
| bool mergeStats, |
| ArrayType *mcv, |
| ArrayType **hist) |
| { |
| if (mergeStats) |
| { |
| /* for partitioned table, we aggregate the histogram of the leaf partitions |
| to compute histogram of the interior or root partition */ |
| ArrayType *result = NULL; |
| AttrNumber attnum = get_attnum(relationOid, attributeName); |
| aggregate_leaf_partition_histograms(relationOid, attnum, nEntries, &result); |
| |
| *hist = result; |
| } |
| else |
| { |
| StringInfoData str; |
| unsigned int bucketSize = 0; |
| EachResultColumnAsArraySpec spec; |
| StringInfoData whereClause; |
| |
| Assert(relTuples > 0.0); |
| Assert(nEntries > 0); |
| |
| bucketSize = relTuples / nEntries; |
| |
| if (bucketSize <= 1) /* histogram will be empty if bucketSize <= 1 */ |
| { |
| *hist = NULL; |
| return; |
| } |
| |
| const char *sampleSchemaName = NULL; |
| const char *sampleTableName = NULL; |
| sampleSchemaName = get_namespace_name(get_rel_namespace(sampleTableOid)); /* must be pfreed */ |
| sampleTableName = get_rel_name(sampleTableOid); /* must be pfreed */ |
| |
| initStringInfo(&str); |
| initStringInfo(&whereClause); |
| appendStringInfo(&whereClause, "%s is not null", quote_identifier(attributeName)); |
| |
| /** |
| * This query orders the table by rank on the value and chooses values that occur every |
| * 'bucketSize' interval. It also chooses the maximum value from the relation. It then |
| * removes duplicate values and orders the values in ascending order. This corresponds to |
| * the histogram. |
| */ |
| |
| appendStringInfo(&str, "select v from (" |
| "select Ta.%s as v, row_number() over (order by Ta.%s) as r from %s.%s as Ta where %s " |
| "union select max(Tb.%s) as v, 1 as r from %s.%s as Tb where %s) as foo " |
| "where r %% %u = 1 group by v order by v", |
| quote_identifier(attributeName), |
| quote_identifier(attributeName), |
| quote_identifier(sampleSchemaName), |
| quote_identifier(sampleTableName), |
| whereClause.data, |
| quote_identifier(attributeName), |
| quote_identifier(sampleSchemaName), |
| quote_identifier(sampleTableName), |
| whereClause.data, |
| bucketSize); |
| |
| spec.numColumns = 1; |
| spec.output = hist; |
| spec.memoryContext = CurrentMemoryContext; |
| |
| spiExecuteWithCallback(str.data, false /*readonly*/, 0 /*tcount */, |
| spiCallback_getEachResultColumnAsArray, &spec); |
| pfree(str.data); |
| pfree((void *) sampleTableName); |
| pfree((void *) sampleSchemaName); |
| } |
| |
| return; |
| } |
| |
| /** |
| * Compute NDistinct of a column using sample table. |
| */ |
| static void analyzeComputeNDistinctBySample |
| ( |
| Oid relationOid, |
| float4 relTuples, |
| Oid sampleTableOid, |
| float4 sampleTableRelTuples, |
| const char *attributeName, |
| bool *computeMCV, |
| bool *computeHist, |
| AttributeStatistics *stats |
| ) |
| { |
| float4 nDistinctAbsolute = analyzeComputeNDistinctAbsolute(sampleTableOid, sampleTableRelTuples, attributeName); |
| |
| if (nDistinctAbsolute < 1.0) |
| { |
| /* |
| * This can happen if no sample table was employed and all values happen |
| * to be null and we did not estimate relTuples accurately. |
| */ |
| Assert(sampleTableOid == relationOid); |
| stats->ndistinct = 0.0; |
| *computeMCV = false; |
| *computeHist = false; |
| } |
| else if (ceil(sampleTableRelTuples) == ceil(nDistinctAbsolute)) |
| { |
| /** |
| * Since all values are distinct, we assume that all values in the original |
| * relation are distinct. |
| */ |
| stats->ndistinct = -1.0; |
| |
| /* |
| * If there are many more distinct values than mcv buckets, |
| * we can ignore computing mcv values. |
| */ |
| if (nDistinctAbsolute > numberOfMCVEntries(relationOid, attributeName)) |
| *computeMCV = false; |
| } |
| else |
| { |
| /** |
| * Some values repeated - the number of repeating values is used |
| * to determine the total number of distinct values in the original relation. |
| */ |
| float4 nRepeatingAbsolute = analyzeComputeNRepeating(sampleTableOid, attributeName); |
| |
| /** |
| * MPP-10301. Note that some time has passed since nDistinct was computed. If the table being analyzed |
| * is a catalog table, it could have changed in this period. |
| */ |
| if (nRepeatingAbsolute > nDistinctAbsolute) |
| { |
| nRepeatingAbsolute = nDistinctAbsolute; |
| } |
| |
| /** |
| * It is possible that all values are distinct but it reaches here due to |
| * inaccurate sampleTableRelTuples (without sampling). |
| */ |
| if (0 == nRepeatingAbsolute) |
| { |
| stats->ndistinct = -1.0; |
| if (nDistinctAbsolute > numberOfMCVEntries(relationOid, attributeName)) |
| { |
| *computeMCV = false; |
| } |
| |
| return; |
| } |
| |
| if (ceil(nRepeatingAbsolute) == ceil(nDistinctAbsolute)) |
| { |
| /* All distinct values are in the sample. Assumption is that there exist no distinct values outside this world. */ |
| stats->ndistinct = nDistinctAbsolute; |
| } |
| else |
| { |
| /*---------- |
| * Estimate the number of distinct values using the estimator |
| * proposed by Haas and Stokes in IBM Research Report RJ 10025: |
| * n*d / (n - f1 + f1*n/N) |
| * where f1 is the number of distinct values that occurred |
| * exactly once in our sample of n rows (from a total of N), |
| * and d is the total number of distinct values in the sample. |
| * This is their Duj1 estimator; the other estimators they |
| * recommend are considerably more complex, and are numerically |
| * very unstable when n is much smaller than N. |
| */ |
| double onceInSample = nDistinctAbsolute - nRepeatingAbsolute; |
| double numer = sampleTableRelTuples * nDistinctAbsolute; |
| double denom = sampleTableRelTuples - onceInSample + onceInSample * (sampleTableRelTuples / relTuples); |
| stats->ndistinct = (float4) numer / denom; |
| |
| /* lower bound */ |
| if (stats->ndistinct < nDistinctAbsolute) |
| stats->ndistinct = nDistinctAbsolute; |
| } |
| } |
| } |
| |
| /* Aggregating ndistinct of leaf partitions to figure out the root or interior |
| * partition table ndistinct without actual data access. |
| * We assume that the leaf stats are already in the catalog. |
| * This approach has its limitation - we are trying to infer parent table ndistinct |
| * by observing the ndistinct of the largest leaf partition. |
| */ |
| static float4 |
| analyzeComputeNDistinctByLargestPartition |
| ( |
| Oid relationOid, |
| float4 relTuples, |
| const char *attributeName |
| ) |
| { |
| float4 result = 0; |
| AttrNumber attnum = get_attnum(relationOid, attributeName); |
| |
| /* For partition key in single-level single-partition key hierarchy, |
| * the values for each partition must be disjoint. Therefore we sum |
| * up the ndistinct from every partition. This is also true for the |
| * mid-level partitions that are one level above the leaf level. |
| */ |
| PartStatus ps = rel_part_status(relationOid); |
| bool oneLevelAboveLeaf = true; |
| List *lpart_keys = NIL; |
| |
| Assert(PART_STATUS_ROOT == ps || PART_STATUS_INTERIOR == ps); |
| |
| if (PART_STATUS_ROOT == ps) |
| { |
| lpart_keys = rel_partition_key_attrs(relationOid); |
| } |
| else /* PART_STATUS_INTERIOR == ps */ |
| { |
| List *lpart_keys_all_level = rel_partition_keys_ordered(rel_partition_get_master(relationOid)); |
| int nlevels = list_length(lpart_keys_all_level); /* total number of partitioning levels, each level has one or more partition keys */ |
| List *lpath_relids = rel_get_part_path1(relationOid); /* list of Oids starting from this node (including) and up to the root (excluding) */ |
| int nlevels_up = list_length(lpath_relids); /* number of levels above this node */ |
| lpart_keys = (List*) list_nth(lpart_keys_all_level, nlevels_up); /* partition key(s) of this level */ |
| if (nlevels != nlevels_up + 1) |
| { |
| oneLevelAboveLeaf = false; |
| } |
| } |
| |
| if (oneLevelAboveLeaf && 1 == list_length(lpart_keys) && attnum == linitial_int(lpart_keys)) |
| { |
| List *lRelOids = rel_get_leaf_children_relids(relationOid); |
| ListCell *lc = NULL; |
| foreach(lc, lRelOids) |
| { |
| Oid partOid = lfirst_oid(lc); |
| float4 partNDV = get_attdistinct(partOid, attnum); |
| if (partNDV < 0.0) |
| { |
| float4 partReltuples = get_rel_reltuples(partOid); |
| result += (-1.0) * partNDV * partReltuples; |
| } |
| else |
| { |
| result += partNDV; |
| } |
| } |
| |
| pfree(lpart_keys); |
| |
| return result; |
| } |
| |
| |
| /* For non-partition key column or composition partition keys, we try to reason with the |
| * ndistinct of the largest leaf partition. |
| * 1. if ndistinct<=-1.0 in the child partition, the column is a unique column in the child partition. We |
| * expect the column to remain distinct in the root as well. |
| * 2. if -1.0 < ndistinct < 0.0, the absolute number of ndistinct values in the child partition is a fraction |
| * of the number of rows in the partition. We expect that the absolute number of ndistinct in the root |
| * to stay the same. Therefore, we convert this to a positive number. |
| * 3. if ndistinct is positive, it indicates a small absolute number of distinct values. We expect these |
| * values to be repeated in all partitions. Therefore, we expect no change in the ndistinct in the root. |
| */ |
| |
| Oid largestLeafOid = get_largest_leaf_partition(relationOid); |
| float4 largestLeafNDV = get_attdistinct(largestLeafOid, attnum); |
| |
| if (largestLeafNDV <= -1.0) |
| { |
| result = largestLeafNDV; |
| } |
| else if (largestLeafNDV < 0.0) |
| { |
| result = (-1.0) * largestLeafNDV * get_rel_reltuples(largestLeafOid); |
| } |
| else |
| { |
| result = largestLeafNDV; |
| } |
| |
| pfree(lpart_keys); |
| |
| return result; |
| } |
| |
| /* Get the oid of the largest leaf partition of a partition table (root or interior). |
| * Assuming the reltuples for leaf-level partitions have already been populated. |
| * Cannot be called on a leaf-level partition. |
| */ |
| static Oid |
| get_largest_leaf_partition(Oid relid) |
| { |
| float4 largestReltuples = 0; |
| |
| List *lChildrenOid = rel_get_leaf_children_relids(relid); |
| Assert(lChildrenOid); |
| |
| ListCell *lc = NULL; |
| Oid oidLargestPart = linitial_oid(lChildrenOid); |
| |
| foreach(lc, lChildrenOid) |
| { |
| Oid currOid = lfirst_oid(lc); |
| float4 currReltuples = get_rel_reltuples(currOid); |
| if (currReltuples > largestReltuples) |
| { |
| oidLargestPart = currOid; |
| largestReltuples = currReltuples; |
| } |
| } |
| |
| pfree(lChildrenOid); |
| return oidLargestPart; |
| } |
| |
| |
| /** |
| * Helper function to create the IN clause used for calculating merged stats. |
| * StringInfo is allocated in the current memory context, |
| * so the caller should free it. |
| * Input: relationOid - oid of a root or interior partition |
| * Output: string "in (oid_1, oid_2, ..., oid_n)" where oid_i's are the oid of leaf children partitions |
| * of relationOid |
| */ |
| static StringInfo |
| getStringLeafPartitionOids(Oid relationOid) |
| { |
| StringInfo str = makeStringInfo(); |
| ListCell *le = NULL; |
| |
| List *lRelOids = rel_get_leaf_children_relids(relationOid); |
| |
| appendStringInfo(str, "in ("); |
| |
| for (le = list_head(lRelOids); le != list_tail(lRelOids); le = lnext(le)) |
| { |
| appendStringInfo(str, "%u, ", lfirst_oid(le)); |
| } |
| appendStringInfo(str, "%u)", lfirst_oid(le)); |
| |
| return str; |
| } |
| |
| |
| /** |
| * Responsible for computing statistics on relationOid. |
| * Input: |
| * relationOid - original relation on which statistics must be computed. |
| * attributeName - name of attribute |
| * relTuples - expected number of tuples in relation |
| * sampleTableOid - Oid of the sampled version of the table. It is possible that sampleTableOid == relationOid. |
| * sampleTableRelTuples - number of tuples in sampled version |
| * mergeStats - for root or interior partition, whether to merge stats of leaf children partitions |
| * Output: |
| * stats - structure containing nullfrac, avgwidth, ndistinct, mcv, hist |
| */ |
| static void analyzeComputeAttributeStatistics(Oid relationOid, |
| const char *attributeName, |
| float4 relTuples, |
| Oid sampleTableOid, |
| float4 sampleTableRelTuples, |
| bool mergeStats, |
| AttributeStatistics *stats) |
| { |
| bool computeWidth = true; |
| bool computeDistinct = true; |
| bool computeMCV = true; |
| bool computeHist = true; |
| |
| Assert(stats != NULL); |
| Assert(sampleTableRelTuples > 0.0); |
| |
| /* Default values */ |
| stats->ndistinct = -1.0; |
| stats->nullFraction = 0.0; |
| stats->avgWidth = 0.0; |
| stats->mcv = NULL; |
| stats->freq = NULL; |
| stats->hist = NULL; |
| |
| if (isNotNull(relationOid, attributeName)) |
| { |
| stats->nullFraction = 0.0; |
| } |
| else |
| { |
| float4 nullCount = analyzeNullCount(sampleTableOid, relationOid, attributeName, mergeStats); |
| if (ceil(nullCount) >= ceil(sampleTableRelTuples)) |
| { |
| /** |
| * All values are null. no point computing other statistics. |
| */ |
| computeWidth = false; |
| computeDistinct = false; |
| computeMCV = false; |
| computeHist = false; |
| } |
| stats->nullFraction = Min(nullCount / sampleTableRelTuples, 1.0); |
| } |
| elog(elevel, "nullfrac = %.6f", stats->nullFraction); |
| |
| if (computeWidth && !isFixedWidth(relationOid, attributeName, &stats->avgWidth)) |
| { |
| stats->avgWidth = analyzeComputeAverageWidth(sampleTableOid, relationOid, attributeName, relTuples, mergeStats); |
| } |
| |
| elog(elevel, "avgwidth = %f", stats->avgWidth); |
| |
| if (!isOrderedAndHashable(relationOid, attributeName)) |
| { |
| computeDistinct = false; |
| computeMCV = false; |
| computeHist = false; |
| } |
| |
| if (!hasMaxDefined(relationOid, attributeName)) |
| { |
| computeHist = false; |
| } |
| |
| if (isBoolType(relationOid, attributeName)) |
| { |
| stats->ndistinct = 2; |
| computeDistinct = false; |
| computeHist = false; |
| } |
| |
| if (stats->avgWidth > COLUMN_WIDTH_THRESHOLD) |
| { |
| /* Extremely wide columns are considered to be fully distinct. See comments |
| * against COLUMN_WIDTH_THRESHOLD */ |
| computeDistinct = false; |
| computeMCV = false; |
| computeHist = false; |
| } |
| |
| if (computeDistinct) |
| { |
| if (mergeStats) |
| { |
| stats->ndistinct = analyzeComputeNDistinctByLargestPartition(relationOid, relTuples, attributeName); |
| } |
| else |
| { |
| analyzeComputeNDistinctBySample(relationOid, relTuples, sampleTableOid, sampleTableRelTuples, |
| attributeName, &computeMCV, &computeHist, stats); |
| } |
| |
| /* upper bound */ |
| if (stats->ndistinct > relTuples) |
| { |
| stats->ndistinct = relTuples; |
| } |
| |
| /** |
| * Does ndistinct scale with reltuples? |
| */ |
| if (stats->ndistinct > relTuples * gp_statistics_ndistinct_scaling_ratio_threshold) |
| { |
| stats->ndistinct = -1.0 * stats->ndistinct / relTuples; |
| } |
| |
| } |
| elog(elevel, "ndistinct = %.6f", stats->ndistinct); |
| |
| if (computeMCV) |
| { |
| unsigned int nMCVEntries = numberOfMCVEntries(relationOid, attributeName); |
| analyzeComputeMCV(relationOid, sampleTableOid, attributeName, sampleTableRelTuples, |
| nMCVEntries, mergeStats, &stats->mcv, &stats->freq); |
| if (stats->mcv) |
| elog(elevel, "mcv=%s", OidOutputFunctionCall(751,PointerGetDatum(stats->mcv))); |
| if (stats->freq) |
| elog(elevel, "freq=%s", OidOutputFunctionCall(751,PointerGetDatum(stats->freq))); |
| } |
| |
| if (computeHist) |
| { |
| unsigned int nHistEntries = numberOfHistogramEntries(relationOid, attributeName); |
| analyzeComputeHistogram(relationOid, sampleTableOid, attributeName, sampleTableRelTuples, |
| nHistEntries, mergeStats, stats->mcv, &stats->hist); |
| if (stats->hist) |
| elog(elevel, "hist=%s", OidOutputFunctionCall(751,PointerGetDatum(stats->hist))); |
| } |
| } |
| |
| /** |
| * This method updates the pg_statistic tuple for the specific attribute. It populates |
| * it with the statistics computed by ANALYZE. |
| * Input: |
| * relationOid - relation |
| * attributeName - name of the attribute |
| * stats - struct containing all the computed statistics |
| * |
| */ |
| static void updateAttributeStatisticsInCatalog(Oid relationOid, const char *attributeName, AttributeStatistics *stats) |
| { |
| Datum values[Natts_pg_statistic]; |
| bool nulls[Natts_pg_statistic]; |
| bool replaces[Natts_pg_statistic]; |
| int2 attNum = -1; |
| int i = 0; |
| Oid equalityOid = InvalidOid; |
| Oid ltOid = InvalidOid; |
| |
| Assert(stats); |
| |
| { |
| /** |
| * For some reason, pg_statistic stores the "<" and "=" operator associated |
| * with the attribute. |
| */ |
| HeapTuple attributeTuple; |
| Form_pg_attribute attribute; |
| Operator equalityOperator; |
| Operator ltOperator; |
| cqContext *pcqCtx; |
| |
| pcqCtx = caql_getattname_scan(NULL, relationOid, attributeName); |
| |
| attributeTuple = caql_get_current(pcqCtx); |
| |
| Assert(HeapTupleIsValid(attributeTuple)); |
| attribute = (Form_pg_attribute) GETSTRUCT(attributeTuple); |
| attNum = attribute->attnum; |
| equalityOperator = equality_oper(attribute->atttypid, true); |
| |
| if (equalityOperator) |
| { |
| equalityOid = oprid(equalityOperator); |
| ReleaseOperator(equalityOperator); |
| } |
| |
| ltOperator = ordering_oper(attribute->atttypid, true); |
| |
| if (ltOperator) |
| { |
| ltOid = oprid(ltOperator); |
| ReleaseOperator(ltOperator); |
| } |
| caql_endscan(pcqCtx); |
| } |
| |
| for (i = 0; i < Natts_pg_statistic; ++i) |
| { |
| nulls[i] = false; |
| replaces[i] = true; |
| } |
| |
| values[Anum_pg_statistic_starelid - 1] = ObjectIdGetDatum(relationOid); /* starelid */ |
| values[Anum_pg_statistic_staattnum - 1] = Int16GetDatum(attNum); /* staattnum */ |
| values[Anum_pg_statistic_stanullfrac - 1] = Float4GetDatum(stats->nullFraction); /* stanullfrac */ |
| values[Anum_pg_statistic_stawidth - 1] = Int32GetDatum((int32) stats->avgWidth); /* stawidth */ |
| values[Anum_pg_statistic_stadistinct - 1] = Float4GetDatum(stats->ndistinct); /* stadistinct */ |
| |
| |
| /* We will always put MCV in slot 1 and histogram in slot 2 */ |
| |
| if (stats->mcv) |
| { |
| values[Anum_pg_statistic_stakind1 - 1] = Int16GetDatum(STATISTIC_KIND_MCV); |
| } |
| else |
| { |
| values[Anum_pg_statistic_stakind1 - 1] = Int16GetDatum((int2) 0); /* no mcv */ |
| } |
| |
| if (stats->hist) |
| { |
| values[Anum_pg_statistic_stakind2 - 1] = Int16GetDatum(STATISTIC_KIND_HISTOGRAM); |
| } |
| else |
| { |
| values[Anum_pg_statistic_stakind2 - 1] = Int16GetDatum((int2) 0); |
| } |
| |
| /* correlation */ |
| values[Anum_pg_statistic_stakind3 - 1] = Int16GetDatum((int2) 0); /* we do not compute correlation anymore */ |
| |
| /* an extra slot */ |
| values[Anum_pg_statistic_stakind4 - 1] = Int16GetDatum((int2) 0); |
| |
| /* staops .. which correspond to operators. Sigh.. */ |
| if (stats->mcv) |
| { |
| values[Anum_pg_statistic_staop1 - 1] = ObjectIdGetDatum(equalityOid); |
| } |
| else |
| { |
| values[Anum_pg_statistic_staop1 - 1] = 0; |
| } |
| |
| if (stats->hist) |
| { |
| values[Anum_pg_statistic_staop2 - 1] = ObjectIdGetDatum(ltOid); |
| } |
| else |
| { |
| values[Anum_pg_statistic_staop2 - 1] = 0; |
| } |
| |
| /* dummy fields */ |
| values[Anum_pg_statistic_staop3 - 1] = 0; |
| values[Anum_pg_statistic_staop4 - 1] = 0; |
| |
| /* Now working on stanumbers */ |
| if (stats->freq) |
| { |
| values[Anum_pg_statistic_stanumbers1 - 1] = PointerGetDatum(stats->freq); |
| } |
| else |
| { |
| nulls[Anum_pg_statistic_stanumbers1 - 1] = true; |
| values[Anum_pg_statistic_stanumbers1 - 1] = 0; |
| } |
| |
| /* dummy fields */ |
| nulls[Anum_pg_statistic_stanumbers2 - 1] = true; |
| values[Anum_pg_statistic_stanumbers2 - 1] = 0; |
| nulls[Anum_pg_statistic_stanumbers3 - 1] = true; |
| values[Anum_pg_statistic_stanumbers3 - 1] = 0; |
| nulls[Anum_pg_statistic_stanumbers4 - 1] = true; |
| values[Anum_pg_statistic_stanumbers4 - 1] = 0; |
| |
| /* Now working on stavalues */ |
| if (stats->mcv) |
| { |
| values[Anum_pg_statistic_stavalues1 - 1] = PointerGetDatum(stats->mcv); |
| } |
| else |
| { |
| nulls[Anum_pg_statistic_stavalues1 - 1] = true; |
| values[Anum_pg_statistic_stavalues1 - 1] = 0; |
| } |
| |
| if (stats->hist) |
| { |
| values[Anum_pg_statistic_stavalues2 - 1] = PointerGetDatum(stats->hist); |
| } |
| else |
| { |
| nulls[Anum_pg_statistic_stavalues2 - 1] = true; |
| values[Anum_pg_statistic_stavalues2 - 1] = 0; |
| } |
| |
| /* dummy values */ |
| nulls[Anum_pg_statistic_stavalues3 - 1] = true; |
| values[Anum_pg_statistic_stavalues3 - 1] = 0; |
| nulls[Anum_pg_statistic_stavalues4 - 1] = true; |
| values[Anum_pg_statistic_stavalues4 - 1] = 0; |
| |
| /* Now work on pg_statistic */ |
| { |
| HeapTuple oldStatisticsTuple; |
| cqContext *pcqCtx; |
| |
| pcqCtx = caql_beginscan( |
| NULL, |
| cql("SELECT * FROM pg_statistic " |
| " WHERE starelid = :1 " |
| " AND staattnum = :2 " |
| " FOR UPDATE ", |
| ObjectIdGetDatum(relationOid), |
| Int16GetDatum(attNum))); |
| |
| oldStatisticsTuple = caql_getnext(pcqCtx); |
| |
| if (HeapTupleIsValid(oldStatisticsTuple)) |
| { |
| /* pg_statistic tuple exists. */ |
| HeapTuple stup = caql_modify_current(pcqCtx, |
| values, |
| nulls, |
| replaces); |
| |
| caql_update_current(pcqCtx, stup); |
| /* and Update indexes (implicit) */ |
| |
| heap_freetuple(stup); |
| } |
| else |
| { |
| /* insert new tuple into pg_statistic. we are guaranteed |
| * no-one else will overwrite this row because of |
| * ShareUpdateExclusiveLock on the relation. |
| */ |
| |
| HeapTuple stup = caql_form_tuple(pcqCtx, values, nulls); |
| |
| caql_insert(pcqCtx, stup); |
| /* and Update indexes (implicit) */ |
| |
| heap_freetuple(stup); |
| } |
| caql_endscan(pcqCtx); |
| } |
| |
| } |
| |
| /** |
| * This method estimates the number of tuples and pages in a heaptable relation. Getting the number of blocks is straightforward. |
| * Estimating the number of tuples is a little trickier. There are two factors that complicate this: |
| * 1. Tuples may be of variable length. |
| * 2. There may be dead tuples lying around. |
| * To do this, it chooses a certain number of blocks (as determined by a guc) randomly. The process of choosing is not strictly |
| * uniformly random since we have a target number of blocks in mind. We start processing blocks in order and choose an block |
| * with a probability p determined by the ratio of target to total blocks. It is possible that we get really unlucky and reject |
| * a large number of blocks up front. We compensate for this by increasing p dynamically. Thus, we are guaranteed to choose the target number |
| * of blocks. We read all heaptuples from these blocks and keep count of number of live tuples. We scale up this count to |
| * estimate reltuples. Relpages is an exact value. |
| * |
| * Input: |
| * rel - Relation. Must be a heaptable. |
| * |
| * Output: |
| * reltuples - estimated number of tuples in relation. |
| * relpages - exact number of pages. |
| */ |
| static void gp_statistics_estimate_reltuples_relpages_heap(Relation rel, float4 *reltuples, float4 *relpages) |
| { |
| MIRROREDLOCK_BUFMGR_DECLARE; |
| |
| float4 nrowsseen = 0; /* # rows seen */ |
| float4 nrowsdead = 0; /* # rows seen */ |
| BlockNumber nblockstotal = 0; /* nblocks in relation */ |
| BlockNumber nblockstarget = (BlockNumber) gp_statistics_blocks_target; |
| BlockNumber nblocksseen = 0; |
| int j = 0; /* counter */ |
| |
| /** |
| * Ensure that the right kind of relation with the right kind of storage is passed to us. |
| */ |
| Assert(rel->rd_rel->relkind == RELKIND_RELATION); |
| Assert(RelationIsHeap(rel)); |
| |
| nblockstotal = RelationGetNumberOfBlocks(rel); |
| |
| if (nblockstotal == 0 || nblockstarget == 0) |
| { |
| /** |
| * If there are no blocks, there cannot be tuples. |
| */ |
| *reltuples = 0.0; |
| *relpages = 0.0; |
| return; |
| } |
| |
| for (j=0 ; j<nblockstotal; j++) |
| { |
| /** |
| * Threshold is dynamically adjusted based on how many blocks we need to examine and how many blocks |
| * are left. |
| */ |
| double threshold = ((double) nblockstarget - nblocksseen)/((double) nblockstotal - j); |
| |
| /** |
| * Random dice thrown to determine if current block is chosen. |
| */ |
| double diceValue = ((double) random()) / ((double) MAX_RANDOM_VALUE); |
| |
| if (threshold >= 1.0 || diceValue <= threshold) |
| { |
| /** |
| * Block j shall be examined! |
| */ |
| BlockNumber targblock = j; |
| Buffer targbuffer; |
| Page targpage; |
| OffsetNumber targoffset, |
| maxoffset; |
| |
| /** |
| * Check for cancellations. |
| */ |
| CHECK_FOR_INTERRUPTS(); |
| |
| /* |
| * We must maintain a pin on the target page's buffer to ensure that |
| * the maxoffset value stays good (else concurrent VACUUM might delete |
| * tuples out from under us). Hence, pin the page until we are done |
| * looking at it. We don't maintain a lock on the page, so tuples |
| * could get added to it, but we ignore such tuples. |
| */ |
| |
| /* -------- MirroredLock ---------- */ |
| MIRROREDLOCK_BUFMGR_LOCK; |
| |
| targbuffer = ReadBuffer(rel, targblock); |
| LockBuffer(targbuffer, BUFFER_LOCK_SHARE); |
| targpage = BufferGetPage(targbuffer); |
| maxoffset = PageGetMaxOffsetNumber(targpage); |
| |
| /* Inner loop over all tuples on the selected block. */ |
| for (targoffset = FirstOffsetNumber; targoffset <= maxoffset; targoffset++) |
| { |
| ItemId itemid; |
| itemid = PageGetItemId(targpage, targoffset); |
| nrowsseen++; |
| if(!ItemIdIsNormal(itemid)) |
| { |
| nrowsdead += 1; |
| } |
| else |
| { |
| HeapTupleData targtuple; |
| ItemPointerSet(&targtuple.t_self, targblock, targoffset); |
| targtuple.t_data = (HeapTupleHeader) PageGetItem(targpage, itemid); |
| targtuple.t_len = ItemIdGetLength(itemid); |
| |
| if(!HeapTupleSatisfiesVisibility(rel, &targtuple, SnapshotNow, targbuffer)) |
| { |
| nrowsdead += 1; |
| } |
| } |
| } |
| |
| /* Now release the pin on the page */ |
| UnlockReleaseBuffer(targbuffer); |
| |
| MIRROREDLOCK_BUFMGR_UNLOCK; |
| /* -------- MirroredLock ---------- */ |
| |
| nblocksseen++; |
| } |
| } |
| |
| Assert(nblocksseen > 0); |
| /** |
| * To calculate reltuples, scale up the number of live rows per block seen to the total number |
| * of blocks. |
| */ |
| *reltuples = ceil((nrowsseen - nrowsdead) * nblockstotal / nblocksseen); |
| *relpages = nblockstotal; |
| return; |
| } |
| |
| /** |
| * This method estimates the number of tuples and pages in an append-only relation. AO tables maintain accurate |
| * tuple counts in the catalog. Therefore, we will only require access to segment catalogs to determine reltuples. |
| * Relpages is obtained by fudging AO block sizes. |
| * |
| * Input: |
| * rel - Relation. Must be an AO table. |
| * |
| * Output: |
| * reltuples - estimated number of tuples in relation. |
| * relpages - exact number of pages. |
| */ |
| |
| static void gp_statistics_estimate_reltuples_relpages_ao_rows(Relation rel, float4 *reltuples, float4 *relpages) |
| { |
| FileSegTotals *fstotal; |
| |
| /** |
| * Ensure that the right kind of relation with the right type of storage is passed to us. |
| */ |
| Assert(rel->rd_rel->relkind == RELKIND_RELATION); |
| Assert(RelationIsAoRows(rel)); |
| |
| Assert(Gp_role == GP_ROLE_DISPATCH); |
| |
| *relpages = 0.0; |
| *reltuples = 0.0; |
| |
| fstotal = GetSegFilesTotals(rel, SnapshotNow); |
| Assert(fstotal); |
| |
| /** |
| * The planner doesn't understand AO's blocks, so need this method to try to fudge up a number for |
| * the planner. |
| */ |
| *relpages += RelationGuessNumberOfBlocks((double) fstotal->totalbytes); |
| /** |
| * The number of tuples in AO table is known accurately. Therefore, we just utilize this value. |
| */ |
| *reltuples += (double) fstotal->totaltuples; |
| |
| pfree(fstotal); |
| |
| return; |
| } |
| |
| /** |
| * This method estimates the number of tuples and pages in a parquet relation. Parquet tables maintain accurate |
| * tuple counts in the catalog. Therefore, we will only require access to segment catalogs to determine reltuples. |
| * Relpages is obtained by fudging AO block sizes. |
| * |
| * Input: |
| * rel - Relation. Must be a parquet table. |
| * |
| * Output: |
| * reltuples - estimated number of tuples in relation. |
| * relpages - exact number of pages. |
| */ |
| |
| static void gp_statistics_estimate_reltuples_relpages_parquet(Relation rel, float4 *reltuples, float4 *relpages) |
| { |
| ParquetFileSegTotals *fstotal; |
| |
| /** |
| * Ensure that the right kind of relation with the right type of storage is passed to us. |
| */ |
| Assert(rel->rd_rel->relkind == RELKIND_RELATION); |
| Assert(RelationIsParquet(rel)); |
| |
| Assert(Gp_role == GP_ROLE_DISPATCH); |
| |
| *relpages = 0.0; |
| *reltuples = 0.0; |
| |
| fstotal = GetParquetSegFilesTotals(rel, SnapshotNow); |
| Assert(fstotal); |
| /** |
| * The planner doesn't understand AO's blocks, so need this method to try to fudge up a number for |
| * the planner. |
| */ |
| *relpages += RelationGuessNumberOfBlocks((double) fstotal->totalbytes); |
| /** |
| * The number of tuples in AO table is known accurately. Therefore, we just utilize this value. |
| */ |
| *reltuples += (double) fstotal->totaltuples; |
| |
| pfree(fstotal); |
| return; |
| } |
| |
| /** |
| * This method estimates the number of tuples and pages in an extern relation. We can not get accurate tuple counts |
| * and pages counts in the catalog. Therefore, we have to get reltuples and relpages manually. |
| * |
| * Input: |
| * rel - Relation. Must be an external table. |
| * |
| * Output: |
| * reltuples - exact number of tuples in relation. |
| * relpages - exact number of pages. |
| */ |
| static void gp_statistics_estimate_reltuples_relpages_external(Relation rel, float4 *relTuples, float4 *relPages){ |
| Oid extRelationOid = RelationGetRelid(rel); |
| getExternalRelTuples(extRelationOid, relTuples); |
| getExternalRelPages(extRelationOid, relPages, rel); |
| } |
| |
| /** |
| * This method called by analyzeExternalEstimateReltuplesRelpages, |
| * to get External Relation reltuple counts, we run count(*) sql manually |
| * |
| * Input: |
| * extRelationOid - External Table Relation Oid |
| * Output: |
| * relTuples - exact number of tuples in relation. |
| */ |
| static void getExternalRelTuples(Oid extRelationOid, float4 *relTuples){ |
| const char *schemaName = NULL; |
| const char *tableName = NULL; |
| schemaName = get_namespace_name(get_rel_namespace(extRelationOid)); /* must be pfreed */ |
| tableName = get_rel_name(extRelationOid); /* must be pfreed */ |
| |
| StringInfoData str; |
| initStringInfo(&str); |
| appendStringInfo(&str, "select count(*)::float4 from %s.%s as Ta", |
| quote_identifier(schemaName), |
| quote_identifier(tableName)); |
| |
| spiExecuteWithCallback(str.data, false /*readonly*/, 0 /*tcount */, |
| spiCallback_getSingleResultRowColumnAsFloat4, relTuples); |
| pfree(tableName); |
| pfree(schemaName); |
| pfree(str.data); |
| } |
| |
| /** |
| * This method called by analyzeExternalEstimateReltuplesRelpages,to get External Relation relpages counts. |
| * We call GetExtTableEntry method to get get List of external Table Locations.And then we go through every |
| * location url to sum the count of relpages.External Relation now support some different protocals, therefore |
| * we need to process them in different way. |
| * |
| * Input: |
| * extRelationOid - External Table Relation Oid |
| * Output: |
| * relTuples - exact number of pages in relation. |
| */ |
| static void getExternalRelPages(Oid extRelationOid, float4 *relPages , Relation rel){ |
| |
| ExtTableEntry* entry = GetExtTableEntry(extRelationOid); |
| List* extLocations = entry->locations; |
| int num_urls = list_length(extLocations); |
| ListCell *cell = list_head(extLocations); |
| ListCell *cellTmp = NULL; |
| while(cell != NULL) |
| { |
| char *url = pstrdup(((Value*)lfirst(cell))->val.str); |
| Assert(url != NULL); |
| Uri *uri = ParseExternalTableUri(url); |
| Assert(uri != NULL); |
| switch (uri->protocol){ |
| case URI_HDFS: |
| *relPages += getExtrelPagesHDFS(uri); |
| break; |
| |
| /* |
| * to be done |
| */ |
| case URI_GPFDIST: |
| *relPages = 1.0; |
| elog(NOTICE,"In external table ANALYZE command are not supported in GPFDIST location so far."); |
| break; |
| case URI_FILE: |
| *relPages = 1.0; |
| elog(NOTICE,"In external table ANALYZE command are not supported in FILE location so far."); |
| break; |
| case URI_FTP: |
| *relPages = 1.0; |
| elog(NOTICE,"In external table ANALYZE command are not supported in FTP location so far."); |
| break; |
| case URI_HTTP: |
| *relPages = 1.0; |
| elog(NOTICE,"In external table ANALYZE command are not supported in HTTP location so far."); |
| break; |
| case URI_CUSTOM: |
| *relPages = 1.0; |
| elog(NOTICE,"In external table ANALYZE command are not supported in CUSTOM location so far."); |
| break; |
| case URI_GPFDISTS: |
| *relPages = 1.0; |
| elog(NOTICE,"In external table ANALYZE command are not supported in GPFDISTS location so far."); |
| break; |
| default: |
| *relPages = 1.0; |
| elog(NOTICE,"should not go here"); |
| break; |
| } |
| |
| cell = cell->next; |
| |
| /* free resourse */ |
| pfree(url); |
| if(uri->customprotocol != NULL){ pfree(uri->customprotocol);} |
| pfree(uri->hostname); |
| if(uri->path!=NULL){pfree(uri->path);} |
| FreeExternalTableUri(uri); |
| } |
| /* pfree entry->location*/ |
| list_free_deep(extLocations); |
| /* pfree entry */ |
| if(entry->fmtopts != NULL){ pfree(entry->fmtopts);} |
| if(entry->command != NULL){ pfree(entry->command);} |
| pfree(entry); |
| } |
| |
| /** |
| * This method get the number of pages external table which location uri protocol is HDFS. We hold that |
| * the concept of the page number in external table is same as the concept of block number in hdfs. |
| * Therefore we get the number of pages for external table by get the number of blocks in hdfs |
| * |
| * Input: |
| * uri - hdfs uri which refers to external table storage location, uri can refer to a file or a folder |
| * |
| */ |
| static float4 getExtrelPagesHDFS(Uri *uri){ |
| int numOfBlock = 0; |
| int nsize = 0; |
| float4 relpages = 0.0; |
| hdfsFS fs = hdfsConnect(uri->hostname, uri->port); |
| |
| //hdfsFileInfo *fiarray = hdfsGetPathInfo(fs, uri->path); |
| hdfsFileInfo *fiarray = hdfsListDirectory(fs, uri->path,&nsize); |
| if (fs == NULL) |
| { |
| elog(ERROR, "hdfsprotocol_blocklocation : " |
| "failed to get files of path %s", |
| uri->path); |
| } |
| |
| /* Call block location api to get data location for each file */ |
| for (int i = 0 ; i < nsize ; i++) |
| { |
| // FscHdfsFileInfoC *fi = FscHdfsGetFileInfoFromArray(fiarray, i); |
| hdfsFileInfo *fi = &fiarray[i]; |
| /* break condition of this for loop */ |
| if (fi == NULL) {break;} |
| |
| /* Build file name full path. */ |
| const char *fname = fi->mName; |
| char *fullpath = palloc0( |
| strlen(fname) + /* name */ |
| 1); /* \0 */ |
| sprintf(fullpath, "%s/%s", uri->path, fname); |
| |
| /* Get file full length. */ |
| // int64_t len = FscHdfsGetFileInfoLength(fi); |
| int64_t len = fi->mSize; |
| if (len == 0) { |
| pfree(fullpath); |
| continue; |
| } |
| |
| /* Get block location data for this file */ |
| BlockLocation *bla = hdfsGetFileBlockLocations(fs, fullpath, 0, len,&numOfBlock); |
| if (bla == NULL) |
| { |
| elog(ERROR, "hdfsprotocol_blocklocation : " |
| "failed to get block location of path %s. " |
| "It is reported generally due to HDFS service errors or " |
| "another session's ongoing writing.", |
| fullpath); |
| } |
| |
| relpages += numOfBlock; |
| |
| |
| /* We don't need it any longer */ |
| pfree(fullpath); |
| |
| /* Clean up block location instances created by the lib. */ |
| hdfsFreeFileBlockLocations(&bla,numOfBlock); |
| } |
| |
| /* Clean up file info array created by the lib for this location. */ |
| // FscHdfsFreeFileInfoArrayC(&fiarray); |
| hdfsFreeFileInfo(fiarray,nsize); |
| hdfsDisconnect(fs); |
| return relpages; |
| } |
| |
| |
| /* |
| * Get total bytes of external table with HDFS protocol |
| */ |
| uint64 GetExternalTotalBytesHDFS(Uri *uri) |
| { |
| uint64 totalBytes = 0; |
| int nsize = 0; |
| |
| hdfsFS fs = hdfsConnect(uri->hostname, uri->port); |
| |
| hdfsFileInfo *fiarray = hdfsListDirectory(fs, uri->path,&nsize); |
| if (fiarray == NULL) |
| { |
| elog(ERROR, "hdfsprotocol_blocklocation : " |
| "failed to get files of path %s.", |
| uri->path); |
| } |
| |
| /* Call block location api to get data location for each file */ |
| for (int i = 0 ; i < nsize ; i++) |
| { |
| hdfsFileInfo *fi = &fiarray[i]; |
| |
| /* Break condition of this for loop */ |
| if (fi == NULL) |
| { |
| break; |
| } |
| |
| /* Get file full length. */ |
| totalBytes += fi->mSize; |
| |
| } |
| |
| /* Clean up file info array created by the lib for this location. */ |
| hdfsFreeFileInfo(fiarray,nsize); |
| hdfsDisconnect(fs); |
| |
| return totalBytes; |
| } |
| |
| /* |
| * Get total bytes of external table |
| */ |
| uint64 GetExternalTotalBytes(Relation rel) |
| { |
| Oid extRelOid = RelationGetRelid(rel); |
| ExtTableEntry *entry = GetExtTableEntry(extRelOid); |
| List *extLocations = entry->locations; |
| int num_urls = list_length(extLocations); |
| ListCell *cell = list_head(extLocations); |
| ListCell *cellTmp = NULL; |
| uint64 totalBytes = 0; |
| |
| while(cell != NULL) |
| { |
| char *url = pstrdup(((Value*)lfirst(cell))->val.str); |
| Assert(url != NULL); |
| |
| Uri *uri = ParseExternalTableUri(url); |
| Assert(uri != NULL); |
| |
| switch (uri->protocol) |
| { |
| case URI_HDFS: |
| totalBytes += GetExternalTotalBytesHDFS(uri); |
| break; |
| /* |
| * Support analyze for external table. |
| * For now, HDFS protocol external table is supported. |
| */ |
| case URI_GPFDIST: |
| totalBytes += 0; |
| elog(ERROR,"In external table ANALYZE command are not supported in GPFDIST location so far."); |
| break; |
| |
| case URI_FILE: |
| totalBytes += 0; |
| elog(ERROR,"In external table ANALYZE command are not supported in FILE location so far."); |
| break; |
| |
| case URI_FTP: |
| totalBytes += 0; |
| elog(ERROR,"In external table ANALYZE command are not supported in FTP location so far."); |
| break; |
| |
| case URI_HTTP: |
| totalBytes += 0; |
| elog(ERROR,"In external table ANALYZE command are not supported in HTTP location so far."); |
| break; |
| |
| case URI_CUSTOM: |
| totalBytes += 0; |
| elog(ERROR,"In external table ANALYZE command are not supported in CUSTOM location so far."); |
| break; |
| |
| case URI_GPFDISTS: |
| totalBytes += 0; |
| elog(ERROR,"In external table ANALYZE command are not supported in GPFDISTS location so far."); |
| break; |
| |
| default: |
| totalBytes += 0; |
| elog(ERROR,"should not go here"); |
| break; |
| } |
| |
| cell = cell->next; |
| |
| /* free resourse */ |
| pfree(url); |
| if (uri->customprotocol != NULL) |
| { |
| pfree(uri->customprotocol); |
| } |
| pfree(uri->hostname); |
| |
| if (uri->path != NULL) |
| { |
| pfree(uri->path); |
| } |
| FreeExternalTableUri(uri); |
| |
| } |
| /* pfree entry->location*/ |
| list_free_deep(extLocations); |
| /* pfree entry */ |
| if (entry->fmtopts != NULL) |
| { |
| pfree(entry->fmtopts); |
| } |
| if (entry->command != NULL) |
| { |
| pfree(entry->command); |
| } |
| pfree(entry); |
| |
| return totalBytes; |
| } |
| |
| /* |
| * Check if a relation is external table with HDFS protocol |
| */ |
| static bool isExternalHDFSProtocol(Oid relOid) |
| { |
| bool ret = true; |
| |
| Relation rel = try_relation_open(relOid, AccessShareLock, false); |
| if (rel != NULL) |
| { |
| if ((rel->rd_rel->relkind == RELKIND_RELATION) && |
| RelationIsExternal(rel)) |
| { |
| ExtTableEntry* entry = GetExtTableEntry(relOid); |
| List* extLocations = entry->locations; |
| ListCell *cell = list_head(extLocations); |
| while(cell != NULL) |
| { |
| char *url = ((Value*)lfirst(cell))->val.str; |
| Assert(url != NULL); |
| // if (!IS_HDFS_URI(url)) |
| if (!IS_HDFS_URI(url)) |
| { |
| ret = false; |
| break; |
| } |
| |
| cell = cell->next; |
| } |
| } |
| |
| relation_close(rel, AccessShareLock); |
| } |
| |
| return ret; |
| } |