| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| /*------------------------------------------------------------------------- |
| * |
| * vacuum.c |
| * The postgres vacuum cleaner. |
| * |
| * This file includes the "full" version of VACUUM, as well as control code |
| * used by all three of full VACUUM, lazy VACUUM, and ANALYZE. See |
| * vacuumlazy.c and analyze.c for the rest of the code for the latter two. |
| * |
| * |
| * Portions Copyright (c) 2005-2010, Greenplum inc |
| * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * |
| * IDENTIFICATION |
| * $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.342.2.4 2008/01/03 21:23:45 tgl Exp $ |
| * |
| *------------------------------------------------------------------------- |
| */ |
| #include "postgres.h" |
| |
| #include <sys/time.h> |
| #include <unistd.h> |
| |
| #include "access/clog.h" |
| #include "access/genam.h" |
| #include "access/heapam.h" |
| #include "catalog/heap.h" |
| #include "access/transam.h" |
| #include "access/xact.h" |
| #include "catalog/catalog.h" |
| #include "catalog/catquery.h" |
| #include "catalog/namespace.h" |
| #include "catalog/pg_database.h" |
| #include "catalog/pg_index.h" |
| #include "catalog/indexing.h" |
| #include "catalog/pg_namespace.h" |
| #include "commands/dbcommands.h" |
| #include "commands/tablecmds.h" |
| #include "commands/vacuum.h" |
| #include "cdb/cdbanalyze.h" |
| #include "cdb/cdbdisp.h" |
| #include "cdb/cdbpartition.h" |
| #include "cdb/cdbvars.h" |
| #include "cdb/cdbsrlz.h" |
| #include "cdb/cdbrelsize.h" |
| #include "cdb/cdbdispatchresult.h" /* CdbDispatchResults */ |
| #include "executor/executor.h" |
| #include "lib/stringinfo.h" |
| #include "libpq/pqformat.h" /* pq_beginmessage() etc. */ |
| #include "miscadmin.h" |
| #include "optimizer/prep.h" |
| #include "postmaster/autovacuum.h" |
| #include "storage/freespace.h" |
| #include "storage/proc.h" |
| #include "storage/procarray.h" |
| #include "utils/acl.h" |
| #include "utils/builtins.h" |
| #include "utils/flatfiles.h" |
| #include "utils/fmgroids.h" |
| #include "utils/inval.h" |
| #include "utils/lsyscache.h" |
| #include "utils/memutils.h" |
| #include "utils/pg_rusage.h" |
| #include "utils/relcache.h" |
| #include "utils/syscache.h" |
| #include "pgstat.h" |
| #include "nodes/makefuncs.h" /* makeRangeVar */ |
| #include "gp-libpq-fe.h" |
| #include "gp-libpq-int.h" |
| #include "storage/lwlock.h" |
| |
| /* |
| * GUC parameters |
| */ |
| int vacuum_freeze_min_age; |
| |
| /* |
| * VacPage structures keep track of each page on which we find useful |
| * amounts of free space. |
| */ |
| typedef struct VacPageData |
| { |
| BlockNumber blkno; /* BlockNumber of this Page */ |
| Size free; /* FreeSpace on this Page */ |
| uint16 offsets_used; /* Number of OffNums used by vacuum */ |
	uint16		offsets_free;	/* Number of OffNums free or to be freed */
| OffsetNumber offsets[1]; /* Array of free OffNums */ |
| } VacPageData; |
| |
| typedef VacPageData *VacPage; |
| |
| typedef struct VacPageListData |
| { |
| BlockNumber empty_end_pages; /* Number of "empty" end-pages */ |
| int num_pages; /* Number of pages in pagedesc */ |
| int num_allocated_pages; /* Number of allocated pages in |
| * pagedesc */ |
| VacPage *pagedesc; /* Descriptions of pages */ |
| } VacPageListData; |
| |
| typedef VacPageListData *VacPageList; |
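
/*
 * scan_heap() appends entries to these lists in physical block-number order
 * (pages are visited in ascending blkno order and vpage_insert() appends),
 * which is what allows tid_reaped() to locate a page later via
 * vac_bsearch()/vac_cmp_blk().
 */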
| |
| |
| /* |
| * We use an array of VTupleMoveData to plan a chain tuple move fully |
| * before we do it. |
| */ |
| typedef struct VTupleMoveData |
| { |
| ItemPointerData tid; /* tuple ID */ |
| VacPage vacpage; /* where to move it to */ |
| bool cleanVpd; /* clean vacpage before using? */ |
| } VTupleMoveData; |
| |
| typedef VTupleMoveData *VTupleMove; |
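
/*
 * repair_frag() builds an array of these while it walks a t_ctid update
 * chain, and only then replays the planned moves via move_chain_tuple().
 */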
| |
| |
| /*---------------------------------------------------------------------- |
| * ExecContext: |
| * |
| * As these variables always appear together, we put them into one struct |
| * and pull initialization and cleanup into separate routines. |
| * ExecContext is used by repair_frag() and move_xxx_tuple(). More |
| * accurately: It is *used* only in move_xxx_tuple(), but because this |
| * routine is called many times, we initialize the struct just once in |
| * repair_frag() and pass it on to move_xxx_tuple(). |
| */ |
| typedef struct ExecContextData |
| { |
| ResultRelInfo *resultRelInfo; |
| EState *estate; |
| TupleTableSlot *slot; |
| } ExecContextData; |
| |
| typedef ExecContextData *ExecContext; |
| |
| /* |
 * Currently, vacuuming a relation with a bitmap index is done through
 * reindex. We need to pass down OIDs to ensure that all segments use
 * the same set of OIDs. In some situations, such as vacuuming a table with
 * lots of deleted tuples or a VACUUM FULL, reindex may be called multiple
 * times. We cannot really tell in advance how many times reindex will be
 * called. Here we set the maximal number of OIDs to be passed down
 * to QEs. If more are needed, the vacuum will fail with an error.
 *
 * Note that each reindex requires 3 OIDs, so this number should be a
 * multiple of 3.
| */ |
| #define NUM_EXTRA_OIDS_FOR_BITMAP (3 * 10) |
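
/*
 * At 3 OIDs per reindex, this budgets for 10 reindex passes in a single
 * vacuum before the error mentioned above is raised.
 */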
| |
| static void |
| ExecContext_Init(ExecContext ec, Relation rel) |
| { |
| TupleDesc tupdesc = RelationGetDescr(rel); |
| |
| /* |
| * We need a ResultRelInfo and an EState so we can use the regular |
| * executor's index-entry-making machinery. |
| */ |
| ec->estate = CreateExecutorState(); |
| |
| ec->resultRelInfo = makeNode(ResultRelInfo); |
| ec->resultRelInfo->ri_RangeTableIndex = 1; /* dummy */ |
| ec->resultRelInfo->ri_RelationDesc = rel; |
| ec->resultRelInfo->ri_TrigDesc = NULL; /* we don't fire triggers */ |
| |
| ExecOpenIndices(ec->resultRelInfo); |
| |
| ec->estate->es_result_relations = ec->resultRelInfo; |
| ec->estate->es_num_result_relations = 1; |
| ec->estate->es_result_relation_info = ec->resultRelInfo; |
| |
| /* Set up a tuple slot too */ |
| ec->slot = MakeSingleTupleTableSlot(tupdesc); |
| } |
| |
| static void |
| ExecContext_Finish(ExecContext ec) |
| { |
| ExecDropSingleTupleTableSlot(ec->slot); |
| ExecCloseIndices(ec->resultRelInfo); |
| FreeExecutorState(ec->estate); |
| } |
| |
| /* |
| * End of ExecContext Implementation |
| *---------------------------------------------------------------------- |
| */ |
| |
| /* A few variables that don't seem worth passing around as parameters */ |
| static MemoryContext vac_context = NULL; |
| |
| static int elevel = -1; |
| |
| static TransactionId OldestXmin; |
| static TransactionId FreezeLimit; |
| |
| |
| /* non-export function prototypes */ |
| static List *get_rel_oids(List *relids, const RangeVar *vacrel, |
| const char *stmttype, bool *expanded, bool rootonly); |
| static void vac_truncate_clog(TransactionId frozenXID); |
| static void vacuum_rel(Relation onerel, VacuumStmt *vacstmt, LOCKMODE lmode, List *updated_stats); |
| static void full_vacuum_rel(Relation onerel, VacuumStmt *vacstmt, List *updated_stats); |
| static void scan_heap(VRelStats *vacrelstats, Relation onerel, |
| VacPageList vacuum_pages, VacPageList fraged_pages); |
| static void repair_frag(VRelStats *vacrelstats, Relation onerel, |
| VacPageList vacuum_pages, VacPageList fraged_pages, |
| int nindexes, Relation *Irel, List *updated_stats, |
| List *all_extra_oids, int reindex_count); |
| static void move_chain_tuple(Relation rel, |
| Buffer old_buf, Page old_page, HeapTuple old_tup, |
| Buffer dst_buf, Page dst_page, VacPage dst_vacpage, |
| ExecContext ec, ItemPointer ctid, bool cleanVpd); |
| static void move_plain_tuple(Relation rel, |
| Buffer old_buf, Page old_page, HeapTuple old_tup, |
| Buffer dst_buf, Page dst_page, VacPage dst_vacpage, |
| ExecContext ec); |
| static void update_hint_bits(Relation rel, VacPageList fraged_pages, |
| int num_fraged_pages, BlockNumber last_move_dest_block, |
| int num_moved); |
| static void vacuum_heap(VRelStats *vacrelstats, Relation onerel, |
| VacPageList vacpagelist); |
| static void vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage); |
| static void vacuum_index(VacPageList vacpagelist, Relation indrel, |
| double num_tuples, int keep_tuples, List *updated_stats, |
| List *extra_oids); |
| static void scan_index(Relation indrel, double num_tuples, List *updated_stats); |
| static bool tid_reaped(ItemPointer itemptr, void *state); |
| static void vac_update_fsm(Relation onerel, VacPageList fraged_pages, |
| BlockNumber rel_pages); |
| static VacPage copy_vac_page(VacPage vacpage); |
| static void vpage_insert(VacPageList vacpagelist, VacPage vpnew); |
| static void *vac_bsearch(const void *key, const void *base, |
| size_t nelem, size_t size, |
| int (*compar) (const void *, const void *)); |
| static int vac_cmp_blk(const void *left, const void *right); |
| static int vac_cmp_offno(const void *left, const void *right); |
| static int vac_cmp_vtlinks(const void *left, const void *right); |
| static bool enough_space(VacPage vacpage, Size len); |
| static Size PageGetFreeSpaceWithFillFactor(Relation relation, Page page); |
| static Relation open_relation_and_check_permission(VacuumStmt *vacstmt, |
| Oid relid, |
| char expected_relkind); |
| static void vacuumStatement(VacuumStmt *vacstmt, List *relids); |
| |
| |
| /**************************************************************************** |
| * * |
| * Code common to all flavors of VACUUM and ANALYZE * |
| * * |
| **************************************************************************** |
| */ |
| |
| /** |
| * Primary entry points for VACUUM, VACUUM FULL and ANALYZE commands. |
| * It calls subroutines vacuumStatement and analyzeStatement depending |
| * on the intent of vacstmt. Not both of vacstmt and relids can be non-null. |
| * Input: |
| * vacstmt - vacuum statement. |
| * relids - list of relations (used by autovacuum) |
| */ |
void
vacuum(VacuumStmt *vacstmt, List *relids, int preferred_seg_num)
| { |
| VacuumStmt *analyzeStmt = copyObject(vacstmt); |
| bool doAnalyze = vacstmt->analyze; |
| bool doVacuum = vacstmt->vacuum; |
| |
| Assert(!(vacstmt != NULL && relids != NULL)); |
| |
| if (doVacuum) |
| { |
| if (vacstmt->rootonly) |
| { |
| ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), |
| errmsg("ROOTPARTITION option cannot be used together with VACUUM, try ANALYZE ROOTPARTITION"))); |
| } |
| else |
| { |
| /** |
| * Perform vacuum. |
| */ |
| vacstmt->analyze = false; |
| vacstmt->vacuum = true; |
| vacuumStatement(vacstmt, NIL); |
| } |
| } |
| |
| if (doAnalyze) |
| { |
| /** |
| * Perform ANALYZE. |
| */ |
| analyzeStmt->analyze = true; |
| analyzeStmt->vacuum = false; |
| analyzeStatement(analyzeStmt, NIL, preferred_seg_num); |
| } |
| |
| vacstmt->analyze = doAnalyze; |
| vacstmt->vacuum = doVacuum; |
| } |
| |
| /* |
| * Primary entry point for VACUUM (incl FULL) commands. |
| * |
| * relids is normally NIL; if it is not, then it provides the list of |
| * relation OIDs to be processed, and vacstmt->relation is ignored. |
| * (The non-NIL case is currently only used by autovacuum.) |
| * |
| * It is the caller's responsibility that both vacstmt and relids |
| * (if given) be allocated in a memory context that won't disappear |
| * at transaction commit. In fact this context must be QueryContext |
| * to avoid complaints from PreventTransactionChain. |
| * |
| * vacuum() has been changed so that it is an entry point only for vacuum |
| * commands. ANALYZE is now handled by analyzeStatement() in analyze.c. |
| */ |
| static void |
| vacuumStatement(VacuumStmt *vacstmt, List *relids) |
| { |
| const char *stmttype = "VACUUM"; |
| volatile bool all_rels = false; |
| List *relations = NIL; |
| bool expanded = false; |
| |
| /** |
| * Handles only vacuum (incl FULL). Does not handle ANALYZE. |
| */ |
| Assert(vacstmt->vacuum); |
| Assert(!vacstmt->analyze); |
| Assert(Gp_role != GP_ROLE_EXECUTE); |
| |
| if (vacstmt->verbose) |
| elevel = INFO; |
| else |
| elevel = DEBUG2; |
| |
| if (Gp_role == GP_ROLE_DISPATCH) |
| clear_relsize_cache(); |
| |
| if (Gp_role == GP_ROLE_DISPATCH) |
| elevel = DEBUG2; /* vacuum messages aren't interesting from the QD */ |
| |
| /* |
| * We cannot run VACUUM inside a user transaction block; if we were inside |
| * a transaction, then our commit- and start-transaction-command calls |
| * would not have the intended effect! Furthermore, the forced commit that |
| * occurs before truncating the relation's file would have the effect of |
| * committing the rest of the user's transaction too, which would |
| * certainly not be the desired behavior. (This only applies to VACUUM |
| * FULL, though. We could in theory run lazy VACUUM inside a transaction |
| * block, but we choose to disallow that case because we'd rather commit |
| * as soon as possible after finishing the vacuum. This is mainly so that |
| * we can let go the AccessExclusiveLock that we may be holding.) |
| */ |
| if (Gp_role == GP_ROLE_DISPATCH) |
| { |
| PreventTransactionChain((void *) vacstmt, stmttype); |
| } |
| |
| /* |
	 * Greenplum Database: send a scary warning message about VACUUM FULL;
	 * it's not safe for large tables.
| */ |
| if (vacstmt->full) |
| { |
| if (Gp_role == GP_ROLE_DISPATCH) |
| ereport(NOTICE, |
| (errcode(ERRCODE_WARNING), |
| errmsg("'VACUUM FULL' is not safe for large tables and has " |
| "been known to yield unpredictable runtimes."), |
| errhint("Use 'VACUUM' instead."))); |
| } |
| |
| /* |
| * Send info about dead objects to the statistics collector, unless we are |
| * in autovacuum --- autovacuum.c does this for itself. |
| */ |
| if (!IsAutoVacuumProcess()) |
| pgstat_vacuum_stat(); |
| |
| /* |
| * Create special memory context for cross-transaction storage. |
| * |
| * Since it is a child of PortalContext, it will go away eventually even |
| * if we suffer an error; there's no need for special abort cleanup logic. |
| */ |
| vac_context = AllocSetContextCreate(PortalContext, |
| "Vacuum", |
| ALLOCSET_DEFAULT_MINSIZE, |
| ALLOCSET_DEFAULT_INITSIZE, |
| ALLOCSET_DEFAULT_MAXSIZE); |
| |
| /* Remember whether we are processing everything in the DB */ |
| all_rels = (relids == NIL && vacstmt->relation == NULL); |
| |
| /* |
| * Build list of relations to process, unless caller gave us one. (If we |
| * build one, we put it in vac_context for safekeeping.) |
| */ |
| relations = get_rel_oids(relids, vacstmt->relation, stmttype, &expanded, vacstmt->rootonly); |
| |
| /* |
| * vacuum_rel expects to be entered with no transaction active; it will |
| * start and commit its own transaction. But we are called by an SQL |
| * command, and so we are executing inside a transaction already. We |
| * commit the transaction started in PostgresMain() here, and start |
| * another one before exiting to match the commit waiting for us back in |
| * PostgresMain(). |
| */ |
| if (Gp_role != GP_ROLE_EXECUTE) |
| CommitTransactionCommand(); |
| |
| PG_TRY(); |
| { |
| ListCell *cur; |
| |
| /* Turn vacuum cost accounting on or off */ |
| VacuumCostActive = (VacuumCostDelay > 0); |
| VacuumCostBalance = 0; |
| |
| /* |
| * Loop to process each selected relation. |
| */ |
| foreach(cur, relations) |
| { |
| Oid relid = lfirst_oid(cur); |
| Relation onerel; |
| LOCKMODE lmode = NoLock; |
| LockRelId onerelid; |
| MemoryContext oldctx; |
| bool bTemp; |
| |
| bTemp = false; |
| |
| /* |
| * Decide whether we need to start/commit our own transactions. |
| * |
| * For VACUUM, we always start/commit our own |
| * transactions, so that we can release locks as soon as |
| * possible. (We could possibly use the outer transaction |
| * for a one-table VACUUM, but handling TOAST tables would |
| * be problematic.) |
| */ |
| |
| StartTransactionCommand(); |
| /* |
| * Functions in indexes may want a snapshot set. Also, setting |
| * a snapshot ensures that RecentGlobalXmin is kept truly recent. |
| */ |
| ActiveSnapshot = CopySnapshot(GetTransactionSnapshot()); |
| |
| /* |
| * Open the relation with an appropriate lock, and check the permission. |
| */ |
| onerel = open_relation_and_check_permission(vacstmt, relid, RELKIND_RELATION); |
| |
| if (onerel == NULL) |
| { |
| if (Gp_role != GP_ROLE_EXECUTE) |
| CommitTransactionCommand(); |
| |
| continue; |
| } |
| |
| /* MPP-7576: don't track internal namespace tables */ |
| switch (get_rel_namespace(relid)) |
| { |
| case PG_CATALOG_NAMESPACE: |
| /* MPP-7773: don't track objects in system namespace |
| * if modifying system tables (eg during upgrade) |
| */ |
| if (allowSystemTableModsDDL) |
| bTemp = true; |
| break; |
| |
| case PG_TOAST_NAMESPACE: |
| case PG_BITMAPINDEX_NAMESPACE: |
| case PG_AOSEGMENT_NAMESPACE: |
| bTemp = true; |
| break; |
| default: |
| break; |
| } |
| |
| /* MPP-7572: Don't track metadata if table in any |
| * temporary namespace |
| */ |
| if (!bTemp) |
| bTemp = isAnyTempNamespace(get_rel_namespace(relid)); |
| |
| /* |
| * Modify the Vacuum statement to vacuum an individual |
| * relation. This ensures that only one relation will be |
| * locked for vacuum, when the user issues a "vacuum <db>" |
| * command, or a "vacuum <parent_partition_table>" |
| * command. |
| */ |
| if (list_length(relations) > 1 || vacstmt->relation == NULL) |
| { |
| char *relname = get_rel_name(relid); |
| char *namespace_name = |
| get_namespace_name(get_rel_namespace(relid)); |
| |
| if (relname == NULL) |
| { |
					elog(ERROR, "relation name does not exist for relation with OID %u", relid);
| return; |
| } |
| |
| if (namespace_name == NULL) |
| { |
					elog(ERROR, "namespace does not exist for relation with OID %u", relid);
| return; |
| } |
| |
| vacstmt->relation = makeRangeVar(NULL /*catalogname*/, namespace_name, relname, -1); |
| } |
| |
| /* |
			 * Record that this relation is being vacuumed, so that we can
			 * clean up its free space map entry if the vacuum process
			 * crashes or is cancelled.
			 *
			 * XXX: Have to allocate the space inside TopMemoryContext,
			 * since it is required during commit.
| */ |
| oldctx = MemoryContextSwitchTo(TopMemoryContext); |
| AppendRelToVacuumRels(onerel); |
| MemoryContextSwitchTo(oldctx); |
| |
| |
			/* Generate extra OIDs for relfilenodes to be used in
			 * bitmap indexes, if any. */
| gen_oids_for_bitmaps(vacstmt, onerel); |
| |
| /* |
| * We have to acquire a ShareLock for the relation |
| * which has bitmap indexes, since reindex is used |
| * later. Otherwise, concurrent vacuum and insert may |
| * cause deadlock, see MPP-5960. |
| */ |
| if (vacstmt->extra_oids != NULL) |
| LockRelation(onerel, ShareLock); |
| |
| onerelid = onerel->rd_lockInfo.lockRelId; |
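
			/*
			 * Choose the lock mode: VACUUM FULL moves tuples around and
			 * truncates the relation, so it needs AccessExclusiveLock; lazy
			 * vacuum needs only ShareUpdateExclusiveLock, which still allows
			 * concurrent readers and writers.
			 */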
| |
| lmode = vacstmt->full ? AccessExclusiveLock : ShareUpdateExclusiveLock; |
| |
| /* |
| * Get a session-level lock too. This will protect our |
| * access to the relation across multiple transactions, so |
| * that we can vacuum the relation's TOAST table (if any) |
| * secure in the knowledge that no one is deleting the |
| * parent relation. |
| * |
| * NOTE: this cannot block, even if someone else is |
| * waiting for access, because the lock manager knows that |
| * both lock requests are from the same process. |
| */ |
| LockRelationIdForSession(&onerelid, lmode); |
| |
| vacuum_rel(onerel, vacstmt, lmode, NULL); |
| |
| list_free(vacstmt->extra_oids); |
| vacstmt->extra_oids = NIL; |
| |
| /* |
| * Close source relation now, but keep lock so that no one |
| * deletes it before we commit. (If someone did, they'd |
| * fail to clean up the entries we made in pg_statistic. |
| * Also, releasing the lock before commit would expose us |
| * to concurrent-update failures in update_attstats.) |
| */ |
| relation_close(onerel, NoLock); |
| |
| /* MPP-6929: metadata tracking */ |
| if (!bTemp && (Gp_role == GP_ROLE_DISPATCH)) |
| { |
| char *vsubtype = ""; /* NOFULL */ |
| |
| if (IsAutoVacuumProcess()) |
| vsubtype = "AUTO"; |
| else |
| { |
| if (vacstmt->full && |
| (0 == vacstmt->freeze_min_age)) |
| vsubtype = "FULL FREEZE"; |
| else if (vacstmt->full) |
| vsubtype = "FULL"; |
| else if (0 == vacstmt->freeze_min_age) |
| vsubtype = "FREEZE"; |
| } |
| MetaTrackUpdObject(RelationRelationId, |
| relid, |
| GetUserId(), |
| "VACUUM", |
| vsubtype |
| ); |
| } |
| |
| |
| if (list_length(relations) > 1) |
| { |
| pfree(vacstmt->relation->schemaname); |
| pfree(vacstmt->relation->relname); |
| pfree(vacstmt->relation); |
| vacstmt->relation = NULL; |
| } |
| |
| if (Gp_role != GP_ROLE_EXECUTE) |
| CommitTransactionCommand(); |
| |
| /* |
| * Now release the session-level lock on the master table. |
| */ |
| UnlockRelationIdForSession(&onerelid, lmode); |
| } |
| } |
| PG_CATCH(); |
| { |
| /* Make sure cost accounting is turned off after error */ |
| VacuumCostActive = false; |
| /* And reset buffer replacement strategy, too */ |
| StrategyHintVacuum(false); |
| PG_RE_THROW(); |
| } |
| PG_END_TRY(); |
| |
| /* Turn off vacuum cost accounting */ |
| VacuumCostActive = false; |
| |
| StartTransactionCommand(); |
| |
| /* |
| * Re-establish the transaction snapshot. This is wasted effort when |
| * we are called as a normal utility command, because the new |
| * transaction will be dropped immediately by PostgresMain(); but it's |
| * necessary if we are called from autovacuum because autovacuum might |
| * continue on to do an ANALYZE-only call. |
| */ |
| ActiveSnapshot = CopySnapshot(GetTransactionSnapshot()); |
| |
| if (!IsAutoVacuumProcess()) |
| { |
| /* |
| * Update pg_database.datfrozenxid, and truncate pg_clog if possible. |
| * (autovacuum.c does this for itself.) |
| */ |
| vac_update_datfrozenxid(); |
| |
| /* |
| * If it was a database-wide VACUUM, print FSM usage statistics (we |
| * don't make you be superuser to see these). We suppress this in |
| * autovacuum, too. |
| */ |
| if (all_rels) |
| PrintFreeSpaceMapStatistics(elevel); |
| } |
| |
| /* |
| * Clean up working storage --- note we must do this after |
| * StartTransactionCommand, else we might be trying to delete the active |
| * context! |
| */ |
| Assert(CurrentMemoryContext != vac_context); |
| MemoryContextDelete(vac_context); |
| vac_context = NULL; |
| } |
| |
| /* |
| * Build a list of Oids for each relation to be processed |
| * |
| * The list is built in vac_context so that it will survive across our |
| * per-relation transactions. |
| */ |
| static List * |
| get_rel_oids(List *relids, const RangeVar *vacrel, const char *stmttype, |
| bool *expanded, bool rootonly) |
| { |
| List *oid_list = NIL; |
| MemoryContext oldcontext; |
| |
| /* List supplied by VACUUM's caller? */ |
| if (relids) |
| return relids; |
| |
| if (vacrel) |
| { |
| /* Process a specific relation */ |
| Oid relid; |
| List *prels = NIL; |
| |
| relid = RangeVarGetRelid(vacrel, false, false /*allowHcatalog*/); |
| |
| if (rel_is_partitioned(relid)) |
| { |
| PartitionNode *pn; |
| |
| pn = get_parts(relid, 0, 0, false, CurrentMemoryContext, true /*includesubparts*/); |
| |
| prels = all_partition_relids(pn); |
| } |
| else if (rel_is_child_partition(relid)) |
| { |
| /* get my children */ |
| prels = find_all_inheritors(relid); |
| } |
| |
| if (list_length(prels)) |
| *expanded = true; |
| |
| /* Make a relation list entry for this guy */ |
| oldcontext = MemoryContextSwitchTo(vac_context); |
| oid_list = lappend_oid(oid_list, relid); |
| oid_list = list_concat_unique_oid(oid_list, prels); |
| MemoryContextSwitchTo(oldcontext); |
| } |
| else |
| { |
| /* Process all plain relations listed in pg_class */ |
| HeapTuple tuple; |
| cqContext cqc; |
| cqContext *pcqCtx; |
| |
| /* NOTE: force heapscan in caql */ |
| pcqCtx = caql_beginscan( |
| caql_syscache( |
| caql_indexOK(cqclr(&cqc), false), |
| false), |
| cql("SELECT * FROM pg_class " |
| " WHERE relkind = :1 ", |
| CharGetDatum(RELKIND_RELATION))); |
| |
| while (HeapTupleIsValid(tuple = caql_getnext(pcqCtx))) |
| { |
| Form_pg_class classForm = (Form_pg_class) GETSTRUCT(tuple); |
| |
| /* |
| * Don't include non-vacuum-able relations: |
| * - External tables |
| * - Foreign tables |
| * - etc. |
| */ |
| if (classForm->relkind == RELKIND_RELATION && ( |
| classForm->relstorage == RELSTORAGE_EXTERNAL || |
| classForm->relstorage == RELSTORAGE_FOREIGN || |
| classForm->relstorage == RELSTORAGE_VIRTUAL)) |
| continue; |
| |
| /* Make a relation list entry for this guy */ |
| oldcontext = MemoryContextSwitchTo(vac_context); |
| oid_list = lappend_oid(oid_list, HeapTupleGetOid(tuple)); |
| MemoryContextSwitchTo(oldcontext); |
| } |
| |
| caql_endscan(pcqCtx); |
| } |
| |
| return oid_list; |
| } |
| |
| /* |
| * vacuum_set_xid_limits() -- compute oldest-Xmin and freeze cutoff points |
| */ |
| void |
| vacuum_set_xid_limits(VacuumStmt *vacstmt, bool sharedRel, |
| TransactionId *oldestXmin, |
| TransactionId *freezeLimit) |
| { |
| int freezemin; |
| TransactionId limit; |
| TransactionId safeLimit; |
| |
| /* |
| * We can always ignore processes running lazy vacuum. This is because we |
| * use these values only for deciding which tuples we must keep in the |
| * tables. Since lazy vacuum doesn't write its XID anywhere, it's |
| * safe to ignore it. In theory it could be problematic to ignore lazy |
| * vacuums on a full vacuum, but keep in mind that only one vacuum process |
| * can be working on a particular table at any time, and that each vacuum |
| * is always an independent transaction. |
| */ |
| *oldestXmin = GetOldestXmin(sharedRel); |
| |
| Assert(TransactionIdIsNormal(*oldestXmin)); |
| |
| /* |
| * Determine the minimum freeze age to use: as specified in the vacstmt, |
| * or vacuum_freeze_min_age, but in any case not more than half |
| * autovacuum_freeze_max_age, so that autovacuums to prevent XID |
| * wraparound won't occur too frequently. |
| */ |
| freezemin = vacstmt->freeze_min_age; |
| if (freezemin < 0) |
| freezemin = vacuum_freeze_min_age; |
| freezemin = Min(freezemin, autovacuum_freeze_max_age / 2); |
| Assert(freezemin >= 0); |
| |
| /* |
| * Compute the cutoff XID, being careful not to generate a "permanent" XID |
| */ |
| limit = *oldestXmin - freezemin; |
| if (!TransactionIdIsNormal(limit)) |
| limit = FirstNormalTransactionId; |
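
	/*
	 * XIDs are unsigned 32-bit values and IDs below FirstNormalTransactionId
	 * are reserved, so the subtraction above can wrap past zero into the
	 * reserved range; e.g. with *oldestXmin = 1000 and freezemin = 100
	 * million, the raw result is not a normal XID and we clamp it.
	 */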
| |
| /* |
| * If oldestXmin is very far back (in practice, more than |
| * autovacuum_freeze_max_age / 2 XIDs old), complain and force a |
| * minimum freeze age of zero. |
| */ |
| safeLimit = ReadNewTransactionId() - autovacuum_freeze_max_age; |
| if (!TransactionIdIsNormal(safeLimit)) |
| safeLimit = FirstNormalTransactionId; |
| |
| if (TransactionIdPrecedes(limit, safeLimit)) |
| { |
| ereport(WARNING, |
| (errmsg("oldest xmin is far in the past"), |
| errhint("Close open transactions soon to avoid wraparound problems."))); |
| limit = *oldestXmin; |
| } |
| |
| *freezeLimit = limit; |
| } |
| |
| |
| /* |
| * vac_update_relstats() -- update statistics for one relation |
| * |
| * Update the whole-relation statistics that are kept in its pg_class |
| * row. There are additional stats that will be updated if we are |
| * doing ANALYZE, but we always update these stats. This routine works |
| * for both index and heap relation entries in pg_class. |
| * |
| * We violate transaction semantics here by overwriting the rel's |
| * existing pg_class tuple with the new values. This is reasonably |
| * safe since the new values are correct whether or not this transaction |
| * commits. The reason for this is that if we updated these tuples in |
| * the usual way, vacuuming pg_class itself wouldn't work very well --- |
| * by the time we got done with a vacuum cycle, most of the tuples in |
| * pg_class would've been obsoleted. Of course, this only works for |
| * fixed-size never-null columns, but these are. |
| * |
| * Another reason for doing it this way is that when we are in a lazy |
| * VACUUM and have inVacuum set, we mustn't do any updates --- somebody |
| * vacuuming pg_class might think they could delete a tuple marked with |
| * xmin = our xid. |
| * |
| * MPP: 8.2 introduced XLOG entries for "inplace" stats updates so we |
| * no longer need the out-of-place hack. |
| * |
| * This routine is shared by full VACUUM and lazy VACUUM. |
| */ |
| void |
| vac_update_relstats(Relation rel, BlockNumber num_pages, double num_tuples, |
| bool hasindex, TransactionId frozenxid, List *updated_stats) |
| { |
| Relation rd; |
| HeapTuple ctup; |
| Form_pg_class pgcform; |
| Oid relid = RelationGetRelid(rel); |
| bool dirty; |
| cqContext cqc; |
| cqContext *pcqCtx; |
| |
| Assert(relid != InvalidOid); |
| Assert (Gp_role != GP_ROLE_EXECUTE); |
| |
| |
| /* |
| * We need a way to distinguish these 2 cases: |
| * a) ANALYZEd/VACUUMed table is empty |
| * b) Table has never been ANALYZEd/VACUUMed |
| * To do this, in case (a), we set relPages = 1. For case (b), relPages = 0. |
| */ |
| if (num_pages < 1.0) |
| { |
| Assert(num_tuples < 1.0); |
| num_pages = 1.0; |
| } |
| |
| /* |
| * update number of tuples and number of pages in pg_class |
| */ |
| rd = heap_open(RelationRelationId, RowExclusiveLock); |
| |
| pcqCtx = caql_addrel(cqclr(&cqc), rd); |
| |
| /* Fetch a copy of the tuple to scribble on */ |
| ctup = caql_getfirst( |
| pcqCtx, |
| cql("SELECT * FROM pg_class " |
| " WHERE oid = :1 " |
| " FOR UPDATE ", |
| ObjectIdGetDatum(relid))); |
| |
| if (!HeapTupleIsValid(ctup)) |
| elog(ERROR, "pg_class entry for relid %u vanished during vacuuming", |
| relid); |
| pgcform = (Form_pg_class) GETSTRUCT(ctup); |
| |
| /* Apply required updates, if any, to copied tuple */ |
| |
| dirty = false; |
| if (pgcform->relpages != (int32) num_pages) |
| { |
| pgcform->relpages = (int32) num_pages; |
| dirty = true; |
| } |
| if (pgcform->reltuples != (float4) num_tuples) |
| { |
| pgcform->reltuples = (float4) num_tuples; |
| dirty = true; |
| } |
| if (pgcform->relhasindex != hasindex) |
| { |
| pgcform->relhasindex = hasindex; |
| dirty = true; |
| } |
| |
| elog(DEBUG2, "Vacuum oid=%u pages=%d tuples=%f", |
| relid, pgcform->relpages, pgcform->reltuples); |
| /* |
| * If we have discovered that there are no indexes, then there's no |
| * primary key either. This could be done more thoroughly... |
| */ |
| if (!hasindex) |
| { |
| if (pgcform->relhaspkey) |
| { |
| pgcform->relhaspkey = false; |
| dirty = true; |
| } |
| } |
| |
| /* |
| * relfrozenxid should never go backward. Caller can pass |
| * InvalidTransactionId if it has no new data. |
| */ |
| if (TransactionIdIsNormal(frozenxid) && |
| TransactionIdPrecedes(pgcform->relfrozenxid, frozenxid)) |
| { |
| pgcform->relfrozenxid = frozenxid; |
| dirty = true; |
| } |
| |
| /* |
| * If anything changed, write out the tuple. Even if nothing changed, |
| * force relcache invalidation so all backends reset their rd_targblock |
| * --- otherwise it might point to a page we truncated away. |
| */ |
| if (dirty) |
| { |
| heap_inplace_update(rd, ctup); |
| /* the above sends a cache inval message */ |
| } |
| else |
| { |
| /* no need to change tuple, but force relcache inval anyway */ |
| CacheInvalidateRelcacheByTuple(ctup); |
| } |
| |
| heap_close(rd, RowExclusiveLock); |
| } |
| |
| |
| /* |
| * vac_update_datfrozenxid() -- update pg_database.datfrozenxid for our DB |
| * |
| * Update pg_database's datfrozenxid entry for our database to be the |
| * minimum of the pg_class.relfrozenxid values. If we are able to |
| * advance pg_database.datfrozenxid, also try to truncate pg_clog. |
| * |
| * We violate transaction semantics here by overwriting the database's |
| * existing pg_database tuple with the new value. This is reasonably |
| * safe since the new value is correct whether or not this transaction |
| * commits. As with vac_update_relstats, this avoids leaving dead tuples |
| * behind after a VACUUM. |
| * |
| * This routine is shared by full and lazy VACUUM. |
| */ |
| void |
| vac_update_datfrozenxid(void) |
| { |
| HeapTuple tuple; |
| Form_pg_database dbform; |
| Relation relation; |
| HeapTuple classTup; |
| cqContext *pcqCtx; |
| cqContext cqc; |
| |
| TransactionId newFrozenXid; |
| bool dirty = false; |
| |
| /* |
| * Initialize the "min" calculation with GetOldestXmin, which is a |
| * reasonable approximation to the minimum relfrozenxid for not-yet- |
| * committed pg_class entries for new tables; see AddNewRelationTuple(). |
	 * So we cannot produce a wrong minimum by starting with this.
| */ |
| newFrozenXid = GetOldestXmin(true); |
| |
| /* |
| * We must seqscan pg_class to find the minimum Xid, because there is no |
| * index that can help us here. |
| */ |
| pcqCtx = caql_beginscan( |
| caql_indexOK(cqclr(&cqc), false), |
| cql("SELECT * FROM pg_class ", NULL)); |
| |
| while (HeapTupleIsValid(classTup = caql_getnext(pcqCtx))) |
| { |
| Form_pg_class classForm = (Form_pg_class) GETSTRUCT(classTup); |
| |
| /* |
| * Only consider heap and TOAST tables (anything else should have |
| * InvalidTransactionId in relfrozenxid anyway.) |
| */ |
| if (classForm->relkind != RELKIND_RELATION && |
| classForm->relkind != RELKIND_TOASTVALUE && |
| classForm->relkind != RELKIND_AOSEGMENTS && |
| classForm->relkind != RELKIND_AOBLOCKDIR) |
| continue; |
| |
| /* MPP-10108 - exclude relations with external storage */ |
| if (classForm->relkind == RELKIND_RELATION && ( |
| classForm->relstorage == RELSTORAGE_EXTERNAL || |
| classForm->relstorage == RELSTORAGE_FOREIGN || |
| classForm->relstorage == RELSTORAGE_VIRTUAL)) |
| continue; |
| |
| Assert(TransactionIdIsNormal(classForm->relfrozenxid)); |
| |
| if (TransactionIdPrecedes(classForm->relfrozenxid, newFrozenXid)) |
| newFrozenXid = classForm->relfrozenxid; |
| } |
| |
| /* we're done with pg_class */ |
| caql_endscan(pcqCtx); |
| |
| Assert(TransactionIdIsNormal(newFrozenXid)); |
| |
| /* Now fetch the pg_database tuple we need to update. */ |
| relation = heap_open(DatabaseRelationId, RowExclusiveLock); |
| |
| cqContext *dbcqCtx; |
| cqContext dbcqc; |
| |
| dbcqCtx = caql_addrel(cqclr(&dbcqc), relation); |
| |
| /* Fetch a copy of the tuple to scribble on */ |
| |
| tuple = caql_getfirst( |
| dbcqCtx, |
| cql("SELECT * FROM pg_database " |
| " WHERE oid = :1 " |
| " FOR UPDATE ", |
| ObjectIdGetDatum(MyDatabaseId))); |
| |
| if (!HeapTupleIsValid(tuple)) |
| elog(ERROR, "could not find tuple for database %u", MyDatabaseId); |
| dbform = (Form_pg_database) GETSTRUCT(tuple); |
| |
| /* |
| * Don't allow datfrozenxid to go backward (probably can't happen anyway); |
| * and detect the common case where it doesn't go forward either. |
| */ |
| if (TransactionIdPrecedes(dbform->datfrozenxid, newFrozenXid)) |
| { |
| dbform->datfrozenxid = newFrozenXid; |
| dirty = true; |
| } |
| |
| if (dirty) |
| heap_inplace_update(relation, tuple); |
| |
| heap_freetuple(tuple); |
| heap_close(relation, RowExclusiveLock); |
| |
| /* |
| * If we were able to advance datfrozenxid, mark the flat-file copy of |
| * pg_database for update at commit, and see if we can truncate |
| * pg_clog. |
| */ |
| if (dirty) |
| { |
| database_file_update_needed(); |
| vac_truncate_clog(newFrozenXid); |
| } |
| } |
| |
| |
| /* |
| * vac_truncate_clog() -- attempt to truncate the commit log |
| * |
| * Scan pg_database to determine the system-wide oldest datfrozenxid, |
| * and use it to truncate the transaction commit log (pg_clog). |
| * Also update the XID wrap limit info maintained by varsup.c. |
| * |
| * The passed XID is simply the one I just wrote into my pg_database |
| * entry. It's used to initialize the "min" calculation. |
| * |
| * This routine is shared by full and lazy VACUUM. Note that it's |
| * only invoked when we've managed to change our DB's datfrozenxid |
| * entry. |
| */ |
| static void |
| vac_truncate_clog(TransactionId frozenXID) |
| { |
| TransactionId myXID = GetCurrentTransactionId(); |
| HeapTuple tuple; |
| cqContext cqc; |
| cqContext *pcqCtx; |
| NameData oldest_datname; |
| bool frozenAlreadyWrapped = false; |
| |
| /* init oldest_datname to sync with my frozenXID */ |
| namestrcpy(&oldest_datname, get_database_name(MyDatabaseId)); |
| |
| /* |
| * Scan pg_database to compute the minimum datfrozenxid |
| * |
| * Note: we need not worry about a race condition with new entries being |
| * inserted by CREATE DATABASE. Any such entry will have a copy of some |
| * existing DB's datfrozenxid, and that source DB cannot be ours because |
| * of the interlock against copying a DB containing an active backend. |
| * Hence the new entry will not reduce the minimum. Also, if two |
| * VACUUMs concurrently modify the datfrozenxid's of different databases, |
| * the worst possible outcome is that pg_clog is not truncated as |
| * aggressively as it could be. |
| */ |
| pcqCtx = caql_beginscan( |
| caql_indexOK(cqclr(&cqc), false), |
| cql("SELECT * FROM pg_database ", NULL)); |
| |
| while (HeapTupleIsValid(tuple = caql_getnext(pcqCtx))) |
| { |
| Form_pg_database dbform = (Form_pg_database) GETSTRUCT(tuple); |
| |
| Assert(TransactionIdIsNormal(dbform->datfrozenxid)); |
| |
| if (TransactionIdPrecedes(myXID, dbform->datfrozenxid)) |
| frozenAlreadyWrapped = true; |
| else if (TransactionIdPrecedes(dbform->datfrozenxid, frozenXID)) |
| { |
| frozenXID = dbform->datfrozenxid; |
| namecpy(&oldest_datname, &dbform->datname); |
| } |
| } |
| |
| caql_endscan(pcqCtx); |
| |
| /* |
| * Do not truncate CLOG if we seem to have suffered wraparound already; |
| * the computed minimum XID might be bogus. This case should now be |
| * impossible due to the defenses in GetNewTransactionId, but we keep the |
| * test anyway. |
| */ |
| if (frozenAlreadyWrapped) |
| { |
| ereport(WARNING, |
| (errmsg("some databases have not been vacuumed in over 2 billion transactions"), |
| errdetail("You may have already suffered transaction-wraparound data loss."))); |
| return; |
| } |
| |
| /* Truncate CLOG to the oldest frozenxid */ |
| TruncateCLOG(frozenXID); |
| |
| /* |
| * Update the wrap limit for GetNewTransactionId. Note: this function |
| * will also signal the postmaster for an(other) autovac cycle if needed. |
| */ |
| SetTransactionIdLimit(frozenXID, &oldest_datname); |
| } |
| |
| |
| /**************************************************************************** |
| * * |
| * Code common to both flavors of VACUUM * |
| * * |
| **************************************************************************** |
| */ |
| |
| |
| /* |
| * vacuum_rel() -- vacuum one heap relation |
| * |
| * Doing one heap at a time incurs extra overhead, since we need to |
| * check that the heap exists again just before we vacuum it. The |
| * reason that we do this is so that vacuuming can be spread across |
| * many small transactions. Otherwise, two-phase locking would require |
| * us to lock the entire database during one pass of the vacuum cleaner. |
| */ |
| static void |
| vacuum_rel(Relation onerel, VacuumStmt *vacstmt, LOCKMODE lmode, List *updated_stats) |
| { |
| Oid toast_relid; |
| Oid aoseg_relid = InvalidOid; |
| Oid aoblkdir_relid = InvalidOid; |
| Oid save_userid; |
| bool save_secdefcxt; |
| |
| /* |
| * Check for user-requested abort. Note we want this to be inside a |
| * transaction, so xact.c doesn't issue useless WARNING. |
| */ |
| CHECK_FOR_INTERRUPTS(); |
| |
| /* |
| * Remember the relation's TOAST and AO segments relations for later |
| */ |
| toast_relid = onerel->rd_rel->reltoastrelid; |
| if (RelationIsAoRows(onerel) || |
| RelationIsParquet(onerel)) |
| GetAppendOnlyEntryAuxOids(RelationGetRelid(onerel), SnapshotNow, |
| &aoseg_relid, NULL, |
| &aoblkdir_relid, NULL); |
| |
| /* |
| * Switch to the table owner's userid, so that any index functions are |
| * run as that user. (This is unnecessary, but harmless, for lazy |
| * VACUUM.) |
| */ |
| GetUserIdAndContext(&save_userid, &save_secdefcxt); |
| SetUserIdAndContext(onerel->rd_rel->relowner, true); |
| |
| /* |
| * Tell the cache replacement strategy that vacuum is causing all |
| * following IO |
| */ |
| StrategyHintVacuum(true); |
| |
| /* |
| * Do the actual work --- either FULL or "lazy" vacuum |
| */ |
| if (vacstmt->full) |
| full_vacuum_rel(onerel, vacstmt, updated_stats); |
| else |
| lazy_vacuum_rel(onerel, vacstmt, updated_stats); |
| |
| StrategyHintVacuum(false); |
| |
| /* Restore userid */ |
| SetUserIdAndContext(save_userid, save_secdefcxt); |
| |
| /* |
| * If the relation has a secondary toast rel, vacuum that too while we |
| * still hold the session lock on the master table. Note however that |
| * "analyze" will not get done on the toast table. This is good, because |
| * the toaster always uses hardcoded index access and statistics are |
| * totally unimportant for toast relations. |
| */ |
| if (toast_relid != InvalidOid) |
| { |
| Relation toast_rel = open_relation_and_check_permission(vacstmt, toast_relid, |
| RELKIND_TOASTVALUE); |
| if (toast_rel != NULL) |
| { |
| vacuum_rel(toast_rel, vacstmt, lmode, updated_stats); |
| |
| /* all done with this class, but hold lock until commit */ |
| relation_close(toast_rel, NoLock); |
| } |
| } |
| |
| /* do the same for an AO segments table, if any */ |
| if (aoseg_relid != InvalidOid) |
| { |
| Relation aoseg_rel = open_relation_and_check_permission(vacstmt, aoseg_relid, |
| RELKIND_AOSEGMENTS); |
| if (aoseg_rel != NULL) |
| { |
| vacuum_rel(aoseg_rel, vacstmt, lmode, updated_stats); |
| |
| /* all done with this class, but hold lock until commit */ |
| relation_close(aoseg_rel, NoLock); |
| } |
| } |
| |
| /* do the same for an AO block directory table, if any */ |
| if (aoblkdir_relid != InvalidOid) |
| { |
| Relation aoblkdir_rel = open_relation_and_check_permission(vacstmt, aoblkdir_relid, |
| RELKIND_AOBLOCKDIR); |
| if (aoblkdir_rel != NULL) |
| { |
| vacuum_rel(aoblkdir_rel, vacstmt, lmode, updated_stats); |
| |
| /* all done with this class, but hold lock until commit */ |
| relation_close(aoblkdir_rel, NoLock); |
| } |
| } |
| } |
| |
| |
| /**************************************************************************** |
| * * |
| * Code for VACUUM FULL (only) * |
| * * |
| **************************************************************************** |
| */ |
| |
| |
| /* |
| * full_vacuum_rel() -- perform FULL VACUUM for one heap relation |
| * |
| * This routine vacuums a single heap, cleans out its indexes, and |
| * updates its num_pages and num_tuples statistics. |
| * |
| * At entry, we have already established a transaction and opened |
| * and locked the relation. |
| */ |
| static void |
| full_vacuum_rel(Relation onerel, VacuumStmt *vacstmt, List *updated_stats) |
| { |
| VacPageListData vacuum_pages; /* List of pages to vacuum and/or |
| * clean indexes */ |
| VacPageListData fraged_pages = /* List of pages with space enough for */ |
| { /* re-using */ |
| 0, /* empty_end_pages */ |
| 0, /* num_pages */ |
| 0, /* num_allocated_pages */ |
		NULL					/* pagedesc */
| }; |
| |
| Relation *Irel; |
| int nindexes, |
| i; |
| VRelStats *vacrelstats; |
| int reindex_count = 1; |
| |
| vacuum_set_xid_limits(vacstmt, onerel->rd_rel->relisshared, |
| &OldestXmin, &FreezeLimit); |
| |
| /* |
| * Set up statistics-gathering machinery. |
| */ |
| vacrelstats = (VRelStats *) palloc(sizeof(VRelStats)); |
| vacrelstats->rel_pages = 0; |
| vacrelstats->rel_tuples = 0; |
| vacrelstats->hasindex = false; |
| |
| if(RelationIsAoRows(onerel)) |
| { |
| /* append-only relation. has a special path */ |
| vacuum_appendonly_rel(onerel, vacrelstats, /* FULL */ true); |
| } |
| else if (RelationIsParquet(onerel)) |
| { |
| vacuum_parquet_rel(onerel, vacrelstats, true); |
| } |
| else |
| { |
| Assert(RelationIsHeap(onerel)); |
| |
| /* scan the heap */ |
| vacuum_pages.num_pages = fraged_pages.num_pages = 0; |
| scan_heap(vacrelstats, onerel, &vacuum_pages, &fraged_pages); |
| |
| /* Now open all indexes of the relation */ |
| vac_open_indexes(onerel, AccessExclusiveLock, &nindexes, &Irel); |
| if (nindexes > 0) |
| vacrelstats->hasindex = true; |
| |
| /* Clean/scan index relation(s) */ |
| if (Irel != NULL) |
| { |
| if (vacuum_pages.num_pages > 0) |
| { |
| for (i = 0; i < nindexes; i++) |
| { |
| List *extra_oids = |
| get_oids_for_bitmap(vacstmt->extra_oids, Irel[i], onerel, reindex_count); |
| |
| vacuum_index(&vacuum_pages, Irel[i], |
| vacrelstats->rel_tuples, 0, updated_stats, extra_oids); |
| list_free(extra_oids); |
| } |
| reindex_count++; |
| } |
| else |
| { |
| /* just scan indexes to update statistic */ |
| for (i = 0; i < nindexes; i++) |
| scan_index(Irel[i], vacrelstats->rel_tuples, updated_stats); |
| } |
| } |
| |
| if (fraged_pages.num_pages > 0) |
| { |
| /* Try to shrink heap */ |
| repair_frag(vacrelstats, onerel, &vacuum_pages, &fraged_pages, |
| nindexes, Irel, updated_stats, vacstmt->extra_oids, reindex_count); |
| vac_close_indexes(nindexes, Irel, NoLock); |
| } |
| else |
| { |
| vac_close_indexes(nindexes, Irel, NoLock); |
| if (vacuum_pages.num_pages > 0) |
| { |
| /* Clean pages from vacuum_pages list */ |
| vacuum_heap(vacrelstats, onerel, &vacuum_pages); |
| } |
| } |
| |
| /* update shared free space map with final free space info */ |
| vac_update_fsm(onerel, &fraged_pages, vacrelstats->rel_pages); |
| } |
| |
| /* update statistics in pg_class */ |
| vac_update_relstats(onerel, vacrelstats->rel_pages, |
| vacrelstats->rel_tuples, vacrelstats->hasindex, |
| FreezeLimit, updated_stats); |
| |
| /* report results to the stats collector, too */ |
| pgstat_report_vacuum(RelationGetRelid(onerel), onerel->rd_rel->relisshared, true, |
| vacstmt->analyze, vacrelstats->rel_tuples); |
| } |
| |
| |
| /* |
| * scan_heap() -- scan an open heap relation |
| * |
| * This routine sets commit status bits, constructs vacuum_pages (list |
| * of pages we need to compact free space on and/or clean indexes of |
| * deleted tuples), constructs fraged_pages (list of pages with free |
| * space that tuples could be moved into), and calculates statistics |
| * on the number of live tuples in the heap. |
| */ |
| static void |
| scan_heap(VRelStats *vacrelstats, Relation onerel, |
| VacPageList vacuum_pages, VacPageList fraged_pages) |
| { |
| MIRROREDLOCK_BUFMGR_DECLARE; |
| |
| BlockNumber nblocks, |
| blkno; |
| char *relname; |
| VacPage vacpage; |
| BlockNumber empty_pages, |
| empty_end_pages; |
| double num_tuples, |
| tups_vacuumed, |
| nkeep, |
| nunused; |
| double free_space, |
| usable_free_space; |
| Size min_tlen = MaxTupleSize; |
| Size max_tlen = 0; |
| bool do_shrinking = true; |
| VTupleLink vtlinks = (VTupleLink) palloc(100 * sizeof(VTupleLinkData)); |
| int num_vtlinks = 0; |
| int free_vtlinks = 100; |
| PGRUsage ru0; |
| |
| pg_rusage_init(&ru0); |
| |
| relname = RelationGetRelationName(onerel); |
| ereport(elevel, |
| (errmsg("vacuuming \"%s.%s\"", |
| get_namespace_name(RelationGetNamespace(onerel)), |
| relname))); |
| |
| empty_pages = empty_end_pages = 0; |
| num_tuples = tups_vacuumed = nkeep = nunused = 0; |
| free_space = 0; |
| |
| nblocks = RelationGetNumberOfBlocks(onerel); |
| |
| /* |
| * We initially create each VacPage item in a maximal-sized workspace, |
| * then copy the workspace into a just-large-enough copy. |
| */ |
| vacpage = (VacPage) palloc(sizeof(VacPageData) + MaxOffsetNumber * sizeof(OffsetNumber)); |
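
	/*
	 * (sizeof(VacPageData) already includes one OffsetNumber, so this is
	 * marginally more than enough; copy_vac_page() presumably sizes each
	 * saved copy by the offsets actually recorded.)
	 */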
| |
| for (blkno = 0; blkno < nblocks; blkno++) |
| { |
| Page page, |
| tempPage = NULL; |
| bool do_reap, |
| do_frag; |
| Buffer buf; |
| OffsetNumber offnum, |
| maxoff; |
| bool notup; |
| OffsetNumber frozen[MaxOffsetNumber]; |
| int nfrozen; |
| |
| vacuum_delay_point(); |
| |
| // -------- MirroredLock ---------- |
| MIRROREDLOCK_BUFMGR_LOCK; |
| |
| buf = ReadBuffer(onerel, blkno); |
| page = BufferGetPage(buf); |
| |
| /* |
| * Since we are holding exclusive lock on the relation, no other |
| * backend can be accessing the page; however it is possible that the |
| * background writer will try to write the page if it's already marked |
| * dirty. To ensure that invalid data doesn't get written to disk, we |
| * must take exclusive buffer lock wherever we potentially modify |
| * pages. |
| */ |
| LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); |
| |
| vacpage->blkno = blkno; |
| vacpage->offsets_used = 0; |
| vacpage->offsets_free = 0; |
| |
| if (PageIsNew(page)) |
| { |
| VacPage vacpagecopy; |
| |
| ereport(WARNING, |
| (errmsg("relation \"%s\" page %u is uninitialized --- fixing", |
| relname, blkno))); |
| PageInit(page, BufferGetPageSize(buf), 0); |
| MarkBufferDirty(buf); |
| vacpage->free = PageGetFreeSpaceWithFillFactor(onerel, page); |
| free_space += vacpage->free; |
| empty_pages++; |
| empty_end_pages++; |
| vacpagecopy = copy_vac_page(vacpage); |
| vpage_insert(vacuum_pages, vacpagecopy); |
| vpage_insert(fraged_pages, vacpagecopy); |
| UnlockReleaseBuffer(buf); |
| |
| MIRROREDLOCK_BUFMGR_UNLOCK; |
| // -------- MirroredLock ---------- |
| |
| continue; |
| } |
| |
| if (PageIsEmpty(page)) |
| { |
| VacPage vacpagecopy; |
| |
| vacpage->free = PageGetFreeSpaceWithFillFactor(onerel, page); |
| free_space += vacpage->free; |
| empty_pages++; |
| empty_end_pages++; |
| vacpagecopy = copy_vac_page(vacpage); |
| vpage_insert(vacuum_pages, vacpagecopy); |
| vpage_insert(fraged_pages, vacpagecopy); |
| UnlockReleaseBuffer(buf); |
| |
| MIRROREDLOCK_BUFMGR_UNLOCK; |
| // -------- MirroredLock ---------- |
| |
| continue; |
| } |
| |
| nfrozen = 0; |
| notup = true; |
| maxoff = PageGetMaxOffsetNumber(page); |
| for (offnum = FirstOffsetNumber; |
| offnum <= maxoff; |
| offnum = OffsetNumberNext(offnum)) |
| { |
| ItemId itemid = PageGetItemId(page, offnum); |
| bool tupgone = false; |
| HeapTupleData tuple; |
| |
| /* |
| * Collect un-used items too - it's possible to have indexes |
| * pointing here after crash. |
| */ |
| if (!ItemIdIsUsed(itemid)) |
| { |
| vacpage->offsets[vacpage->offsets_free++] = offnum; |
| nunused += 1; |
| continue; |
| } |
| |
| tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); |
| tuple.t_len = ItemIdGetLength(itemid); |
| ItemPointerSet(&(tuple.t_self), blkno, offnum); |
| |
| switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf, true)) |
| { |
| case HEAPTUPLE_DEAD: |
| tupgone = true; /* we can delete the tuple */ |
| break; |
| case HEAPTUPLE_LIVE: |
| /* Tuple is good --- but let's do some validity checks */ |
| if (onerel->rd_rel->relhasoids && |
| !OidIsValid(HeapTupleGetOid(&tuple))) |
| elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid", |
| relname, blkno, offnum); |
| break; |
| case HEAPTUPLE_RECENTLY_DEAD: |
| |
| /* |
| * If tuple is recently deleted then we must not remove it |
| * from relation. |
| */ |
| nkeep += 1; |
| |
| /* |
| * If we do shrinking and this tuple is updated one then |
| * remember it to construct updated tuple dependencies. |
| */ |
| if (do_shrinking && |
| !(ItemPointerEquals(&(tuple.t_self), |
| &(tuple.t_data->t_ctid)))) |
| { |
| if (free_vtlinks == 0) |
| { |
| free_vtlinks = 1000; |
| vtlinks = (VTupleLink) repalloc(vtlinks, |
| (free_vtlinks + num_vtlinks) * |
| sizeof(VTupleLinkData)); |
| } |
| vtlinks[num_vtlinks].new_tid = tuple.t_data->t_ctid; |
| vtlinks[num_vtlinks].this_tid = tuple.t_self; |
| free_vtlinks--; |
| num_vtlinks++; |
| } |
| break; |
| case HEAPTUPLE_INSERT_IN_PROGRESS: |
| |
| /* |
| * This should not happen, since we hold exclusive lock on |
| * the relation; shouldn't we raise an error? (Actually, |
| * it can happen in system catalogs, since we tend to |
| * release write lock before commit there.) |
| */ |
| ereport(NOTICE, |
| (errmsg("relation \"%s\" TID %u/%u: InsertTransactionInProgress %u --- can't shrink relation", |
| relname, blkno, offnum, HeapTupleHeaderGetXmin(tuple.t_data)))); |
| do_shrinking = false; |
| break; |
| case HEAPTUPLE_DELETE_IN_PROGRESS: |
| |
| /* |
| * This should not happen, since we hold exclusive lock on |
| * the relation; shouldn't we raise an error? (Actually, |
| * it can happen in system catalogs, since we tend to |
| * release write lock before commit there.) |
| */ |
| ereport(NOTICE, |
| (errmsg("relation \"%s\" TID %u/%u: DeleteTransactionInProgress %u --- can't shrink relation", |
| relname, blkno, offnum, HeapTupleHeaderGetXmax(tuple.t_data)))); |
| do_shrinking = false; |
| break; |
| default: |
| elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); |
| break; |
| } |
| |
| if (tupgone) |
| { |
| ItemId lpp; |
| |
| /* |
| * Here we are building a temporary copy of the page with dead |
| * tuples removed. Below we will apply |
| * PageRepairFragmentation to the copy, so that we can |
| * determine how much space will be available after removal of |
| * dead tuples. But note we are NOT changing the real page |
| * yet... |
| */ |
| if (tempPage == NULL) |
| { |
| Size pageSize; |
| |
| pageSize = PageGetPageSize(page); |
| tempPage = (Page) palloc(pageSize); |
| memcpy(tempPage, page, pageSize); |
| } |
| |
| /* mark it unused on the temp page */ |
| lpp = PageGetItemId(tempPage, offnum); |
| lpp->lp_flags &= ~LP_USED; |
| |
| vacpage->offsets[vacpage->offsets_free++] = offnum; |
| tups_vacuumed += 1; |
| } |
| else |
| { |
| num_tuples += 1; |
| notup = false; |
| if (tuple.t_len < min_tlen) |
| min_tlen = tuple.t_len; |
| if (tuple.t_len > max_tlen) |
| max_tlen = tuple.t_len; |
| |
| /* |
| * Each non-removable tuple must be checked to see if it |
| * needs freezing. |
| */ |
| if (heap_freeze_tuple(tuple.t_data, FreezeLimit, |
| InvalidBuffer)) |
| frozen[nfrozen++] = offnum; |
| } |
| } /* scan along page */ |
| |
| if (tempPage != NULL) |
| { |
| /* Some tuples are removable; figure free space after removal */ |
| PageRepairFragmentation(tempPage, NULL); |
| vacpage->free = PageGetFreeSpaceWithFillFactor(onerel, tempPage); |
| pfree(tempPage); |
| do_reap = true; |
| } |
| else |
| { |
| /* Just use current available space */ |
| vacpage->free = PageGetFreeSpaceWithFillFactor(onerel, page); |
| /* Need to reap the page if it has ~LP_USED line pointers */ |
| do_reap = (vacpage->offsets_free > 0); |
| } |
| |
| free_space += vacpage->free; |
| |
| /* |
| * Add the page to vacuum_pages if it requires reaping, and add it to |
| * fraged_pages if it has a useful amount of free space. "Useful" |
| * means enough for a minimal-sized tuple. But we don't know that |
| * accurately near the start of the relation, so add pages |
| * unconditionally if they have >= BLCKSZ/10 free space. Also |
| * forcibly add pages with no live tuples, to avoid confusing the |
| * empty_end_pages logic. (In the presence of unreasonably small |
| * fillfactor, it seems possible that such pages might not pass |
| * the free-space test, but they had better be in the list anyway.) |
| */ |
| do_frag = (vacpage->free >= min_tlen || vacpage->free >= BLCKSZ / 10 || |
| notup); |
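
		/*
		 * With the default BLCKSZ of 8192, that unconditional threshold is
		 * 819 bytes of free space.
		 */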
| |
| if (do_reap || do_frag) |
| { |
| VacPage vacpagecopy = copy_vac_page(vacpage); |
| |
| if (do_reap) |
| vpage_insert(vacuum_pages, vacpagecopy); |
| if (do_frag) |
| vpage_insert(fraged_pages, vacpagecopy); |
| } |
| |
| /* |
| * Include the page in empty_end_pages if it will be empty after |
| * vacuuming; this is to keep us from using it as a move destination. |
| * Note that such pages are guaranteed to be in fraged_pages. |
| */ |
| if (notup) |
| { |
| empty_pages++; |
| empty_end_pages++; |
| } |
| else |
| empty_end_pages = 0; |
| |
| /* |
| * If we froze any tuples, mark the buffer dirty, and write a WAL |
| * record recording the changes. We must log the changes to be |
| * crash-safe against future truncation of CLOG. |
| */ |
| if (nfrozen > 0) |
| { |
| MarkBufferDirty(buf); |
| /* no XLOG for temp tables, though */ |
| if (!onerel->rd_istemp) |
| { |
| XLogRecPtr recptr; |
| |
| recptr = log_heap_freeze(onerel, buf, FreezeLimit, |
| frozen, nfrozen); |
| PageSetLSN(page, recptr); |
| PageSetTLI(page, ThisTimeLineID); |
| } |
| } |
| |
| UnlockReleaseBuffer(buf); |
| |
| MIRROREDLOCK_BUFMGR_UNLOCK; |
| // -------- MirroredLock ---------- |
| |
| } |
| |
| pfree(vacpage); |
| |
| /* save stats in the rel list for use later */ |
| vacrelstats->rel_tuples = num_tuples; |
| vacrelstats->rel_pages = nblocks; |
| if (num_tuples == 0) |
| min_tlen = max_tlen = 0; |
| vacrelstats->min_tlen = min_tlen; |
| vacrelstats->max_tlen = max_tlen; |
| |
| vacuum_pages->empty_end_pages = empty_end_pages; |
| fraged_pages->empty_end_pages = empty_end_pages; |
| |
| /* |
| * Clear the fraged_pages list if we found we couldn't shrink. Else, |
| * remove any "empty" end-pages from the list, and compute usable free |
| * space = free space in remaining pages. |
| */ |
| if (do_shrinking) |
| { |
| int i; |
| |
| Assert((BlockNumber) fraged_pages->num_pages >= empty_end_pages); |
| fraged_pages->num_pages -= empty_end_pages; |
| usable_free_space = 0; |
| for (i = 0; i < fraged_pages->num_pages; i++) |
| usable_free_space += fraged_pages->pagedesc[i]->free; |
| } |
| else |
| { |
| fraged_pages->num_pages = 0; |
| usable_free_space = 0; |
| } |
| |
| /* don't bother to save vtlinks if we will not call repair_frag */ |
| if (fraged_pages->num_pages > 0 && num_vtlinks > 0) |
| { |
| qsort((char *) vtlinks, num_vtlinks, sizeof(VTupleLinkData), |
| vac_cmp_vtlinks); |
| vacrelstats->vtlinks = vtlinks; |
| vacrelstats->num_vtlinks = num_vtlinks; |
| } |
| else |
| { |
| vacrelstats->vtlinks = NULL; |
| vacrelstats->num_vtlinks = 0; |
| pfree(vtlinks); |
| } |
| |
| ereport(elevel, |
| (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u pages", |
| RelationGetRelationName(onerel), |
| tups_vacuumed, num_tuples, nblocks), |
| errdetail("%.0f dead row versions cannot be removed yet.\n" |
| "Nonremovable row versions range from %lu to %lu bytes long.\n" |
| "There were %.0f unused item pointers.\n" |
| "Total free space (including removable row versions) is %.0f bytes.\n" |
| "%u pages are or will become empty, including %u at the end of the table.\n" |
| "%u pages containing %.0f free bytes are potential move destinations.\n" |
| "%s.", |
| nkeep, |
| (unsigned long) min_tlen, (unsigned long) max_tlen, |
| nunused, |
| free_space, |
| empty_pages, empty_end_pages, |
| fraged_pages->num_pages, usable_free_space, |
| pg_rusage_show(&ru0)))); |
| } |
| |
| |
| /* |
| * repair_frag() -- try to repair relation's fragmentation |
| * |
| * This routine marks dead tuples as unused and tries to re-use dead |
| * space by moving tuples (and inserting indexes if needed). It |
| * constructs the Nvacpagelist list of freed pages (from which tuples |
| * were moved) and cleans the indexes for them after committing the |
| * current transaction (in a hackish manner -- without losing locks or |
| * freeing memory!). It truncates the relation if some end-blocks have |
| * become empty. |
| */ |
| static void |
| repair_frag(VRelStats *vacrelstats, Relation onerel, |
| VacPageList vacuum_pages, VacPageList fraged_pages, |
| int nindexes, Relation *Irel, List *updated_stats, |
| List *all_extra_oids, int reindex_count) |
| { |
| MIRROREDLOCK_BUFMGR_DECLARE; |
| |
| TransactionId myXID = GetCurrentTransactionId(); |
| Buffer dst_buffer = InvalidBuffer; |
| BlockNumber nblocks, |
| blkno; |
| BlockNumber last_move_dest_block = 0, |
| last_vacuum_block; |
| Page dst_page = NULL; |
| ExecContextData ec; |
| VacPageListData Nvacpagelist; |
| VacPage dst_vacpage = NULL, |
| last_vacuum_page, |
| vacpage, |
| *curpage; |
| int i; |
| int num_moved = 0, |
| num_fraged_pages, |
| vacuumed_pages; |
| int keep_tuples = 0; |
| PGRUsage ru0; |
| |
| pg_rusage_init(&ru0); |
| |
| // Fetch gp_persistent_relation_node information that will be added to XLOG record. |
| RelationFetchGpRelationNodeForXLog(onerel); |
| |
| ExecContext_Init(&ec, onerel); |
| |
| Nvacpagelist.num_pages = 0; |
| num_fraged_pages = fraged_pages->num_pages; |
| Assert((BlockNumber) vacuum_pages->num_pages >= vacuum_pages->empty_end_pages); |
| vacuumed_pages = vacuum_pages->num_pages - vacuum_pages->empty_end_pages; |
| if (vacuumed_pages > 0) |
| { |
| /* get last reaped page from vacuum_pages */ |
| last_vacuum_page = vacuum_pages->pagedesc[vacuumed_pages - 1]; |
| last_vacuum_block = last_vacuum_page->blkno; |
| } |
| else |
| { |
| last_vacuum_page = NULL; |
| last_vacuum_block = InvalidBlockNumber; |
| } |
| |
| vacpage = (VacPage) palloc(sizeof(VacPageData) + MaxOffsetNumber * sizeof(OffsetNumber)); |
| vacpage->offsets_used = vacpage->offsets_free = 0; |
| |
| /* |
| * Scan pages backwards from the last nonempty page, trying to move tuples |
| * down to lower pages. Quit when we reach a page that we have moved any |
| * tuples onto, or the first page if we haven't moved anything, or when we |
| * find a page we cannot completely empty (this last condition is handled |
| * by "break" statements within the loop). |
| * |
| * NB: this code depends on the vacuum_pages and fraged_pages lists being |
| * in order by blkno. |
| */ |
| nblocks = vacrelstats->rel_pages; |
| for (blkno = nblocks - vacuum_pages->empty_end_pages - 1; |
| blkno > last_move_dest_block; |
| blkno--) |
| { |
| Buffer buf; |
| Page page; |
| OffsetNumber offnum, |
| maxoff; |
| bool isempty, |
| chain_tuple_moved; |
| |
| vacuum_delay_point(); |
| |
| /* |
| * Forget fraged_pages pages at or after this one; they're no longer |
| * useful as move targets, since we only want to move down. Note that |
| * since we stop the outer loop at last_move_dest_block, pages removed |
| * here cannot have had anything moved onto them already. |
| * |
| * Also note that we don't change the stored fraged_pages list, only |
| * our local variable num_fraged_pages; so the forgotten pages are |
| * still available to be loaded into the free space map later. |
| */ |
| while (num_fraged_pages > 0 && |
| fraged_pages->pagedesc[num_fraged_pages - 1]->blkno >= blkno) |
| { |
| Assert(fraged_pages->pagedesc[num_fraged_pages - 1]->offsets_used == 0); |
| --num_fraged_pages; |
| } |
| |
| /* |
| * Process this page of relation. |
| */ |
| |
| // -------- MirroredLock ---------- |
| MIRROREDLOCK_BUFMGR_LOCK; |
| |
| buf = ReadBuffer(onerel, blkno); |
| page = BufferGetPage(buf); |
| |
| vacpage->offsets_free = 0; |
| |
| isempty = PageIsEmpty(page); |
| |
| /* Is the page in the vacuum_pages list? */ |
| if (blkno == last_vacuum_block) |
| { |
| if (last_vacuum_page->offsets_free > 0) |
| { |
| /* there are dead tuples on this page - clean them */ |
| Assert(!isempty); |
| LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); |
| vacuum_page(onerel, buf, last_vacuum_page); |
| LockBuffer(buf, BUFFER_LOCK_UNLOCK); |
| } |
| else |
| Assert(isempty); |
| --vacuumed_pages; |
| if (vacuumed_pages > 0) |
| { |
| /* get prev reaped page from vacuum_pages */ |
| last_vacuum_page = vacuum_pages->pagedesc[vacuumed_pages - 1]; |
| last_vacuum_block = last_vacuum_page->blkno; |
| } |
| else |
| { |
| last_vacuum_page = NULL; |
| last_vacuum_block = InvalidBlockNumber; |
| } |
| if (isempty) |
| { |
| |
| MIRROREDLOCK_BUFMGR_UNLOCK; |
| // -------- MirroredLock ---------- |
| |
| ReleaseBuffer(buf); |
| continue; |
| } |
| } |
| else |
| Assert(!isempty); |
| |
| chain_tuple_moved = false; /* no chain tuple has been moved |
| * off this page yet */ |
| vacpage->blkno = blkno; |
| maxoff = PageGetMaxOffsetNumber(page); |
| for (offnum = FirstOffsetNumber; |
| offnum <= maxoff; |
| offnum = OffsetNumberNext(offnum)) |
| { |
| Size tuple_len; |
| HeapTupleData tuple; |
| ItemId itemid = PageGetItemId(page, offnum); |
| |
| if (!ItemIdIsUsed(itemid)) |
| continue; |
| |
| tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); |
| tuple_len = tuple.t_len = ItemIdGetLength(itemid); |
| ItemPointerSet(&(tuple.t_self), blkno, offnum); |
| |
| /* --- |
| * VACUUM FULL has an exclusive lock on the relation. So |
| * normally no other transaction can have pending INSERTs or |
| * DELETEs in this relation. A tuple is either: |
| * (a) a tuple in a system catalog, inserted or deleted |
| * by a not yet committed transaction |
| * (b) known dead (XMIN_INVALID, or XMAX_COMMITTED and xmax |
| * is visible to all active transactions) |
| * (c) inserted by a committed xact (XMIN_COMMITTED) |
| * (d) moved by the currently running VACUUM. |
| * (e) deleted (XMAX_COMMITTED) but at least one active |
| * transaction does not see the deleting transaction |
| * In case (a) we wouldn't be in repair_frag() at all. In case |
| * (b) we cannot be here, because scan_heap() has already marked |
| * the item as unused; see the "continue" above. Case (c) is |
| * what is normally to be expected; tuples in cases (c) and (e) |
| * have XMIN_COMMITTED set by the visibility checks in scan_heap(). |
| * Case (d) is only possible if a whole tuple chain has been |
| * moved while processing this or a higher-numbered block. |
| * --- |
| */ |
| if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED)) |
| { |
| if (tuple.t_data->t_infomask & HEAP_MOVED_IN) |
| elog(ERROR, "HEAP_MOVED_IN was not expected"); |
| if (!(tuple.t_data->t_infomask & HEAP_MOVED_OFF)) |
| elog(ERROR, "HEAP_MOVED_OFF was expected"); |
| |
| /* |
| * MOVED_OFF by another VACUUM would have caused the |
| * visibility check to set XMIN_COMMITTED or XMIN_INVALID. |
| */ |
| if (HeapTupleHeaderGetXvac(tuple.t_data) != myXID) |
| elog(ERROR, "invalid XVAC in tuple header"); |
| |
| /* |
| * If this (chain) tuple has already been moved by me, I have to |
| * check whether it is recorded in vacpage or not - i.e. whether |
| * it was moved while cleaning this page or some previous one. |
| */ |
| |
| /* Can't we Assert(keep_tuples > 0) here? */ |
| if (keep_tuples == 0) |
| continue; |
| if (chain_tuple_moved) |
| { |
| /* some chains were moved while cleaning this page */ |
| Assert(vacpage->offsets_free > 0); |
| for (i = 0; i < vacpage->offsets_free; i++) |
| { |
| if (vacpage->offsets[i] == offnum) |
| break; |
| } |
| if (i >= vacpage->offsets_free) /* not found */ |
| { |
| vacpage->offsets[vacpage->offsets_free++] = offnum; |
| keep_tuples--; |
| } |
| } |
| else |
| { |
| vacpage->offsets[vacpage->offsets_free++] = offnum; |
| keep_tuples--; |
| } |
| continue; |
| } |
| |
| /* |
| * If this tuple is in a chain of tuples created in updates by |
| * "recent" transactions then we have to move the whole chain of |
| * tuples to other places, so that we can write new t_ctid links |
| * that preserve the chain relationship. |
| * |
| * This test is complicated. Read it as "if tuple is a recently |
| * created updated version, OR if it is an obsoleted version". (In |
| * the second half of the test, we needn't make any check on XMAX |
| * --- it must be recently obsoleted, else scan_heap would have |
| * deemed it removable.) |
| * |
| * NOTE: this test is not 100% accurate: it is possible for a |
| * tuple to be an updated one with recent xmin, and yet not match |
| * any new_tid entry in the vtlinks list. Presumably there was |
| * once a parent tuple with xmax matching the xmin, but it's |
| * possible that that tuple has been removed --- for example, if |
| * it had xmin = xmax and wasn't itself an updated version, then |
| * HeapTupleSatisfiesVacuum would deem it removable as soon as the |
| * xmin xact completes. |
| * |
| * To be on the safe side, we abandon the repair_frag process if |
| * we cannot find the parent tuple in vtlinks. This may be overly |
| * conservative; AFAICS it would be safe to move the chain. |
| * |
| * Also, because we distinguish DEAD and RECENTLY_DEAD tuples |
| * using OldestXmin, which is a rather coarse test, it is quite |
| * possible to have an update chain in which a tuple we think is |
| * RECENTLY_DEAD links forward to one that is definitely DEAD. |
| * In such a case the RECENTLY_DEAD tuple must actually be dead, |
| * but it seems too complicated to try to make VACUUM remove it. |
| * We treat each contiguous set of RECENTLY_DEAD tuples as a |
| * separately movable chain, ignoring any intervening DEAD ones. |
| */ |
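| /* |
| * Worked example (hypothetical XIDs): suppose OldestXmin = 100. A |
| * tuple with HEAP_UPDATED set and xmin = 120 is a recently created |
| * updated version, so the first half of the test fires. A tuple |
| * whose xmax is neither invalid nor merely a row lock, and whose |
| * t_ctid points to a different tuple, is an obsoleted version with |
| * a successor in its update chain, so the second half fires (per |
| * the note above, it must be recently obsoleted). |
| */ |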
| if (((tuple.t_data->t_infomask & HEAP_UPDATED) && |
| !TransactionIdPrecedes(HeapTupleHeaderGetXmin(tuple.t_data), |
| OldestXmin)) || |
| (!(tuple.t_data->t_infomask & (HEAP_XMAX_INVALID | |
| HEAP_IS_LOCKED)) && |
| !(ItemPointerEquals(&(tuple.t_self), |
| &(tuple.t_data->t_ctid))))) |
| { |
| Buffer Cbuf = buf; |
| bool freeCbuf = false; |
| bool chain_move_failed = false; |
| bool moved_target = false; |
| ItemPointerData Ctid; |
| HeapTupleData tp = tuple; |
| Size tlen = tuple_len; |
| VTupleMove vtmove; |
| int num_vtmove; |
| int free_vtmove; |
| VacPage to_vacpage = NULL; |
| int to_item = 0; |
| int ti; |
| |
| if (dst_buffer != InvalidBuffer) |
| { |
| ReleaseBuffer(dst_buffer); |
| dst_buffer = InvalidBuffer; |
| } |
| |
| /* Quick exit if we have no vtlinks to search in */ |
| if (vacrelstats->vtlinks == NULL) |
| { |
| elog(DEBUG2, "parent item in update-chain not found --- can't continue repair_frag"); |
| break; /* out of walk-along-page loop */ |
| } |
| |
| /* |
| * If this tuple is at the beginning or in the middle of the |
| * chain then we have to move to the end of the chain. As |
| * with any t_ctid chase, we have to verify that each new |
| * tuple is really the descendant of the tuple we came from; |
| * however, here we need even more than the normal amount of |
| * paranoia. |
| * If t_ctid links forward to a tuple determined to be DEAD, |
| * then depending on where that tuple is, it might already |
| * have been removed, and perhaps even replaced by a MOVED_IN |
| * tuple. We don't want to include any DEAD tuples in the |
| * chain, so we have to recheck HeapTupleSatisfiesVacuum. |
| */ |
| while (!(tp.t_data->t_infomask & (HEAP_XMAX_INVALID | |
| HEAP_IS_LOCKED)) && |
| !(ItemPointerEquals(&(tp.t_self), |
| &(tp.t_data->t_ctid)))) |
| { |
| ItemPointerData nextTid; |
| TransactionId priorXmax; |
| Buffer nextBuf; |
| Page nextPage; |
| OffsetNumber nextOffnum; |
| ItemId nextItemid; |
| HeapTupleHeader nextTdata; |
| HTSV_Result nextTstatus; |
| |
| nextTid = tp.t_data->t_ctid; |
| priorXmax = HeapTupleHeaderGetXmax(tp.t_data); |
| /* assume block# is OK (see heap_fetch comments) */ |
| nextBuf = ReadBuffer(onerel, |
| ItemPointerGetBlockNumber(&nextTid)); |
| nextPage = BufferGetPage(nextBuf); |
| /* If bogus or unused slot, assume tp is end of chain */ |
| nextOffnum = ItemPointerGetOffsetNumber(&nextTid); |
| if (nextOffnum < FirstOffsetNumber || |
| nextOffnum > PageGetMaxOffsetNumber(nextPage)) |
| { |
| ReleaseBuffer(nextBuf); |
| break; |
| } |
| nextItemid = PageGetItemId(nextPage, nextOffnum); |
| if (!ItemIdIsUsed(nextItemid)) |
| { |
| ReleaseBuffer(nextBuf); |
| break; |
| } |
| /* if not matching XMIN, assume tp is end of chain */ |
| nextTdata = (HeapTupleHeader) PageGetItem(nextPage, |
| nextItemid); |
| if (!TransactionIdEquals(HeapTupleHeaderGetXmin(nextTdata), |
| priorXmax)) |
| { |
| ReleaseBuffer(nextBuf); |
| break; |
| } |
| /* must check for DEAD or MOVED_IN tuple, too */ |
| nextTstatus = HeapTupleSatisfiesVacuum(nextTdata, |
| OldestXmin, |
| nextBuf, true); |
| if (nextTstatus == HEAPTUPLE_DEAD || |
| nextTstatus == HEAPTUPLE_INSERT_IN_PROGRESS) |
| { |
| ReleaseBuffer(nextBuf); |
| break; |
| } |
| /* if it's MOVED_OFF we should have moved this one with it */ |
| if (nextTstatus == HEAPTUPLE_DELETE_IN_PROGRESS) |
| elog(ERROR, "updated tuple is already HEAP_MOVED_OFF"); |
| /* OK, switch our attention to the next tuple in chain */ |
| tp.t_data = nextTdata; |
| tp.t_self = nextTid; |
| tlen = tp.t_len = ItemIdGetLength(nextItemid); |
| if (freeCbuf) |
| ReleaseBuffer(Cbuf); |
| Cbuf = nextBuf; |
| freeCbuf = true; |
| } |
| |
| /* Set up workspace for planning the chain move */ |
| vtmove = (VTupleMove) palloc(100 * sizeof(VTupleMoveData)); |
| num_vtmove = 0; |
| free_vtmove = 100; |
| |
| /* |
| * Now, walk backwards up the chain (towards older tuples) and |
| * check if all items in chain can be moved. We record all |
| * the moves that need to be made in the vtmove array. |
| */ |
| for (;;) |
| { |
| Buffer Pbuf; |
| Page Ppage; |
| ItemId Pitemid; |
| HeapTupleHeader PTdata; |
| VTupleLinkData vtld, |
| *vtlp; |
| |
| /* Identify a target page to move this tuple to */ |
| if (to_vacpage == NULL || |
| !enough_space(to_vacpage, tlen)) |
| { |
| for (i = 0; i < num_fraged_pages; i++) |
| { |
| if (enough_space(fraged_pages->pagedesc[i], tlen)) |
| break; |
| } |
| |
| if (i == num_fraged_pages) |
| { |
| /* can't move item anywhere */ |
| chain_move_failed = true; |
| break; /* out of check-all-items loop */ |
| } |
| to_item = i; |
| to_vacpage = fraged_pages->pagedesc[to_item]; |
| } |
| to_vacpage->free -= MAXALIGN(tlen); |
| if (to_vacpage->offsets_used >= to_vacpage->offsets_free) |
| to_vacpage->free -= sizeof(ItemIdData); |
| (to_vacpage->offsets_used)++; |
| |
| /* Add an entry to vtmove list */ |
| if (free_vtmove == 0) |
| { |
| free_vtmove = 1000; |
| vtmove = (VTupleMove) |
| repalloc(vtmove, |
| (free_vtmove + num_vtmove) * |
| sizeof(VTupleMoveData)); |
| } |
| vtmove[num_vtmove].tid = tp.t_self; |
| vtmove[num_vtmove].vacpage = to_vacpage; |
| if (to_vacpage->offsets_used == 1) |
| vtmove[num_vtmove].cleanVpd = true; |
| else |
| vtmove[num_vtmove].cleanVpd = false; |
| free_vtmove--; |
| num_vtmove++; |
| |
| /* Remember if we reached the original target tuple */ |
| if (ItemPointerGetBlockNumber(&tp.t_self) == blkno && |
| ItemPointerGetOffsetNumber(&tp.t_self) == offnum) |
| moved_target = true; |
| |
| /* Done if at beginning of chain */ |
| if (!(tp.t_data->t_infomask & HEAP_UPDATED) || |
| TransactionIdPrecedes(HeapTupleHeaderGetXmin(tp.t_data), |
| OldestXmin)) |
| break; /* out of check-all-items loop */ |
| |
| /* Move to tuple with prior row version */ |
| vtld.new_tid = tp.t_self; |
| vtlp = (VTupleLink) |
| vac_bsearch((void *) &vtld, |
| (void *) (vacrelstats->vtlinks), |
| vacrelstats->num_vtlinks, |
| sizeof(VTupleLinkData), |
| vac_cmp_vtlinks); |
| if (vtlp == NULL) |
| { |
| /* see discussion above */ |
| elog(DEBUG2, "parent item in update-chain not found --- can't continue repair_frag"); |
| chain_move_failed = true; |
| break; /* out of check-all-items loop */ |
| } |
| tp.t_self = vtlp->this_tid; |
| Pbuf = ReadBuffer(onerel, |
| ItemPointerGetBlockNumber(&(tp.t_self))); |
| Ppage = BufferGetPage(Pbuf); |
| Pitemid = PageGetItemId(Ppage, |
| ItemPointerGetOffsetNumber(&(tp.t_self))); |
| /* this can't happen since we saw tuple earlier: */ |
| if (!ItemIdIsUsed(Pitemid)) |
| elog(ERROR, "parent itemid marked as unused"); |
| PTdata = (HeapTupleHeader) PageGetItem(Ppage, Pitemid); |
| |
| /* ctid should not have changed since we saved it */ |
| Assert(ItemPointerEquals(&(vtld.new_tid), |
| &(PTdata->t_ctid))); |
| |
| /* |
| * Read above about the cases where !ItemIdIsUsed(nextItemid) |
| * (the child item has been removed)... Because at the moment |
| * we don't remove the useless part of an update chain, it's |
| * possible to get a non-matching parent row here. As in the |
| * case that caused this problem, we stop shrinking here. I |
| * could try to find the real parent row, but I don't want to, |
| * because the real solution will be implemented anyway, later, |
| * and we are too close to the 6.5 release. - vadim 06/11/99 |
| */ |
| if ((PTdata->t_infomask & HEAP_XMAX_IS_MULTI) || |
| !(TransactionIdEquals(HeapTupleHeaderGetXmax(PTdata), |
| HeapTupleHeaderGetXmin(tp.t_data)))) |
| { |
| ReleaseBuffer(Pbuf); |
| elog(DEBUG2, "too old parent tuple found --- can't continue repair_frag"); |
| chain_move_failed = true; |
| break; /* out of check-all-items loop */ |
| } |
| tp.t_data = PTdata; |
| tlen = tp.t_len = ItemIdGetLength(Pitemid); |
| if (freeCbuf) |
| ReleaseBuffer(Cbuf); |
| Cbuf = Pbuf; |
| freeCbuf = true; |
| } /* end of check-all-items loop */ |
| |
| if (freeCbuf) |
| ReleaseBuffer(Cbuf); |
| freeCbuf = false; |
| |
| /* Double-check that we will move the current target tuple */ |
| if (!moved_target && !chain_move_failed) |
| { |
| elog(DEBUG2, "failed to chain back to target --- cannot continue repair_frag"); |
| chain_move_failed = true; |
| } |
| |
| if (chain_move_failed) |
| { |
| /* |
| * Undo changes to offsets_used state. We don't bother |
| * cleaning up the amount-free state, since we're not |
| * going to do any further tuple motion. |
| */ |
| for (i = 0; i < num_vtmove; i++) |
| { |
| Assert(vtmove[i].vacpage->offsets_used > 0); |
| (vtmove[i].vacpage->offsets_used)--; |
| } |
| pfree(vtmove); |
| break; /* out of walk-along-page loop */ |
| } |
| |
| /* |
| * Okay, move the whole tuple chain in reverse order. |
| * |
| * Ctid tracks the new location of the previously-moved tuple. |
| */ |
| ItemPointerSetInvalid(&Ctid); |
| for (ti = 0; ti < num_vtmove; ti++) |
| { |
| VacPage destvacpage = vtmove[ti].vacpage; |
| Page Cpage; |
| ItemId Citemid; |
| |
| /* Get page to move from */ |
| tuple.t_self = vtmove[ti].tid; |
| Cbuf = ReadBuffer(onerel, |
| ItemPointerGetBlockNumber(&(tuple.t_self))); |
| |
| /* Get page to move to */ |
| dst_buffer = ReadBuffer(onerel, destvacpage->blkno); |
| |
| LockBuffer(dst_buffer, BUFFER_LOCK_EXCLUSIVE); |
| if (dst_buffer != Cbuf) |
| LockBuffer(Cbuf, BUFFER_LOCK_EXCLUSIVE); |
| |
| dst_page = BufferGetPage(dst_buffer); |
| Cpage = BufferGetPage(Cbuf); |
| |
| Citemid = PageGetItemId(Cpage, |
| ItemPointerGetOffsetNumber(&(tuple.t_self))); |
| tuple.t_data = (HeapTupleHeader) PageGetItem(Cpage, Citemid); |
| tuple_len = tuple.t_len = ItemIdGetLength(Citemid); |
| |
| move_chain_tuple(onerel, Cbuf, Cpage, &tuple, |
| dst_buffer, dst_page, destvacpage, |
| &ec, &Ctid, vtmove[ti].cleanVpd); |
| |
| num_moved++; |
| if (destvacpage->blkno > last_move_dest_block) |
| last_move_dest_block = destvacpage->blkno; |
| |
| /* |
| * Remember that we moved a tuple from the current page |
| * (the corresponding index tuple will be cleaned). |
| */ |
| if (Cbuf == buf) |
| vacpage->offsets[vacpage->offsets_free++] = |
| ItemPointerGetOffsetNumber(&(tuple.t_self)); |
| else |
| keep_tuples++; |
| |
| ReleaseBuffer(dst_buffer); |
| ReleaseBuffer(Cbuf); |
| } /* end of move-the-tuple-chain loop */ |
| |
| dst_buffer = InvalidBuffer; |
| pfree(vtmove); |
| chain_tuple_moved = true; |
| |
| /* advance to next tuple in walk-along-page loop */ |
| continue; |
| } /* end of is-tuple-in-chain test */ |
| |
| /* try to find new page for this tuple */ |
| if (dst_buffer == InvalidBuffer || |
| !enough_space(dst_vacpage, tuple_len)) |
| { |
| if (dst_buffer != InvalidBuffer) |
| { |
| ReleaseBuffer(dst_buffer); |
| dst_buffer = InvalidBuffer; |
| } |
| for (i = 0; i < num_fraged_pages; i++) |
| { |
| if (enough_space(fraged_pages->pagedesc[i], tuple_len)) |
| break; |
| } |
| if (i == num_fraged_pages) |
| break; /* can't move item anywhere */ |
| dst_vacpage = fraged_pages->pagedesc[i]; |
| dst_buffer = ReadBuffer(onerel, dst_vacpage->blkno); |
| LockBuffer(dst_buffer, BUFFER_LOCK_EXCLUSIVE); |
| dst_page = BufferGetPage(dst_buffer); |
| /* if this page was not used before - clean it */ |
| if (!PageIsEmpty(dst_page) && dst_vacpage->offsets_used == 0) |
| vacuum_page(onerel, dst_buffer, dst_vacpage); |
| } |
| else |
| LockBuffer(dst_buffer, BUFFER_LOCK_EXCLUSIVE); |
| |
| LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); |
| |
| move_plain_tuple(onerel, buf, page, &tuple, |
| dst_buffer, dst_page, dst_vacpage, &ec); |
| |
| num_moved++; |
| if (dst_vacpage->blkno > last_move_dest_block) |
| last_move_dest_block = dst_vacpage->blkno; |
| |
| /* |
| * Remember that we moved a tuple from the current page |
| * (the corresponding index tuple will be cleaned). |
| */ |
| vacpage->offsets[vacpage->offsets_free++] = offnum; |
| } /* walk along page */ |
| |
| /* |
| * If we broke out of the walk-along-page loop early (ie, still have |
| * offnum <= maxoff), then we failed to move some tuple off this page. |
| * No point in shrinking any more, so clean up and exit the per-page |
| * loop. |
| */ |
| if (offnum < maxoff && keep_tuples > 0) |
| { |
| OffsetNumber off; |
| |
| /* |
| * Fix vacpage state for any unvisited tuples remaining on page |
| */ |
| for (off = OffsetNumberNext(offnum); |
| off <= maxoff; |
| off = OffsetNumberNext(off)) |
| { |
| ItemId itemid = PageGetItemId(page, off); |
| HeapTupleHeader htup; |
| |
| if (!ItemIdIsUsed(itemid)) |
| continue; |
| htup = (HeapTupleHeader) PageGetItem(page, itemid); |
| if (htup->t_infomask & HEAP_XMIN_COMMITTED) |
| continue; |
| |
| /* |
| * See comments in the walk-along-page loop above about why |
| * only MOVED_OFF tuples should be found here. |
| */ |
| if (htup->t_infomask & HEAP_MOVED_IN) |
| elog(ERROR, "HEAP_MOVED_IN was not expected"); |
| if (!(htup->t_infomask & HEAP_MOVED_OFF)) |
| elog(ERROR, "HEAP_MOVED_OFF was expected"); |
| if (HeapTupleHeaderGetXvac(htup) != myXID) |
| elog(ERROR, "invalid XVAC in tuple header"); |
| |
| if (chain_tuple_moved) |
| { |
| /* some chains were moved while cleaning this page */ |
| Assert(vacpage->offsets_free > 0); |
| for (i = 0; i < vacpage->offsets_free; i++) |
| { |
| if (vacpage->offsets[i] == off) |
| break; |
| } |
| if (i >= vacpage->offsets_free) /* not found */ |
| { |
| vacpage->offsets[vacpage->offsets_free++] = off; |
| Assert(keep_tuples > 0); |
| keep_tuples--; |
| } |
| } |
| else |
| { |
| vacpage->offsets[vacpage->offsets_free++] = off; |
| Assert(keep_tuples > 0); |
| keep_tuples--; |
| } |
| } |
| } |
| |
| if (vacpage->offsets_free > 0) /* some tuples were moved */ |
| { |
| if (chain_tuple_moved) /* else they are already in order */ |
| { |
| qsort((char *) (vacpage->offsets), vacpage->offsets_free, |
| sizeof(OffsetNumber), vac_cmp_offno); |
| } |
| vpage_insert(&Nvacpagelist, copy_vac_page(vacpage)); |
| } |
| |
| MIRROREDLOCK_BUFMGR_UNLOCK; |
| // -------- MirroredLock ---------- |
| |
| ReleaseBuffer(buf); |
| |
| if (offnum <= maxoff) |
| break; /* had to quit early, see above note */ |
| |
| } /* walk along relation */ |
| |
| blkno++; /* new number of blocks */ |
| |
| if (dst_buffer != InvalidBuffer) |
| { |
| Assert(num_moved > 0); |
| ReleaseBuffer(dst_buffer); |
| } |
| |
| if (num_moved > 0) |
| { |
| /* |
| * We have to commit our tuple moves before we truncate the |
| * relation. Ideally we should do Commit/StartTransactionCommand |
| * here, relying on the session-level table lock to protect our |
| * exclusive access to the relation. However, that would require a |
| * lot of extra code to close and re-open the relation, indexes, etc. |
| * For now, a quick hack: record status of current transaction as |
| * committed, and continue. |
| */ |
| RecordTransactionCommit(); |
| } |
| |
| /* |
| * We are not going to move any more tuples across pages, but we still |
| * need to apply vacuum_page to compact free space in the remaining pages |
| * in vacuum_pages list. Note that some of these pages may also be in the |
| * fraged_pages list, and may have had tuples moved onto them; if so, we |
| * already did vacuum_page and needn't do it again. |
| */ |
| for (i = 0, curpage = vacuum_pages->pagedesc; |
| i < vacuumed_pages; |
| i++, curpage++) |
| { |
| vacuum_delay_point(); |
| |
| Assert((*curpage)->blkno < blkno); |
| if ((*curpage)->offsets_used == 0) |
| { |
| Buffer buf; |
| Page page; |
| |
| /* this page was not used as a move target, so must clean it */ |
| |
| // -------- MirroredLock ---------- |
| MIRROREDLOCK_BUFMGR_LOCK; |
| |
| buf = ReadBuffer(onerel, (*curpage)->blkno); |
| LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); |
| page = BufferGetPage(buf); |
| if (!PageIsEmpty(page)) |
| vacuum_page(onerel, buf, *curpage); |
| UnlockReleaseBuffer(buf); |
| |
| MIRROREDLOCK_BUFMGR_UNLOCK; |
| // -------- MirroredLock ---------- |
| |
| } |
| } |
| |
| /* |
| * Now scan all the pages that we moved tuples onto and update tuple |
| * status bits. This is not really necessary, but will save time for |
| * future transactions examining these tuples. |
| */ |
| update_hint_bits(onerel, fraged_pages, num_fraged_pages, |
| last_move_dest_block, num_moved); |
| |
| /* |
| * It'd be cleaner to make this report at the bottom of this routine, but |
| * then the rusage would double-count the second pass of index vacuuming. |
| * So do it here and ignore the relatively small amount of processing that |
| * occurs below. |
| */ |
| ereport(elevel, |
| (errmsg("\"%s\": moved %u row versions, truncated %u to %u pages", |
| RelationGetRelationName(onerel), |
| num_moved, nblocks, blkno), |
| errdetail("%s.", |
| pg_rusage_show(&ru0)))); |
| |
| /* |
| * Reflect the motion of system tuples to catalog cache here. |
| */ |
| CommandCounterIncrement(); |
| |
| if (Nvacpagelist.num_pages > 0) |
| { |
| /* vacuum indexes again if needed */ |
| if (Irel != NULL) |
| { |
| VacPage *vpleft, |
| *vpright, |
| vpsave; |
| |
| /* re-sort Nvacpagelist.pagedesc */ |
| for (vpleft = Nvacpagelist.pagedesc, |
| vpright = Nvacpagelist.pagedesc + Nvacpagelist.num_pages - 1; |
| vpleft < vpright; vpleft++, vpright--) |
| { |
| vpsave = *vpleft; |
| *vpleft = *vpright; |
| *vpright = vpsave; |
| } |
| |
| /* |
| * keep_tuples is the number of tuples that have been moved off a |
| * page during chain moves but not been scanned over subsequently. |
| * The tuple ids of these tuples are not recorded as free offsets |
| * for any VacPage, so they will not be cleared from the indexes. |
| */ |
| Assert(keep_tuples >= 0); |
| for (i = 0; i < nindexes; i++) |
| { |
| List *extra_oids = get_oids_for_bitmap(all_extra_oids, Irel[i], |
| onerel, reindex_count); |
| |
| vacuum_index(&Nvacpagelist, Irel[i], |
| vacrelstats->rel_tuples, keep_tuples, updated_stats, extra_oids); |
| list_free(extra_oids); |
| } |
| reindex_count++; |
| } |
| |
| /* |
| * Clean moved-off tuples from last page in Nvacpagelist list. |
| * |
| * We need only do this in this one page, because higher-numbered |
| * pages are going to be truncated from the relation entirely. But see |
| * comments for update_hint_bits(). |
| */ |
| if (vacpage->blkno == (blkno - 1) && |
| vacpage->offsets_free > 0) |
| { |
| Buffer buf; |
| Page page; |
| OffsetNumber unused[MaxOffsetNumber]; |
| OffsetNumber offnum, |
| maxoff; |
| int uncnt; |
| int num_tuples = 0; |
| |
| // -------- MirroredLock ---------- |
| MIRROREDLOCK_BUFMGR_LOCK; |
| |
| buf = ReadBuffer(onerel, vacpage->blkno); |
| LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); |
| page = BufferGetPage(buf); |
| maxoff = PageGetMaxOffsetNumber(page); |
| for (offnum = FirstOffsetNumber; |
| offnum <= maxoff; |
| offnum = OffsetNumberNext(offnum)) |
| { |
| ItemId itemid = PageGetItemId(page, offnum); |
| HeapTupleHeader htup; |
| |
| if (!ItemIdIsUsed(itemid)) |
| continue; |
| htup = (HeapTupleHeader) PageGetItem(page, itemid); |
| if (htup->t_infomask & HEAP_XMIN_COMMITTED) |
| continue; |
| |
| /* |
| * See comments in the walk-along-page loop above about why |
| * only MOVED_OFF tuples should be found here. |
| */ |
| if (htup->t_infomask & HEAP_MOVED_IN) |
| elog(ERROR, "HEAP_MOVED_IN was not expected"); |
| if (!(htup->t_infomask & HEAP_MOVED_OFF)) |
| elog(ERROR, "HEAP_MOVED_OFF was expected"); |
| if (HeapTupleHeaderGetXvac(htup) != myXID) |
| elog(ERROR, "invalid XVAC in tuple header"); |
| |
| itemid->lp_flags &= ~LP_USED; |
| num_tuples++; |
| } |
| Assert(vacpage->offsets_free == num_tuples); |
| |
| START_CRIT_SECTION(); |
| |
| uncnt = PageRepairFragmentation(page, unused); |
| |
| MarkBufferDirty(buf); |
| |
| /* XLOG stuff */ |
| if (!onerel->rd_istemp) |
| { |
| XLogRecPtr recptr; |
| |
| recptr = log_heap_clean(onerel, buf, unused, uncnt); |
| PageSetLSN(page, recptr); |
| PageSetTLI(page, ThisTimeLineID); |
| } |
| else |
| { |
| /* |
| * No XLOG record, but still need to flag that XID exists on |
| * disk |
| */ |
| MyXactMadeTempRelUpdate = true; |
| } |
| |
| END_CRIT_SECTION(); |
| |
| UnlockReleaseBuffer(buf); |
| |
| MIRROREDLOCK_BUFMGR_UNLOCK; |
| // -------- MirroredLock ---------- |
| |
| } |
| |
| /* now - free new list of reaped pages */ |
| curpage = Nvacpagelist.pagedesc; |
| for (i = 0; i < Nvacpagelist.num_pages; i++, curpage++) |
| pfree(*curpage); |
| pfree(Nvacpagelist.pagedesc); |
| } |
| |
| /* Truncate relation, if needed */ |
| if (blkno < nblocks) |
| { |
| RelationTruncate( |
| onerel, |
| blkno, |
| /* markPersistentAsPhysicallyTruncated */ true); |
| |
| vacrelstats->rel_pages = blkno; /* set new number of blocks */ |
| } |
| |
| /* clean up */ |
| pfree(vacpage); |
| if (vacrelstats->vtlinks != NULL) |
| pfree(vacrelstats->vtlinks); |
| |
| ExecContext_Finish(&ec); |
| } |
| |
| /* |
| * move_chain_tuple() -- move one tuple that is part of a tuple chain |
| * |
| * This routine moves old_tup from old_page to dst_page. |
| * old_page and dst_page might be the same page. |
| * On entry old_buf and dst_buf are locked exclusively; both locks (or |
| * the single lock, if this is an intra-page move) are released before |
| * exit. |
| * |
| * Yes, a routine with ten parameters is ugly, but it's still better |
| * than having these 120 lines of code in repair_frag() which is |
| * already too long and almost unreadable. |
| */ |
| static void |
| move_chain_tuple(Relation rel, |
| Buffer old_buf, Page old_page, HeapTuple old_tup, |
| Buffer dst_buf, Page dst_page, VacPage dst_vacpage, |
| ExecContext ec, ItemPointer ctid, bool cleanVpd) |
| { |
| TransactionId myXID = GetCurrentTransactionId(); |
| HeapTupleData newtup; |
| OffsetNumber newoff; |
| ItemId newitemid; |
| Size tuple_len = old_tup->t_len; |
| |
| MIRROREDLOCK_BUFMGR_MUST_ALREADY_BE_HELD; |
| |
| // Fetch gp_persistent_relation_node information that will be added to XLOG record. |
| RelationFetchGpRelationNodeForXLog(rel); |
| |
| /* |
| * make a modifiable copy of the source tuple. |
| */ |
| heap_copytuple_with_tuple(old_tup, &newtup); |
| |
| /* |
| * register invalidation of source tuple in catcaches. |
| */ |
| CacheInvalidateHeapTuple(rel, old_tup, SysCacheInvalidate_VacuumMove); |
| |
| /* NO EREPORT(ERROR) TILL CHANGES ARE LOGGED */ |
| START_CRIT_SECTION(); |
| |
| /* |
| * mark the source tuple MOVED_OFF. |
| */ |
| old_tup->t_data->t_infomask &= ~(HEAP_XMIN_COMMITTED | |
| HEAP_XMIN_INVALID | |
| HEAP_MOVED_IN); |
| old_tup->t_data->t_infomask |= HEAP_MOVED_OFF; |
| HeapTupleHeaderSetXvac(old_tup->t_data, myXID); |
| |
| /* |
| * If this page was not used before - clean it. |
| * |
| * NOTE: a nasty bug used to lurk here. It is possible for the source and |
| * destination pages to be the same (since this tuple-chain member can be |
| * on a page lower than the one we're currently processing in the outer |
| * loop). If that's true, then after vacuum_page() the source tuple will |
| * have been moved, and tuple.t_data will be pointing at garbage. |
| * Therefore we must do everything that uses old_tup->t_data BEFORE this |
| * step!! |
| * |
| * This path is different from the other callers of vacuum_page, because |
| * we have already incremented the vacpage's offsets_used field to account |
| * for the tuple(s) we expect to move onto the page. Therefore |
| * vacuum_page's check for offsets_used == 0 is wrong. But since that's a |
| * good debugging check for all other callers, we work around it here |
| * rather than remove it. |
| */ |
| if (!PageIsEmpty(dst_page) && cleanVpd) |
| { |
| int sv_offsets_used = dst_vacpage->offsets_used; |
| |
| dst_vacpage->offsets_used = 0; |
| vacuum_page(rel, dst_buf, dst_vacpage); |
| dst_vacpage->offsets_used = sv_offsets_used; |
| } |
| |
| /* |
| * Update the state of the copied tuple, and store it on the destination |
| * page. |
| */ |
| newtup.t_data->t_infomask &= ~(HEAP_XMIN_COMMITTED | |
| HEAP_XMIN_INVALID | |
| HEAP_MOVED_OFF); |
| newtup.t_data->t_infomask |= HEAP_MOVED_IN; |
| HeapTupleHeaderSetXvac(newtup.t_data, myXID); |
| newoff = PageAddItem(dst_page, (Item) newtup.t_data, tuple_len, |
| InvalidOffsetNumber, LP_USED); |
| if (newoff == InvalidOffsetNumber) |
| elog(PANIC, "failed to add item with len = %lu to page %u while moving tuple chain", |
| (unsigned long) tuple_len, dst_vacpage->blkno); |
| newitemid = PageGetItemId(dst_page, newoff); |
| /* drop temporary copy, and point to the version on the dest page */ |
| pfree(newtup.t_data); |
| newtup.t_data = (HeapTupleHeader) PageGetItem(dst_page, newitemid); |
| |
| ItemPointerSet(&(newtup.t_self), dst_vacpage->blkno, newoff); |
| |
| /* |
| * Set new tuple's t_ctid pointing to itself if last tuple in chain, and |
| * to next tuple in chain otherwise. (Since we move the chain in reverse |
| * order, this is actually the previously processed tuple.) |
| */ |
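| /* |
| * Illustrative sketch (hypothetical chain): for a chain A -> B -> C |
| * moved in the order C, B, A, ctid is invalid when C is placed, so |
| * C's new copy points at itself; when B is placed, *ctid holds C's |
| * new location, so B links to C; finally A links to B, reproducing |
| * the original chain. |
| */ |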
| if (!ItemPointerIsValid(ctid)) |
| newtup.t_data->t_ctid = newtup.t_self; |
| else |
| newtup.t_data->t_ctid = *ctid; |
| *ctid = newtup.t_self; |
| |
| MarkBufferDirty(dst_buf); |
| if (dst_buf != old_buf) |
| MarkBufferDirty(old_buf); |
| |
| /* XLOG stuff */ |
| if (!rel->rd_istemp) |
| { |
| XLogRecPtr recptr = log_heap_move(rel, old_buf, old_tup->t_self, |
| dst_buf, &newtup); |
| |
| if (old_buf != dst_buf) |
| { |
| PageSetLSN(old_page, recptr); |
| PageSetTLI(old_page, ThisTimeLineID); |
| } |
| PageSetLSN(dst_page, recptr); |
| PageSetTLI(dst_page, ThisTimeLineID); |
| } |
| else |
| { |
| /* |
| * No XLOG record, but still need to flag that XID exists on disk |
| */ |
| MyXactMadeTempRelUpdate = true; |
| } |
| |
| END_CRIT_SECTION(); |
| |
| LockBuffer(dst_buf, BUFFER_LOCK_UNLOCK); |
| if (dst_buf != old_buf) |
| LockBuffer(old_buf, BUFFER_LOCK_UNLOCK); |
| |
| /* Create index entries for the moved tuple */ |
| if (ec->resultRelInfo->ri_NumIndices > 0) |
| { |
| ExecStoreGenericTuple(&newtup, ec->slot, false); |
| ExecInsertIndexTuples(ec->slot, &(newtup.t_self), ec->estate, true); |
| ResetPerTupleExprContext(ec->estate); |
| } |
| } |
| |
| /* |
| * move_plain_tuple() -- move one tuple that is not part of a chain |
| * |
| * This routine moves old_tup from old_page to dst_page. |
| * On entry old_buf and dst_buf are locked exclusively, both locks are |
| * released before exit. |
| * |
| * Yes, a routine with eight parameters is ugly, but it's still better |
| * than having these 90 lines of code in repair_frag() which is already |
| * too long and almost unreadable. |
| */ |
| static void |
| move_plain_tuple(Relation rel, |
| Buffer old_buf, Page old_page, HeapTuple old_tup, |
| Buffer dst_buf, Page dst_page, VacPage dst_vacpage, |
| ExecContext ec) |
| { |
| TransactionId myXID = GetCurrentTransactionId(); |
| HeapTupleData newtup; |
| OffsetNumber newoff; |
| ItemId newitemid; |
| Size tuple_len = old_tup->t_len; |
| |
| MIRROREDLOCK_BUFMGR_MUST_ALREADY_BE_HELD; |
| |
| // Fetch gp_persistent_relation_node information that will be added to XLOG record. |
| RelationFetchGpRelationNodeForXLog(rel); |
| |
| /* copy tuple */ |
| heap_copytuple_with_tuple(old_tup, &newtup); |
| |
| /* |
| * register invalidation of source tuple in catcaches. |
| * |
| * (Note: we do not need to register the copied tuple, because we are not |
| * changing the tuple contents and so there cannot be any need to flush |
| * negative catcache entries.) |
| */ |
| CacheInvalidateHeapTuple(rel, old_tup, SysCacheInvalidate_VacuumMove); |
| |
| /* NO EREPORT(ERROR) TILL CHANGES ARE LOGGED */ |
| START_CRIT_SECTION(); |
| |
| /* |
| * Mark new tuple as MOVED_IN by me. |
| */ |
| newtup.t_data->t_infomask &= ~(HEAP_XMIN_COMMITTED | |
| HEAP_XMIN_INVALID | |
| HEAP_MOVED_OFF); |
| newtup.t_data->t_infomask |= HEAP_MOVED_IN; |
| HeapTupleHeaderSetXvac(newtup.t_data, myXID); |
| |
| /* add tuple to the page */ |
| newoff = PageAddItem(dst_page, (Item) newtup.t_data, tuple_len, |
| InvalidOffsetNumber, LP_USED); |
| if (newoff == InvalidOffsetNumber) |
| elog(PANIC, "failed to add item with len = %lu to page %u (free space %lu, nusd %u, noff %u)", |
| (unsigned long) tuple_len, |
| dst_vacpage->blkno, (unsigned long) dst_vacpage->free, |
| dst_vacpage->offsets_used, dst_vacpage->offsets_free); |
| newitemid = PageGetItemId(dst_page, newoff); |
| pfree(newtup.t_data); |
| newtup.t_data = (HeapTupleHeader) PageGetItem(dst_page, newitemid); |
| ItemPointerSet(&(newtup.t_data->t_ctid), dst_vacpage->blkno, newoff); |
| newtup.t_self = newtup.t_data->t_ctid; |
| |
| /* |
| * Mark old tuple as MOVED_OFF by me. |
| */ |
| old_tup->t_data->t_infomask &= ~(HEAP_XMIN_COMMITTED | |
| HEAP_XMIN_INVALID | |
| HEAP_MOVED_IN); |
| old_tup->t_data->t_infomask |= HEAP_MOVED_OFF; |
| HeapTupleHeaderSetXvac(old_tup->t_data, myXID); |
| |
| MarkBufferDirty(dst_buf); |
| MarkBufferDirty(old_buf); |
| |
| /* XLOG stuff */ |
| if (!rel->rd_istemp) |
| { |
| XLogRecPtr recptr = log_heap_move(rel, old_buf, old_tup->t_self, |
| dst_buf, &newtup); |
| |
| PageSetLSN(old_page, recptr); |
| PageSetTLI(old_page, ThisTimeLineID); |
| PageSetLSN(dst_page, recptr); |
| PageSetTLI(dst_page, ThisTimeLineID); |
| } |
| else |
| { |
| /* |
| * No XLOG record, but still need to flag that XID exists on disk |
| */ |
| MyXactMadeTempRelUpdate = true; |
| } |
| |
| END_CRIT_SECTION(); |
| |
| dst_vacpage->free = PageGetFreeSpaceWithFillFactor(rel, dst_page); |
| LockBuffer(dst_buf, BUFFER_LOCK_UNLOCK); |
| LockBuffer(old_buf, BUFFER_LOCK_UNLOCK); |
| |
| dst_vacpage->offsets_used++; |
| |
| /* insert index tuples if needed */ |
| if (ec->resultRelInfo->ri_NumIndices > 0) |
| { |
| ExecStoreGenericTuple(&newtup, ec->slot, false); |
| ExecInsertIndexTuples(ec->slot, &(newtup.t_self), ec->estate, true); |
| ResetPerTupleExprContext(ec->estate); |
| } |
| } |
| |
| /* |
| * update_hint_bits() -- update hint bits in destination pages |
| * |
| * Scan all the pages that we moved tuples onto and update tuple status bits. |
| * This is not really necessary, but it will save time for future transactions |
| * examining these tuples. |
| * |
| * This pass guarantees that all HEAP_MOVED_IN tuples are marked as |
| * XMIN_COMMITTED, so that future tqual tests won't need to check their XVAC. |
| * |
| * BUT NOTICE that this code fails to clear HEAP_MOVED_OFF tuples from |
| * pages that were move source pages but not move dest pages. The bulk |
| * of the move source pages will be physically truncated from the relation, |
| * and the last page remaining in the rel will be fixed separately in |
| * repair_frag(), so the only tuples that won't get their MOVED_OFF |
| * hint bits updated are those that were moved as part of a chain and |
| * were on pages that were neither move destinations nor at the end of |
| * the rel. |
| * To completely ensure that no MOVED_OFF tuples remain unmarked, we'd have |
| * to remember and revisit those pages too. |
| * |
| * One wonders whether it wouldn't be better to skip this work entirely, |
| * and let the tuple status updates happen someplace that's not holding an |
| * exclusive lock on the relation. |
| */ |
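| /* |
| * Sketch of the transitions applied below: a HEAP_MOVED_IN tuple gets |
| * HEAP_XMIN_COMMITTED set and its HEAP_MOVED bits cleared; a |
| * HEAP_MOVED_OFF tuple gets HEAP_XMIN_INVALID set, marking it dead to |
| * future tqual tests. |
| */ |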
| static void |
| update_hint_bits(Relation rel, VacPageList fraged_pages, int num_fraged_pages, |
| BlockNumber last_move_dest_block, int num_moved) |
| { |
| MIRROREDLOCK_BUFMGR_DECLARE; |
| |
| TransactionId myXID = GetCurrentTransactionId(); |
| int checked_moved = 0; |
| int i; |
| VacPage *curpage; |
| |
| for (i = 0, curpage = fraged_pages->pagedesc; |
| i < num_fraged_pages; |
| i++, curpage++) |
| { |
| Buffer buf; |
| Page page; |
| OffsetNumber max_offset; |
| OffsetNumber off; |
| int num_tuples = 0; |
| |
| vacuum_delay_point(); |
| |
| if ((*curpage)->blkno > last_move_dest_block) |
| break; /* no need to scan any further */ |
| if ((*curpage)->offsets_used == 0) |
| continue; /* this page was never used as a move dest */ |
| |
| // -------- MirroredLock ---------- |
| MIRROREDLOCK_BUFMGR_LOCK; |
| |
| buf = ReadBuffer(rel, (*curpage)->blkno); |
| LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); |
| page = BufferGetPage(buf); |
| max_offset = PageGetMaxOffsetNumber(page); |
| for (off = FirstOffsetNumber; |
| off <= max_offset; |
| off = OffsetNumberNext(off)) |
| { |
| ItemId itemid = PageGetItemId(page, off); |
| HeapTupleHeader htup; |
| |
| if (!ItemIdIsUsed(itemid)) |
| continue; |
| htup = (HeapTupleHeader) PageGetItem(page, itemid); |
| if (htup->t_infomask & HEAP_XMIN_COMMITTED) |
| continue; |
| |
| /* |
| * Here we may see either MOVED_OFF or MOVED_IN tuples. |
| */ |
| if (!(htup->t_infomask & HEAP_MOVED)) |
| elog(ERROR, "HEAP_MOVED_OFF/HEAP_MOVED_IN was expected"); |
| if (HeapTupleHeaderGetXvac(htup) != myXID) |
| elog(ERROR, "invalid XVAC in tuple header"); |
| |
| if (htup->t_infomask & HEAP_MOVED_IN) |
| { |
| htup->t_infomask |= HEAP_XMIN_COMMITTED; |
| htup->t_infomask &= ~HEAP_MOVED; |
| num_tuples++; |
| } |
| else |
| htup->t_infomask |= HEAP_XMIN_INVALID; |
| } |
| MarkBufferDirty(buf); |
| UnlockReleaseBuffer(buf); |
| |
| MIRROREDLOCK_BUFMGR_UNLOCK; |
| // -------- MirroredLock ---------- |
| |
| Assert((*curpage)->offsets_used == num_tuples); |
| checked_moved += num_tuples; |
| } |
| Assert(num_moved == checked_moved); |
| } |
| |
| /* |
| * vacuum_heap() -- free dead tuples |
| * |
| * This routine marks dead tuples as unused and truncates the relation |
| * if there are "empty" end-blocks. |
| */ |
| static void |
| vacuum_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages) |
| { |
| MIRROREDLOCK_BUFMGR_DECLARE; |
| |
| Buffer buf; |
| VacPage *vacpage; |
| BlockNumber relblocks; |
| int nblocks; |
| int i; |
| |
| // Fetch gp_persistent_relation_node information that will be added to XLOG record. |
| RelationFetchGpRelationNodeForXLog(onerel); |
| |
| nblocks = vacuum_pages->num_pages; |
| nblocks -= vacuum_pages->empty_end_pages; /* nothing to do with them */ |
| |
| for (i = 0, vacpage = vacuum_pages->pagedesc; i < nblocks; i++, vacpage++) |
| { |
| vacuum_delay_point(); |
| |
| if ((*vacpage)->offsets_free > 0) |
| { |
| |
| // -------- MirroredLock ---------- |
| MIRROREDLOCK_BUFMGR_LOCK; |
| |
| buf = ReadBuffer(onerel, (*vacpage)->blkno); |
| LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); |
| vacuum_page(onerel, buf, *vacpage); |
| UnlockReleaseBuffer(buf); |
| |
| MIRROREDLOCK_BUFMGR_UNLOCK; |
| // -------- MirroredLock ---------- |
| |
| } |
| } |
| |
| /* Truncate relation if there are some empty end-pages */ |
| Assert(vacrelstats->rel_pages >= vacuum_pages->empty_end_pages); |
| if (vacuum_pages->empty_end_pages > 0) |
| { |
| relblocks = vacrelstats->rel_pages - vacuum_pages->empty_end_pages; |
| ereport(elevel, |
| (errmsg("\"%s\": truncated %u to %u pages", |
| RelationGetRelationName(onerel), |
| vacrelstats->rel_pages, relblocks))); |
| |
| RelationTruncate( |
| onerel, |
| relblocks, |
| /* markPersistentAsPhysicallyTruncated */ true); |
| |
| vacrelstats->rel_pages = relblocks; /* set new number of blocks */ |
| } |
| } |
| |
| /* |
| * vacuum_page() -- free dead tuples on a page |
| * and repair its fragmentation. |
| * |
| * Caller must hold pin and lock on buffer. |
| */ |
| static void |
| vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage) |
| { |
| OffsetNumber unused[MaxOffsetNumber]; |
| int uncnt; |
| Page page = BufferGetPage(buffer); |
| ItemId itemid; |
| int i; |
| |
| MIRROREDLOCK_BUFMGR_MUST_ALREADY_BE_HELD; |
| |
| /* There shouldn't be any tuples moved onto the page yet! */ |
| Assert(vacpage->offsets_used == 0); |
| |
| START_CRIT_SECTION(); |
| |
| for (i = 0; i < vacpage->offsets_free; i++) |
| { |
| itemid = PageGetItemId(page, vacpage->offsets[i]); |
| itemid->lp_flags &= ~LP_USED; |
| } |
| |
| uncnt = PageRepairFragmentation(page, unused); |
| |
| MarkBufferDirty(buffer); |
| |
| /* XLOG stuff */ |
| if (!onerel->rd_istemp) |
| { |
| XLogRecPtr recptr; |
| |
| recptr = log_heap_clean(onerel, buffer, unused, uncnt); |
| PageSetLSN(page, recptr); |
| PageSetTLI(page, ThisTimeLineID); |
| } |
| else |
| { |
| /* No XLOG record, but still need to flag that XID exists on disk */ |
| MyXactMadeTempRelUpdate = true; |
| } |
| |
| END_CRIT_SECTION(); |
| } |
| |
| /* |
| * scan_index() -- scan one index relation to update pg_class statistics. |
| * |
| * We use this when we have no deletions to do. |
| */ |
| static void |
| scan_index(Relation indrel, double num_tuples, List *updated_stats) |
| { |
| IndexBulkDeleteResult *stats; |
| IndexVacuumInfo ivinfo; |
| PGRUsage ru0; |
| |
| pg_rusage_init(&ru0); |
| |
| ivinfo.index = indrel; |
| ivinfo.vacuum_full = true; |
| ivinfo.message_level = elevel; |
| ivinfo.num_heap_tuples = num_tuples; |
| ivinfo.extra_oids = NIL; |
| |
| stats = index_vacuum_cleanup(&ivinfo, NULL); |
| |
| if (!stats) |
| return; |
| |
| /* now update statistics in pg_class */ |
| vac_update_relstats(indrel, |
| stats->num_pages, stats->num_index_tuples, |
| false, InvalidTransactionId, updated_stats); |
| |
| ereport(elevel, |
| (errmsg("index \"%s\" now contains %.0f row versions in %u pages", |
| RelationGetRelationName(indrel), |
| stats->num_index_tuples, |
| stats->num_pages), |
| errdetail("%u index pages have been deleted, %u are currently reusable.\n" |
| "%s.", |
| stats->pages_deleted, stats->pages_free, |
| pg_rusage_show(&ru0)))); |
| |
| /* |
| * Check for tuple count mismatch. If the index is partial, then it's OK |
| * for it to have fewer tuples than the heap; otherwise we have trouble. |
| */ |
| if (stats->num_index_tuples != num_tuples) |
| { |
| if (stats->num_index_tuples > num_tuples || |
| !vac_is_partial_index(indrel)) |
| ereport(WARNING, |
| (errmsg("index \"%s\" contains %.0f row versions, but table contains %.0f row versions", |
| RelationGetRelationName(indrel), |
| stats->num_index_tuples, num_tuples), |
| errhint("Rebuild the index with REINDEX."))); |
| } |
| |
| pfree(stats); |
| } |
| |
| /* |
| * vacuum_index() -- vacuum one index relation. |
| * |
| * vacpagelist is the VacPageList of the heap we're currently vacuuming. |
| * The heap is exclusive-locked. indrel is an index relation on the |
| * vacuumed heap. |
| * |
| * We don't bother to set locks on the index relation here, since |
| * the parent table is exclusive-locked already. |
| * |
| * Finally, we arrange to update the index relation's statistics in |
| * pg_class. |
| */ |
| static void |
| vacuum_index(VacPageList vacpagelist, Relation indrel, |
| double num_tuples, int keep_tuples, List *updated_stats, List *extra_oids) |
| { |
| IndexBulkDeleteResult *stats; |
| IndexVacuumInfo ivinfo; |
| PGRUsage ru0; |
| |
| pg_rusage_init(&ru0); |
| |
| ivinfo.index = indrel; |
| ivinfo.vacuum_full = true; |
| ivinfo.message_level = elevel; |
| ivinfo.num_heap_tuples = num_tuples + keep_tuples; |
| ivinfo.extra_oids = extra_oids; |
| |
| /* Do bulk deletion */ |
| stats = index_bulk_delete(&ivinfo, NULL, tid_reaped, (void *) vacpagelist); |
| |
| /* Do post-VACUUM cleanup */ |
| stats = index_vacuum_cleanup(&ivinfo, stats); |
| |
| if (!stats) |
| return; |
| |
| /* now update statistics in pg_class */ |
| vac_update_relstats(indrel, |
| stats->num_pages, stats->num_index_tuples, |
| false, InvalidTransactionId, updated_stats); |
| |
| ereport(elevel, |
| (errmsg("index \"%s\" now contains %.0f row versions in %u pages", |
| RelationGetRelationName(indrel), |
| stats->num_index_tuples, |
| stats->num_pages), |
| errdetail("%.0f index row versions were removed.\n" |
| "%u index pages have been deleted, %u are currently reusable.\n" |
| "%s.", |
| stats->tuples_removed, |
| stats->pages_deleted, stats->pages_free, |
| pg_rusage_show(&ru0)))); |
| |
| /* |
| * Check for tuple count mismatch. If the index is partial, then it's OK |
| * for it to have fewer tuples than the heap; otherwise we have trouble. |
| */ |
| if (stats->num_index_tuples != num_tuples + keep_tuples) |
| { |
| if (stats->num_index_tuples > num_tuples + keep_tuples || |
| !vac_is_partial_index(indrel)) |
| ereport(WARNING, |
| (errmsg("index \"%s\" contains %.0f row versions, but table contains %.0f row versions", |
| RelationGetRelationName(indrel), |
| stats->num_index_tuples, num_tuples + keep_tuples), |
| errhint("Rebuild the index with REINDEX."))); |
| } |
| |
| pfree(stats); |
| } |
| |
| /* |
| * tid_reaped() -- is a particular tid reaped? |
| * |
| * This has the right signature to be an IndexBulkDeleteCallback. |
| * |
| * vacpagelist->pagedesc is sorted in the right order (by block number). |
| */ |
| static bool |
| tid_reaped(ItemPointer itemptr, void *state) |
| { |
| VacPageList vacpagelist = (VacPageList) state; |
| OffsetNumber ioffno; |
| OffsetNumber *voff; |
| VacPage vp, |
| *vpp; |
| VacPageData vacpage; |
| |
| vacpage.blkno = ItemPointerGetBlockNumber(itemptr); |
| ioffno = ItemPointerGetOffsetNumber(itemptr); |
| |
| vp = &vacpage; |
| vpp = (VacPage *) vac_bsearch((void *) &vp, |
| (void *) (vacpagelist->pagedesc), |
| vacpagelist->num_pages, |
| sizeof(VacPage), |
| vac_cmp_blk); |
| |
| if (vpp == NULL) |
| return false; |
| |
| /* ok - we are on a partially or fully reaped page */ |
| vp = *vpp; |
| |
| if (vp->offsets_free == 0) |
| { |
| /* this is EmptyPage, so claim all tuples on it are reaped!!! */ |
| return true; |
| } |
| |
| voff = (OffsetNumber *) vac_bsearch((void *) &ioffno, |
| (void *) (vp->offsets), |
| vp->offsets_free, |
| sizeof(OffsetNumber), |
| vac_cmp_offno); |
| |
| if (voff == NULL) |
| return false; |
| |
| /* tid is reaped */ |
| return true; |
| } |
| |
| /* |
| * Update the shared Free Space Map with the info we now have about |
| * free space in the relation, discarding any old info the map may have. |
| */ |
| static void |
| vac_update_fsm(Relation onerel, VacPageList fraged_pages, |
| BlockNumber rel_pages) |
| { |
| int nPages = fraged_pages->num_pages; |
| VacPage *pagedesc = fraged_pages->pagedesc; |
| Size threshold; |
| PageFreeSpaceInfo *pageSpaces; |
| int outPages; |
| int i; |
| |
| /* |
| * We only report pages with free space at least equal to the average |
| * request size --- this avoids cluttering FSM with uselessly-small bits |
| * of space. Although FSM would discard pages with little free space |
| * anyway, it's important to do this prefiltering because (a) it reduces |
| * the time spent holding the FSM lock in RecordRelationFreeSpace, and (b) |
| * FSM uses the number of pages reported as a statistic for guiding space |
| * management. If we didn't threshold our reports the same way |
| * vacuumlazy.c does, we'd be skewing that statistic. |
| */ |
| threshold = GetAvgFSMRequestSize(&onerel->rd_node); |
| |
| pageSpaces = (PageFreeSpaceInfo *) |
| palloc(nPages * sizeof(PageFreeSpaceInfo)); |
| outPages = 0; |
| |
| for (i = 0; i < nPages; i++) |
| { |
| /* |
| * fraged_pages may contain entries for pages that we later decided to |
| * truncate from the relation; don't enter them into the free space |
| * map! |
| */ |
| if (pagedesc[i]->blkno >= rel_pages) |
| break; |
| |
| if (pagedesc[i]->free >= threshold) |
| { |
| pageSpaces[outPages].blkno = pagedesc[i]->blkno; |
| pageSpaces[outPages].avail = pagedesc[i]->free; |
| outPages++; |
| } |
| } |
| |
| RecordRelationFreeSpace(&onerel->rd_node, outPages, outPages, pageSpaces); |
| |
| pfree(pageSpaces); |
| } |
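| |
| /* |
| * Illustrative numbers (hypothetical): if GetAvgFSMRequestSize() returns |
| * 256 and fraged_pages holds surviving pages with 64, 300, and 1000 bytes |
| * free, only the latter two are reported to the FSM; pages at or beyond |
| * rel_pages are skipped because they are about to be truncated away. |
| */ |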
| |
| /* Copy a VacPage structure */ |
| static VacPage |
| copy_vac_page(VacPage vacpage) |
| { |
| VacPage newvacpage; |
| |
| /* allocate a VacPageData entry */ |
| newvacpage = (VacPage) palloc(sizeof(VacPageData) + |
| vacpage->offsets_free * sizeof(OffsetNumber)); |
| |
| /* fill it in */ |
| if (vacpage->offsets_free > 0) |
| memcpy(newvacpage->offsets, vacpage->offsets, |
| vacpage->offsets_free * sizeof(OffsetNumber)); |
| newvacpage->blkno = vacpage->blkno; |
| newvacpage->free = vacpage->free; |
| newvacpage->offsets_used = vacpage->offsets_used; |
| newvacpage->offsets_free = vacpage->offsets_free; |
| |
| return newvacpage; |
| } |
| |
| /* |
| * Add a VacPage pointer to a VacPageList. |
| * |
| * As a side effect of the way that scan_heap works, |
| * higher pages come after lower pages in the array |
| * (and highest tid on a page is last). |
| */ |
| static void |
| vpage_insert(VacPageList vacpagelist, VacPage vpnew) |
| { |
| #define PG_NPAGEDESC 1024 |
| |
| /* allocate a VacPage entry if needed */ |
| if (vacpagelist->num_pages == 0) |
| { |
| vacpagelist->pagedesc = (VacPage *) palloc(PG_NPAGEDESC * sizeof(VacPage)); |
| vacpagelist->num_allocated_pages = PG_NPAGEDESC; |
| } |
| else if (vacpagelist->num_pages >= vacpagelist->num_allocated_pages) |
| { |
| vacpagelist->num_allocated_pages *= 2; |
| vacpagelist->pagedesc = (VacPage *) repalloc(vacpagelist->pagedesc, vacpagelist->num_allocated_pages * sizeof(VacPage)); |
| } |
| vacpagelist->pagedesc[vacpagelist->num_pages] = vpnew; |
| (vacpagelist->num_pages)++; |
| } |
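| |
| /* |
| * Under the doubling scheme above, pagedesc capacity grows 1024, 2048, |
| * 4096, ..., so inserting n pages needs only O(log(n/1024)) repalloc |
| * calls and O(n) amortized copying work. |
| */ |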
| |
| /* |
| * vac_bsearch: just like standard C library routine bsearch(), |
| * except that we first test to see whether the target key is outside |
| * the range of the table entries. This case is handled relatively slowly |
| * by the normal binary search algorithm (ie, no faster than any other key) |
| * but it occurs often enough in VACUUM to be worth optimizing. |
| */ |
| static void * |
| vac_bsearch(const void *key, const void *base, |
| size_t nelem, size_t size, |
| int (*compar) (const void *, const void *)) |
| { |
| int res; |
| const void *last; |
| |
| if (nelem == 0) |
| return NULL; |
| res = compar(key, base); |
| if (res < 0) |
| return NULL; |
| if (res == 0) |
| return (void *) base; |
| if (nelem > 1) |
| { |
| last = (const void *) ((const char *) base + (nelem - 1) * size); |
| res = compar(key, last); |
| if (res > 0) |
| return NULL; |
| if (res == 0) |
| return (void *) last; |
| } |
| if (nelem <= 2) |
| return NULL; /* already checked 'em all */ |
| return bsearch(key, base, nelem, size, compar); |
| } |
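| |
| /* |
| * Illustrative usage (hypothetical data), mirroring the offset search |
| * done in tid_reaped() above: |
| * |
| * OffsetNumber key = 42; |
| * OffsetNumber arr[4] = {3, 17, 42, 80}; |
| * OffsetNumber *hit = (OffsetNumber *) |
| * vac_bsearch(&key, arr, 4, sizeof(OffsetNumber), vac_cmp_offno); |
| * |
| * Neither endpoint pre-check matches, so bsearch() runs and hit ends |
| * up pointing at arr[2]. |
| */ |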
| |
| /* |
| * Comparator routines for use with qsort() and bsearch(). |
| */ |
| static int |
| vac_cmp_blk(const void *left, const void *right) |
| { |
| BlockNumber lblk, |
| rblk; |
| |
| lblk = (*((VacPage *) left))->blkno; |
| rblk = (*((VacPage *) right))->blkno; |
| |
| if (lblk < rblk) |
| return -1; |
| if (lblk == rblk) |
| return 0; |
| return 1; |
| } |
| |
| static int |
| vac_cmp_offno(const void *left, const void *right) |
| { |
| if (*(OffsetNumber *) left < *(OffsetNumber *) right) |
| return -1; |
| if (*(OffsetNumber *) left == *(OffsetNumber *) right) |
| return 0; |
| return 1; |
| } |
| |
| static int |
| vac_cmp_vtlinks(const void *left, const void *right) |
| { |
| if (((VTupleLink) left)->new_tid.ip_blkid.bi_hi < |
| ((VTupleLink) right)->new_tid.ip_blkid.bi_hi) |
| return -1; |
| if (((VTupleLink) left)->new_tid.ip_blkid.bi_hi > |
| ((VTupleLink) right)->new_tid.ip_blkid.bi_hi) |
| return 1; |
| /* bi_hi-es are equal */ |
| if (((VTupleLink) left)->new_tid.ip_blkid.bi_lo < |
| ((VTupleLink) right)->new_tid.ip_blkid.bi_lo) |
| return -1; |
| if (((VTupleLink) left)->new_tid.ip_blkid.bi_lo > |
| ((VTupleLink) right)->new_tid.ip_blkid.bi_lo) |
| return 1; |
| /* bi_lo-es are equal */ |
| if (((VTupleLink) left)->new_tid.ip_posid < |
| ((VTupleLink) right)->new_tid.ip_posid) |
| return -1; |
| if (((VTupleLink) left)->new_tid.ip_posid > |
| ((VTupleLink) right)->new_tid.ip_posid) |
| return 1; |
| return 0; |
| } |
| |
| |
| /* |
| * Open all the indexes of the given relation, obtaining the specified kind |
| * of lock on each. Return an array of Relation pointers for the indexes |
| * into *Irel, and the number of indexes into *nindexes. |
| */ |
| void |
| vac_open_indexes(Relation relation, LOCKMODE lockmode, |
| int *nindexes, Relation **Irel) |
| { |
| List *indexoidlist; |
| ListCell *indexoidscan; |
| int i; |
| |
| Assert(lockmode != NoLock); |
| |
| indexoidlist = RelationGetIndexList(relation); |
| |
| *nindexes = list_length(indexoidlist); |
| |
| if (*nindexes > 0) |
| *Irel = (Relation *) palloc(*nindexes * sizeof(Relation)); |
| else |
| *Irel = NULL; |
| |
| i = 0; |
| foreach(indexoidscan, indexoidlist) |
| { |
| Oid indexoid = lfirst_oid(indexoidscan); |
| |
| (*Irel)[i++] = index_open(indexoid, lockmode); |
| } |
| |
| list_free(indexoidlist); |
| } |
| |
| /* |
| * Release the resources acquired by vac_open_indexes. Optionally release |
| * the locks (say NoLock to keep 'em). |
| */ |
| void |
| vac_close_indexes(int nindexes, Relation *Irel, LOCKMODE lockmode) |
| { |
| if (Irel == NULL) |
| return; |
| |
| while (nindexes--) |
| { |
| Relation ind = Irel[nindexes]; |
| |
| index_close(ind, lockmode); |
| } |
| pfree(Irel); |
| } |
| |
| |
| /* |
| * Is an index partial (ie, could it contain fewer tuples than the heap?) |
| */ |
| bool |
| vac_is_partial_index(Relation indrel) |
| { |
| /* |
| * If the index's AM doesn't support nulls, it's partial for our purposes |
| */ |
| if (!indrel->rd_am->amindexnulls) |
| return true; |
| |
| /* Otherwise, look to see if there's a partial-index predicate */ |
| if (!heap_attisnull(indrel->rd_indextuple, Anum_pg_index_indpred)) |
| return true; |
| |
| return false; |
| } |
| |
| |
| static bool |
| enough_space(VacPage vacpage, Size len) |
| { |
| len = MAXALIGN(len); |
| |
| if (len > vacpage->free) |
| return false; |
| |
| /* if there are free itemid(s) and len <= free_space... */ |
| if (vacpage->offsets_used < vacpage->offsets_free) |
| return true; |
| |
	/* offsets_used >= offsets_free, so we'd have to allocate a new itemid */
| if (len + sizeof(ItemIdData) <= vacpage->free) |
| return true; |
| |
| return false; |
| } |
| |
| static Size |
| PageGetFreeSpaceWithFillFactor(Relation relation, Page page) |
| { |
| PageHeader pd = (PageHeader) page; |
| Size freespace = pd->pd_upper - pd->pd_lower; |
| Size targetfree; |
| |
| targetfree = RelationGetTargetPageFreeSpace(relation, |
| HEAP_DEFAULT_FILLFACTOR); |
| if (freespace > targetfree) |
| return freespace - targetfree; |
| else |
| return 0; |
| } |
| |
| /* |
| * vacuum_delay_point --- check for interrupts and cost-based delay. |
| * |
| * This should be called in each major loop of VACUUM processing, |
| * typically once per page processed. |
| */ |
| void |
vacuum_delay_point(void)
| { |
| /* Always check for interrupts */ |
| CHECK_FOR_INTERRUPTS(); |
| |
| /* Nap if appropriate */ |
| if (VacuumCostActive && !InterruptPending && |
| VacuumCostBalance >= VacuumCostLimit) |
| { |
| int msec; |
| |
| msec = VacuumCostDelay * VacuumCostBalance / VacuumCostLimit; |
| if (msec > VacuumCostDelay * 4) |
| msec = VacuumCostDelay * 4; |
| |
| pg_usleep(msec * 1000L); |
| |
| VacuumCostBalance = 0; |
| |
| /* Might have gotten an interrupt while sleeping */ |
| CHECK_FOR_INTERRUPTS(); |
| } |
| } |
| |
| /* |
| * open_relation_and_check_permission -- open the relation with an appropriate |
| * lock based on the vacuum statement, and check for the permissions on this |
| * relation. |
| */ |
| static Relation |
| open_relation_and_check_permission(VacuumStmt *vacstmt, |
| Oid relid, |
| char expected_relkind) |
| { |
| Relation onerel; |
| LOCKMODE lmode; |
| |
| /* |
| * Determine the type of lock we want --- hard exclusive lock for a FULL |
| * vacuum, but just ShareUpdateExclusiveLock for concurrent vacuum. Either |
| * way, we can be sure that no other backend is vacuuming the same table. |
| * For analyze, we use ShareUpdateExclusiveLock. |
| */ |
| if (!vacstmt->vacuum) |
| lmode = ShareUpdateExclusiveLock; |
| else |
| lmode = vacstmt->full ? AccessExclusiveLock : ShareUpdateExclusiveLock; |
| |
| /* |
| * Open the relation and get the appropriate lock on it. |
| * |
| * There's a race condition here: the rel may have gone away since the |
| * last time we saw it. If so, we don't need to vacuum it. |
| */ |
| onerel = try_relation_open(relid, lmode, false); |
| |
| if (!onerel) |
| return NULL; |
| |
| /* |
| * Check permissions. |
| * |
| * We allow the user to vacuum a table if he is superuser, the table |
| * owner, or the database owner (but in the latter case, only if it's not |
| * a shared relation). pg_class_ownercheck includes the superuser case. |
| * |
| * Note we choose to treat permissions failure as a WARNING and keep |
| * trying to vacuum the rest of the DB --- is this appropriate? |
| */ |
| if (!(pg_class_ownercheck(RelationGetRelid(onerel), GetUserId()) || |
| (pg_database_ownercheck(MyDatabaseId, GetUserId()) && !onerel->rd_rel->relisshared))) |
| { |
| if (Gp_role != GP_ROLE_EXECUTE) |
| ereport(WARNING, |
| (errmsg("skipping \"%s\" --- only table or database owner can vacuum it", |
| RelationGetRelationName(onerel)))); |
| relation_close(onerel, lmode); |
| return NULL; |
| } |
| |
| /* |
| * Check that it's a plain table; we used to do this in get_rel_oids() but |
| * seems safer to check after we've locked the relation. |
| */ |
| if (onerel->rd_rel->relkind != expected_relkind || RelationIsExternal(onerel)) |
| { |
| ereport(WARNING, |
| (errmsg("skipping \"%s\" --- cannot vacuum indexes, views, external tables, or special system tables", |
| RelationGetRelationName(onerel)))); |
| relation_close(onerel, lmode); |
| return NULL; |
| } |
| |
| /* |
| * Silently ignore tables that are temp tables of other backends --- |
| * trying to vacuum these will lead to great unhappiness, since their |
| * contents are probably not up-to-date on disk. (We don't throw a |
| * warning here; it would just lead to chatter during a database-wide |
| * VACUUM.) |
| */ |
| if (isOtherTempNamespace(RelationGetNamespace(onerel))) |
| { |
| relation_close(onerel, lmode); |
| return NULL; |
| } |
| |
| /* |
| * We can ANALYZE any table except pg_statistic. See update_attstats |
| */ |
| if (vacstmt->analyze && RelationGetRelid(onerel) == StatisticRelationId) |
| { |
| relation_close(onerel, ShareUpdateExclusiveLock); |
| return NULL; |
| } |
| |
| return onerel; |
| } |
| |
| /* |
| * Generate three oids for each bitmap index in a given relation. |
| * |
| * These oids will be used in QD and QEs for new relfilenodes during |
| * reindexing a bitmap index. |
| * |
| * The index oid along with these three oids will be stored consecutively |
| * in vacstmt->extra_oids. |
| */ |
| void |
| gen_oids_for_bitmaps(VacuumStmt *vacstmt, Relation onerel) |
| { |
| Relation *Irel = NULL; |
| int nindexes; |
| int index_no; |
| |
| vac_open_indexes(onerel, AccessShareLock, &nindexes, &Irel); |
| if (Irel == NULL) |
| return; |
| |
| Assert(nindexes > 0); |
| for (index_no = 0; index_no < nindexes; index_no++) |
| { |
| /* |
| * If this relation is a bitmap index, we generate three OIDs |
| * for relfilenodes needed for vacuuming a bitmap index. We do this |
| * NUM_EXTRA_OIDS_FOR_BITMAP to handle the case when reindex is called |
| * multiple times, such as "vacuum full" and etc. |
| */ |
| Oid indoid = RelationGetRelid(Irel[index_no]); |
| Oid tblspc = Irel[index_no]->rd_rel->reltablespace; |
| bool shared = Irel[index_no]->rd_rel->relisshared; |
| int i; |
| |
| if (RelationIsBitmapIndex(Irel[index_no])) |
| { |
| vacstmt->extra_oids = lappend_oid(vacstmt->extra_oids, |
| indoid); |
| Assert(NUM_EXTRA_OIDS_FOR_BITMAP % 3 == 0); |
| |
			for (i = 0; i < NUM_EXTRA_OIDS_FOR_BITMAP; i++)
			{
				vacstmt->extra_oids = lappend_oid(vacstmt->extra_oids,
												  GetNewRelFileNode(tblspc,
																	shared,
																	NULL,
																	false));
			}
| } |
| } |
| |
| vac_close_indexes(nindexes, Irel, AccessShareLock); |
| } |
| |
| /* |
| * Obtain extra oids for a given index. |
| * |
| * If the given index is a bitmap index, extra oids are returned. Otherwise, |
| * NIL is returned. |
| * |
| * occurrence determines the offset of the OIDs in the list. |
| * |
| * If there are no extra oids available for the bitmap index, ereport |
| * is called. |
| * |
| * The caller is responsible to free the space. |
| */ |
| List * |
| get_oids_for_bitmap(List *all_extra_oids, Relation Irel, |
| Relation onerel, int occurrence) |
| { |
| List *extra_oids = NIL; |
| int count = 0; |
| bool found = false; |
| ListCell *lc; |
| int oid_index = 0; |
| |
| if (!RelationIsBitmapIndex(Irel)) |
| return extra_oids; |
| |
| foreach(lc, all_extra_oids) |
| { |
| if (found) |
| { |
| if (oid_index / 3 == occurrence - 1) |
| { |
| extra_oids = lappend_oid(extra_oids, lfirst_oid(lc)); |
| if (list_length(extra_oids) == 3) |
| break; |
| } |
| |
			oid_index++;
| |
| if (oid_index % NUM_EXTRA_OIDS_FOR_BITMAP == 0) |
| break; |
| } |
| |
| if (count % (NUM_EXTRA_OIDS_FOR_BITMAP + 1) == 0 && |
| lfirst_oid(lc) == RelationGetRelid(Irel)) |
| { |
| found = true; |
| oid_index = 0; |
| } |
| |
| count++; |
| } |
| |
	if (extra_oids == NIL)
		ereport(ERROR,
				(errmsg("cannot vacuum relation \"%s\" with bitmap indexes",
						RelationGetRelationName(onerel)),
				 errhint("Increase maintenance_work_mem or drop the bitmap "
						 "index, then try again.")));

	Assert(list_length(extra_oids) == 3);
| return extra_oids; |
| } |
| |