| /*------------------------------------------------------------------------- |
| * |
| * nbtxlog.h |
| * header file for postgres btree xlog routines |
| * |
| * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * src/include/access/nbtxlog.h |
| * |
| *------------------------------------------------------------------------- |
| */ |
| #ifndef NBTXLOG_H |
| #define NBTXLOG_H |
| |
| #include "access/transam.h" |
| #include "access/xlogreader.h" |
| #include "lib/stringinfo.h" |
| #include "storage/off.h" |
| |
| /* |
| * XLOG records for btree operations |
| * |
| * XLOG allows to store some information in high 4 bits of log |
| * record xl_info field |
| */ |
| #define XLOG_BTREE_INSERT_LEAF 0x00 /* add index tuple without split */ |
| #define XLOG_BTREE_INSERT_UPPER 0x10 /* same, on a non-leaf page */ |
| #define XLOG_BTREE_INSERT_META 0x20 /* same, plus update metapage */ |
| #define XLOG_BTREE_SPLIT_L 0x30 /* add index tuple with split */ |
| #define XLOG_BTREE_SPLIT_R 0x40 /* as above, new item on right */ |
| #define XLOG_BTREE_INSERT_POST 0x50 /* add index tuple with posting split */ |
| #define XLOG_BTREE_DEDUP 0x60 /* deduplicate tuples for a page */ |
| #define XLOG_BTREE_DELETE 0x70 /* delete leaf index tuples for a page */ |
| #define XLOG_BTREE_UNLINK_PAGE 0x80 /* delete a half-dead page */ |
| #define XLOG_BTREE_UNLINK_PAGE_META 0x90 /* same, and update metapage */ |
| #define XLOG_BTREE_NEWROOT 0xA0 /* new root page */ |
| #define XLOG_BTREE_MARK_PAGE_HALFDEAD 0xB0 /* mark a leaf as half-dead */ |
| #define XLOG_BTREE_VACUUM 0xC0 /* delete entries on a page during |
| * vacuum */ |
| #define XLOG_BTREE_REUSE_PAGE 0xD0 /* old page is about to be reused from |
| * FSM */ |
| #define XLOG_BTREE_META_CLEANUP 0xE0 /* update cleanup-related data in the |
| * metapage */ |
| |
| /* |
| * All that we need to regenerate the meta-data page |
| */ |
| typedef struct xl_btree_metadata |
| { |
| uint32 version; |
| BlockNumber root; |
| uint32 level; |
| BlockNumber fastroot; |
| uint32 fastlevel; |
| uint32 last_cleanup_num_delpages; |
| bool allequalimage; |
| } xl_btree_metadata; |
| |
| /* |
| * This is what we need to know about simple (without split) insert. |
| * |
| * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META, and |
| * INSERT_POST. Note that INSERT_META and INSERT_UPPER implies it's not a |
| * leaf page, while INSERT_POST and INSERT_LEAF imply that it must be a leaf |
| * page. |
| * |
| * Backup Blk 0: original page |
| * Backup Blk 1: child's left sibling, if INSERT_UPPER or INSERT_META |
| * Backup Blk 2: xl_btree_metadata, if INSERT_META |
| * |
| * Note: The new tuple is actually the "original" new item in the posting |
| * list split insert case (i.e. the INSERT_POST case). A split offset for |
| * the posting list is logged before the original new item. Recovery needs |
| * both, since it must do an in-place update of the existing posting list |
| * that was split as an extra step. Also, recovery generates a "final" |
| * newitem. See _bt_swap_posting() for details on posting list splits. |
| */ |
| typedef struct xl_btree_insert |
| { |
| OffsetNumber offnum; |
| |
| /* POSTING SPLIT OFFSET FOLLOWS (INSERT_POST case) */ |
| /* NEW TUPLE ALWAYS FOLLOWS AT THE END */ |
| } xl_btree_insert; |
| |
| #define SizeOfBtreeInsert (offsetof(xl_btree_insert, offnum) + sizeof(OffsetNumber)) |
| |
| /* |
| * On insert with split, we save all the items going into the right sibling |
| * so that we can restore it completely from the log record. This way takes |
| * less xlog space than the normal approach, because if we did it standardly, |
| * XLogInsert would almost always think the right page is new and store its |
| * whole page image. The left page, however, is handled in the normal |
| * incremental-update fashion. |
| * |
| * Note: XLOG_BTREE_SPLIT_L and XLOG_BTREE_SPLIT_R share this data record. |
| * There are two variants to indicate whether the inserted tuple went into the |
| * left or right split page (and thus, whether the new item is stored or not). |
| * We always log the left page high key because suffix truncation can generate |
| * a new leaf high key using user-defined code. This is also necessary on |
| * internal pages, since the firstright item that the left page's high key was |
| * based on will have been truncated to zero attributes in the right page (the |
| * separator key is unavailable from the right page). |
| * |
| * Backup Blk 0: original page / new left page |
| * |
| * The left page's data portion contains the new item, if it's the _L variant. |
| * _R variant split records generally do not have a newitem (_R variant leaf |
| * page split records that must deal with a posting list split will include an |
| * explicit newitem, though it is never used on the right page -- it is |
| * actually an orignewitem needed to update existing posting list). The new |
| * high key of the left/original page appears last of all (and must always be |
| * present). |
| * |
| * Page split records that need the REDO routine to deal with a posting list |
| * split directly will have an explicit newitem, which is actually an |
| * orignewitem (the newitem as it was before the posting list split, not |
| * after). A posting list split always has a newitem that comes immediately |
| * after the posting list being split (which would have overlapped with |
| * orignewitem prior to split). Usually REDO must deal with posting list |
| * splits with an _L variant page split record, and usually both the new |
| * posting list and the final newitem go on the left page (the existing |
| * posting list will be inserted instead of the old, and the final newitem |
| * will be inserted next to that). However, _R variant split records will |
| * include an orignewitem when the split point for the page happens to have a |
| * lastleft tuple that is also the posting list being split (leaving newitem |
| * as the page split's firstright tuple). The existence of this corner case |
| * does not change the basic fact about newitem/orignewitem for the REDO |
| * routine: it is always state used for the left page alone. (This is why the |
| * record's postingoff field isn't a reliable indicator of whether or not a |
| * posting list split occurred during the page split; a non-zero value merely |
| * indicates that the REDO routine must reconstruct a new posting list tuple |
| * that is needed for the left page.) |
| * |
| * This posting list split handling is equivalent to the xl_btree_insert REDO |
| * routine's INSERT_POST handling. While the details are more complicated |
| * here, the concept and goals are exactly the same. See _bt_swap_posting() |
| * for details on posting list splits. |
| * |
| * Backup Blk 1: new right page |
| * |
| * The right page's data portion contains the right page's tuples in the form |
| * used by _bt_restore_page. This includes the new item, if it's the _R |
| * variant. The right page's tuples also include the right page's high key |
| * with either variant (moved from the left/original page during the split), |
| * unless the split happened to be of the rightmost page on its level, where |
| * there is no high key for new right page. |
| * |
| * Backup Blk 2: next block (orig page's rightlink), if any |
| * Backup Blk 3: child's left sibling, if non-leaf split |
| */ |
| typedef struct xl_btree_split |
| { |
| uint32 level; /* tree level of page being split */ |
| OffsetNumber firstrightoff; /* first origpage item on rightpage */ |
| OffsetNumber newitemoff; /* new item's offset */ |
| uint16 postingoff; /* offset inside orig posting tuple */ |
| } xl_btree_split; |
| |
| #define SizeOfBtreeSplit (offsetof(xl_btree_split, postingoff) + sizeof(uint16)) |
| |
| /* |
| * When page is deduplicated, consecutive groups of tuples with equal keys are |
| * merged together into posting list tuples. |
| * |
| * The WAL record represents a deduplication pass for a leaf page. An array |
| * of BTDedupInterval structs follows. |
| */ |
| typedef struct xl_btree_dedup |
| { |
| uint16 nintervals; |
| |
| /* DEDUPLICATION INTERVALS FOLLOW */ |
| } xl_btree_dedup; |
| |
| #define SizeOfBtreeDedup (offsetof(xl_btree_dedup, nintervals) + sizeof(uint16)) |
| |
| /* |
| * This is what we need to know about page reuse within btree. This record |
| * only exists to generate a conflict point for Hot Standby. |
| * |
| * Note that we must include a RelFileNode in the record because we don't |
| * actually register the buffer with the record. |
| */ |
| typedef struct xl_btree_reuse_page |
| { |
| RelFileNode node; |
| BlockNumber block; |
| FullTransactionId latestRemovedFullXid; |
| } xl_btree_reuse_page; |
| |
| #define SizeOfBtreeReusePage (sizeof(xl_btree_reuse_page)) |
| |
| /* |
| * xl_btree_vacuum and xl_btree_delete records describe deletion of index |
| * tuples on a leaf page. The former variant is used by VACUUM, while the |
| * latter variant is used by the ad-hoc deletions that sometimes take place |
| * when btinsert() is called. |
| * |
| * The records are very similar. The only difference is that xl_btree_delete |
| * has to include a latestRemovedXid field to generate recovery conflicts. |
| * (VACUUM operations can just rely on earlier conflicts generated during |
| * pruning of the table whose TIDs the to-be-deleted index tuples point to. |
| * There are also small differences between each REDO routine that we don't go |
| * into here.) |
| * |
| * xl_btree_vacuum and xl_btree_delete both represent deletion of any number |
| * of index tuples on a single leaf page using page offset numbers. Both also |
| * support "updates" of index tuples, which is how deletes of a subset of TIDs |
| * contained in an existing posting list tuple are implemented. |
| * |
| * Updated posting list tuples are represented using xl_btree_update metadata. |
| * The REDO routines each use the xl_btree_update entries (plus each |
| * corresponding original index tuple from the target leaf page) to generate |
| * the final updated tuple. |
| * |
| * Updates are only used when there will be some remaining TIDs left by the |
| * REDO routine. Otherwise the posting list tuple just gets deleted outright. |
| */ |
| typedef struct xl_btree_vacuum |
| { |
| uint16 ndeleted; |
| uint16 nupdated; |
| |
| /* DELETED TARGET OFFSET NUMBERS FOLLOW */ |
| /* UPDATED TARGET OFFSET NUMBERS FOLLOW */ |
| /* UPDATED TUPLES METADATA (xl_btree_update) ARRAY FOLLOWS */ |
| } xl_btree_vacuum; |
| |
| #define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, nupdated) + sizeof(uint16)) |
| |
| typedef struct xl_btree_delete |
| { |
| TransactionId latestRemovedXid; |
| uint16 ndeleted; |
| uint16 nupdated; |
| |
| /* DELETED TARGET OFFSET NUMBERS FOLLOW */ |
| /* UPDATED TARGET OFFSET NUMBERS FOLLOW */ |
| /* UPDATED TUPLES METADATA (xl_btree_update) ARRAY FOLLOWS */ |
| } xl_btree_delete; |
| |
| #define SizeOfBtreeDelete (offsetof(xl_btree_delete, nupdated) + sizeof(uint16)) |
| |
| /* |
| * The offsets that appear in xl_btree_update metadata are offsets into the |
| * original posting list from tuple, not page offset numbers. These are |
| * 0-based. The page offset number for the original posting list tuple comes |
| * from the main xl_btree_vacuum/xl_btree_delete record. |
| */ |
| typedef struct xl_btree_update |
| { |
| uint16 ndeletedtids; |
| |
| /* POSTING LIST uint16 OFFSETS TO A DELETED TID FOLLOW */ |
| } xl_btree_update; |
| |
| #define SizeOfBtreeUpdate (offsetof(xl_btree_update, ndeletedtids) + sizeof(uint16)) |
| |
| /* |
| * This is what we need to know about marking an empty subtree for deletion. |
| * The target identifies the tuple removed from the parent page (note that we |
| * remove this tuple's downlink and the *following* tuple's key). Note that |
| * the leaf page is empty, so we don't need to store its content --- it is |
| * just reinitialized during recovery using the rest of the fields. |
| * |
| * Backup Blk 0: leaf block |
| * Backup Blk 1: top parent |
| */ |
| typedef struct xl_btree_mark_page_halfdead |
| { |
| OffsetNumber poffset; /* deleted tuple id in parent page */ |
| |
| /* information needed to recreate the leaf page: */ |
| BlockNumber leafblk; /* leaf block ultimately being deleted */ |
| BlockNumber leftblk; /* leaf block's left sibling, if any */ |
| BlockNumber rightblk; /* leaf block's right sibling */ |
| BlockNumber topparent; /* topmost internal page in the subtree */ |
| } xl_btree_mark_page_halfdead; |
| |
| #define SizeOfBtreeMarkPageHalfDead (offsetof(xl_btree_mark_page_halfdead, topparent) + sizeof(BlockNumber)) |
| |
| /* |
| * This is what we need to know about deletion of a btree page. Note that we |
| * only leave behind a small amount of bookkeeping information in deleted |
| * pages (deleted pages must be kept around as tombstones for a while). It is |
| * convenient for the REDO routine to regenerate its target page from scratch. |
| * This is why WAL record describes certain details that are actually directly |
| * available from the target page. |
| * |
| * Backup Blk 0: target block being deleted |
| * Backup Blk 1: target block's left sibling, if any |
| * Backup Blk 2: target block's right sibling |
| * Backup Blk 3: leaf block (if different from target) |
| * Backup Blk 4: metapage (if rightsib becomes new fast root) |
| */ |
| typedef struct xl_btree_unlink_page |
| { |
| BlockNumber leftsib; /* target block's left sibling, if any */ |
| BlockNumber rightsib; /* target block's right sibling */ |
| uint32 level; /* target block's level */ |
| FullTransactionId safexid; /* target block's BTPageSetDeleted() XID */ |
| |
| /* |
| * Information needed to recreate a half-dead leaf page with correct |
| * topparent link. The fields are only used when deletion operation's |
| * target page is an internal page. REDO routine creates half-dead page |
| * from scratch to keep things simple (this is the same convenient |
| * approach used for the target page itself). |
| */ |
| BlockNumber leafleftsib; |
| BlockNumber leafrightsib; |
| BlockNumber leaftopparent; /* next child down in the subtree */ |
| |
| /* xl_btree_metadata FOLLOWS IF XLOG_BTREE_UNLINK_PAGE_META */ |
| } xl_btree_unlink_page; |
| |
| #define SizeOfBtreeUnlinkPage (offsetof(xl_btree_unlink_page, leaftopparent) + sizeof(BlockNumber)) |
| |
| /* |
| * New root log record. There are zero tuples if this is to establish an |
| * empty root, or two if it is the result of splitting an old root. |
| * |
| * Note that although this implies rewriting the metadata page, we don't need |
| * an xl_btree_metadata record --- the rootblk and level are sufficient. |
| * |
| * Backup Blk 0: new root page (2 tuples as payload, if splitting old root) |
| * Backup Blk 1: left child (if splitting an old root) |
| * Backup Blk 2: metapage |
| */ |
| typedef struct xl_btree_newroot |
| { |
| BlockNumber rootblk; /* location of new root (redundant with blk 0) */ |
| uint32 level; /* its tree level */ |
| } xl_btree_newroot; |
| |
| #define SizeOfBtreeNewroot (offsetof(xl_btree_newroot, level) + sizeof(uint32)) |
| |
| |
| /* |
| * prototypes for functions in nbtxlog.c |
| */ |
| extern void btree_redo(XLogReaderState *record); |
| extern void btree_desc(StringInfo buf, XLogReaderState *record); |
| extern const char *btree_identify(uint8 info); |
| extern void btree_xlog_startup(void); |
| extern void btree_xlog_cleanup(void); |
| extern void btree_mask(char *pagedata, BlockNumber blkno); |
| |
| #endif /* NBTXLOG_H */ |