| /*----------------------------------------------------------------------------- |
| * |
| * appendonlyblockdirectory.c |
| * maintain the block directory for blocks in append-only relation files. |
| * |
| * Portions Copyright (c) 2009, Greenplum Inc. |
| * Portions Copyright (c) 2012-Present VMware, Inc. or its affiliates. |
| * |
| * |
| * IDENTIFICATION |
| * src/backend/access/appendonly/appendonlyblockdirectory.c |
| * |
| *----------------------------------------------------------------------------- |
| */ |
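| |
| /* |
| * Typical call sequences, as a sketch of how the entry points in this file |
| * fit together (the callers live elsewhere): |
| * |
| * Insert path: |
| * AppendOnlyBlockDirectory_Init_forInsert() |
| * AppendOnlyBlockDirectory_InsertEntry() -- once per block written |
| * AppendOnlyBlockDirectory_End_forInsert() |
| * |
| * Lookup path (e.g. index fetches): |
| * AppendOnlyBlockDirectory_Init_forSearch() |
| * AppendOnlyBlockDirectory_GetEntry() -- once per tuple looked up |
| * AppendOnlyBlockDirectory_End_forSearch() |
| */ |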
| #include "postgres.h" |
| |
| #include "access/xact.h" |
| #include "cdb/cdbappendonlyblockdirectory.h" |
| #include "catalog/aoblkdir.h" |
| #include "catalog/pg_appendonly.h" |
| #include "access/heapam.h" |
| #include "access/genam.h" |
| #include "parser/parse_oper.h" |
| #include "utils/lsyscache.h" |
| #include "utils/memutils.h" |
| #include "utils/faultinjector.h" |
| #include "utils/guc.h" |
| #include "utils/fmgroids.h" |
| #include "cdb/cdbappendonlyam.h" |
| |
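| /* |
| * GUCs controlling the block directory: gp_blockdirectory_entry_min_range |
| * is the minimum file-offset distance between consecutive directory entries |
| * (see AppendOnlyBlockDirectory_InsertEntry()), and |
| * gp_blockdirectory_minipage_size is the number of entries kept in each |
| * minipage. |
| */ |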
| int gp_blockdirectory_entry_min_range = 0; |
| int gp_blockdirectory_minipage_size = NUM_MINIPAGE_ENTRIES; |
| |
| static void load_last_minipage( |
| AppendOnlyBlockDirectory *blockDirectory, |
| int64 lastSequence, |
| int columnGroupNo); |
| static void init_scankeys( |
| TupleDesc tupleDesc, |
| int nkeys, ScanKey scanKeys, |
| StrategyNumber *strategyNumbers); |
| static int find_minipage_entry( |
| Minipage *minipage, |
| uint32 numEntries, |
| int64 rowNum); |
| static void extract_minipage( |
| AppendOnlyBlockDirectory *blockDirectory, |
| HeapTuple tuple, |
| TupleDesc tupleDesc, |
| int columnGroupNo); |
| static void write_minipage(AppendOnlyBlockDirectory *blockDirectory, |
| int columnGroupNo, |
| MinipagePerColumnGroup *minipageInfo); |
| static bool insert_new_entry(AppendOnlyBlockDirectory *blockDirectory, |
| int columnGroupNo, |
| int64 firstRowNum, |
| int64 fileOffset, |
| int64 rowCount, |
| bool addColAction); |
| static void clear_minipage(MinipagePerColumnGroup *minipagePerColumnGroup); |
| static bool blkdir_entry_exists(AppendOnlyBlockDirectory *blockDirectory, |
| AOTupleId *aoTupleId, |
| int columnGroupNo); |
| |
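| /* |
| * Accessors for the file-offset / row-number range described by a block |
| * directory entry. |
| */ |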
| void |
| AppendOnlyBlockDirectoryEntry_GetBeginRange( |
| AppendOnlyBlockDirectoryEntry *directoryEntry, |
| int64 *fileOffset, |
| int64 *firstRowNum) |
| { |
| *fileOffset = directoryEntry->range.fileOffset; |
| *firstRowNum = directoryEntry->range.firstRowNum; |
| } |
| |
| void |
| AppendOnlyBlockDirectoryEntry_GetEndRange( |
| AppendOnlyBlockDirectoryEntry *directoryEntry, |
| int64 *afterFileOffset, |
| int64 *lastRowNum) |
| { |
| *afterFileOffset = directoryEntry->range.afterFileOffset; |
| *lastRowNum = directoryEntry->range.lastRowNum; |
| } |
| |
| bool |
| AppendOnlyBlockDirectoryEntry_RangeHasRow( |
| AppendOnlyBlockDirectoryEntry *directoryEntry, |
| int64 checkRowNum) |
| { |
| return (checkRowNum >= directoryEntry->range.firstRowNum && |
| checkRowNum <= directoryEntry->range.lastRowNum); |
| } |
| |
| /* |
| * init_internal |
| * |
| * Initialize the block directory structure. |
| */ |
| static void |
| init_internal(AppendOnlyBlockDirectory *blockDirectory) |
| { |
| MemoryContext oldcxt; |
| int numScanKeys; |
| TupleDesc heapTupleDesc; |
| TupleDesc idxTupleDesc; |
| int groupNo; |
| |
| Assert(blockDirectory->blkdirRel != NULL); |
| Assert(blockDirectory->blkdirIdx != NULL); |
| |
| blockDirectory->memoryContext = |
| AllocSetContextCreate(CurrentMemoryContext, |
| "BlockDirectoryContext", |
| ALLOCSET_DEFAULT_MINSIZE, |
| ALLOCSET_DEFAULT_INITSIZE, |
| ALLOCSET_DEFAULT_MAXSIZE); |
| |
| oldcxt = MemoryContextSwitchTo(blockDirectory->memoryContext); |
| |
| heapTupleDesc = RelationGetDescr(blockDirectory->blkdirRel); |
| blockDirectory->values = palloc0(sizeof(Datum) * heapTupleDesc->natts); |
| blockDirectory->nulls = palloc0(sizeof(bool) * heapTupleDesc->natts); |
| blockDirectory->numScanKeys = 3; |
| numScanKeys = blockDirectory->numScanKeys; |
| blockDirectory->scanKeys = palloc0(numScanKeys * sizeof(ScanKeyData)); |
| |
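| /* |
| * The three scan keys correspond to (segno =, columngroupno =, |
| * firstrownum <=); see init_scankeys() and the index scans in |
| * AppendOnlyBlockDirectory_GetEntry(). |
| */ |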
| blockDirectory->strategyNumbers = palloc0(numScanKeys * sizeof(StrategyNumber)); |
| blockDirectory->strategyNumbers[0] = BTEqualStrategyNumber; |
| blockDirectory->strategyNumbers[1] = BTEqualStrategyNumber; |
| blockDirectory->strategyNumbers[2] = BTLessEqualStrategyNumber; |
| |
| idxTupleDesc = RelationGetDescr(blockDirectory->blkdirIdx); |
| |
| init_scankeys(idxTupleDesc, numScanKeys, |
| blockDirectory->scanKeys, |
| blockDirectory->strategyNumbers); |
| |
| /* Initialize the last minipage */ |
| blockDirectory->minipages = |
| palloc0(sizeof(MinipagePerColumnGroup) * blockDirectory->numColumnGroups); |
| for (groupNo = 0; groupNo < blockDirectory->numColumnGroups; groupNo++) |
| { |
| if (blockDirectory->proj && !blockDirectory->proj[groupNo]) |
| { |
| /* Ignore columns that are not projected. */ |
| continue; |
| } |
| MinipagePerColumnGroup *minipageInfo = |
| &blockDirectory->minipages[groupNo]; |
| |
| minipageInfo->minipage = |
| palloc0(minipage_size(NUM_MINIPAGE_ENTRIES)); |
| minipageInfo->numMinipageEntries = 0; |
| ItemPointerSetInvalid(&minipageInfo->tupleTid); |
| } |
| |
| MemoryContextSwitchTo(oldcxt); |
| } |
| |
| /* |
| * AppendOnlyBlockDirectory_Init_forSearch |
| * |
| * Initialize the block directory to handle lookups. |
| * |
| * If the block directory relation for this appendonly relation |
| * does not exist, this function sets blkdirRel and blkdirIdx to NULL |
| * and returns. |
| */ |
| void |
| AppendOnlyBlockDirectory_Init_forSearch( |
| AppendOnlyBlockDirectory *blockDirectory, |
| Snapshot appendOnlyMetaDataSnapshot, |
| FileSegInfo **segmentFileInfo, |
| int totalSegfiles, |
| Relation aoRel, |
| int numColumnGroups, |
| bool isAOCol, |
| bool *proj) |
| { |
| Oid blkdirrelid; |
| Oid blkdiridxid; |
| |
| blockDirectory->aoRel = aoRel; |
| GetAppendOnlyEntryAuxOids(aoRel, NULL, &blkdirrelid, &blkdiridxid, NULL, NULL); |
| |
| if (!OidIsValid(blkdirrelid)) |
| { |
| Assert(!OidIsValid(blkdiridxid)); |
| blockDirectory->blkdirRel = NULL; |
| blockDirectory->blkdirIdx = NULL; |
| |
| return; |
| } |
| |
| ereportif(Debug_appendonly_print_blockdirectory, LOG, |
| (errmsg("Append-only block directory init for search: " |
| "(totalSegfiles, numColumnGroups, isAOCol)=" |
| "(%d, %d, %d)", |
| totalSegfiles, numColumnGroups, isAOCol))); |
| |
| blockDirectory->segmentFileInfo = segmentFileInfo; |
| blockDirectory->totalSegfiles = totalSegfiles; |
| blockDirectory->aoRel = aoRel; |
| blockDirectory->appendOnlyMetaDataSnapshot = appendOnlyMetaDataSnapshot; |
| blockDirectory->numColumnGroups = numColumnGroups; |
| blockDirectory->isAOCol = isAOCol; |
| blockDirectory->proj = proj; |
| blockDirectory->currentSegmentFileNum = -1; |
| |
| Assert(OidIsValid(blkdirrelid)); |
| |
| blockDirectory->blkdirRel = |
| heap_open(blkdirrelid, AccessShareLock); |
| |
| Assert(OidIsValid(blkdiridxid)); |
| |
| blockDirectory->blkdirIdx = |
| index_open(blkdiridxid, AccessShareLock); |
| |
| init_internal(blockDirectory); |
| } |
| |
| /* |
| * AppendOnlyBlockDirectory_Init_forUniqueChecks |
| * |
| * Initializes the block directory to handle lookups for uniqueness checks. |
| * |
| * Note: These lookups will be purely restricted to the block directory relation |
| * itself and will not involve the physical AO relation. |
| * |
| * Note: we defer setting up the appendOnlyMetaDataSnapshot for the block |
| * directory to the index_unique_check() table AM call. This is because |
| * snapshots used for unique index lookups are special and don't follow the |
| * usual allocation or registration mechanism. They may be stack-allocated and a |
| * new snapshot object may be passed to every unique index check (this happens |
| * when SNAPSHOT_DIRTY is passed). While we could technically set up the |
| * metadata snapshot in advance for SNAPSHOT_SELF, deferring it works just |
| * as well. |
| */ |
| void |
| AppendOnlyBlockDirectory_Init_forUniqueChecks( |
| AppendOnlyBlockDirectory *blockDirectory, |
| Relation aoRel, |
| int numColumnGroups, |
| Snapshot snapshot) |
| { |
| Oid blkdirrelid; |
| Oid blkdiridxid; |
| |
| Assert(RelationIsValid(aoRel)); |
| |
| Assert(snapshot->snapshot_type == SNAPSHOT_DIRTY || |
| snapshot->snapshot_type == SNAPSHOT_SELF); |
| |
| GetAppendOnlyEntryAuxOids(aoRel, |
| NULL, &blkdirrelid, &blkdiridxid, NULL, NULL); |
| |
| if (!OidIsValid(blkdirrelid) || !OidIsValid(blkdiridxid)) |
| elog(ERROR, "Could not find block directory for relation: %u", aoRel->rd_id); |
| |
| ereportif(Debug_appendonly_print_blockdirectory, LOG, |
| (errmsg("Append-only block directory init for unique checks"), |
| errdetail("(aoRel = %u, blkdirrel = %u, blkdiridxrel = %u, numColumnGroups = %d)", |
| aoRel->rd_id, blkdirrelid, blkdiridxid, numColumnGroups))); |
| |
| blockDirectory->aoRel = aoRel; |
| blockDirectory->isAOCol = RelationIsAoCols(aoRel); |
| |
| /* Segfile setup is not necessary as physical AO tuples will not be accessed */ |
| blockDirectory->segmentFileInfo = NULL; |
| blockDirectory->totalSegfiles = -1; |
| blockDirectory->currentSegmentFileNum = -1; |
| |
| /* Metadata snapshot assignment is deferred to lookup-time */ |
| blockDirectory->appendOnlyMetaDataSnapshot = InvalidSnapshot; |
| |
| blockDirectory->numColumnGroups = numColumnGroups; |
| blockDirectory->proj = NULL; |
| |
| blockDirectory->blkdirRel = heap_open(blkdirrelid, AccessShareLock); |
| blockDirectory->blkdirIdx = index_open(blkdiridxid, AccessShareLock); |
| |
| init_internal(blockDirectory); |
| } |
| |
| /* |
| * AppendOnlyBlockDirectory_Init_forInsert |
| * |
| * Initialize the block directory to handle inserts. |
| * |
| * If the block directory relation for this appendonly relation |
| * does not exist, this function sets blkdirRel and blkdirIdx to NULL |
| * and returns. |
| */ |
| void |
| AppendOnlyBlockDirectory_Init_forInsert( |
| AppendOnlyBlockDirectory *blockDirectory, |
| Snapshot appendOnlyMetaDataSnapshot, |
| FileSegInfo *segmentFileInfo, |
| int64 lastSequence, |
| Relation aoRel, |
| int segno, |
| int numColumnGroups, |
| bool isAOCol) |
| { |
| int groupNo; |
| Oid blkdirrelid; |
| Oid blkdiridxid; |
| |
| blockDirectory->aoRel = aoRel; |
| blockDirectory->appendOnlyMetaDataSnapshot = appendOnlyMetaDataSnapshot; |
| |
| GetAppendOnlyEntryAuxOids(aoRel, NULL, &blkdirrelid, &blkdiridxid, NULL, NULL); |
| |
| if (!OidIsValid(blkdirrelid)) |
| { |
| Assert(!OidIsValid(blkdiridxid)); |
| blockDirectory->blkdirRel = NULL; |
| blockDirectory->blkdirIdx = NULL; |
| |
| return; |
| } |
| |
| blockDirectory->segmentFileInfo = NULL; |
| blockDirectory->totalSegfiles = -1; |
| blockDirectory->currentSegmentFileInfo = segmentFileInfo; |
| |
| blockDirectory->currentSegmentFileNum = segno; |
| blockDirectory->numColumnGroups = numColumnGroups; |
| blockDirectory->isAOCol = isAOCol; |
| blockDirectory->proj = NULL; |
| |
| Assert(OidIsValid(blkdirrelid)); |
| |
| blockDirectory->blkdirRel = |
| heap_open(blkdirrelid, RowExclusiveLock); |
| |
| Assert(OidIsValid(blkdiridxid)); |
| |
| blockDirectory->blkdirIdx = |
| index_open(blkdiridxid, RowExclusiveLock); |
| |
| blockDirectory->indinfo = CatalogOpenIndexes(blockDirectory->blkdirRel); |
| |
| init_internal(blockDirectory); |
| |
| ereportif(Debug_appendonly_print_blockdirectory, LOG, |
| (errmsg("Append-only block directory init for insert: " |
| "(segno, numColumnGroups, isAOCol, lastSequence)=" |
| "(%d, %d, %d, " INT64_FORMAT ")", |
| segno, numColumnGroups, isAOCol, lastSequence))); |
| |
| /* |
| * Load the last minipages from the block directory relation. |
| */ |
| for (groupNo = 0; groupNo < blockDirectory->numColumnGroups; groupNo++) |
| { |
| load_last_minipage(blockDirectory, lastSequence, groupNo); |
| } |
| } |
| |
| /* |
| * Open block directory relation, initialize scan keys and minipages |
| * for ALTER TABLE ADD COLUMN operation. |
| */ |
| void |
| AppendOnlyBlockDirectory_Init_addCol( |
| AppendOnlyBlockDirectory *blockDirectory, |
| Snapshot appendOnlyMetaDataSnapshot, |
| FileSegInfo *segmentFileInfo, |
| Relation aoRel, |
| int segno, |
| int numColumnGroups, |
| bool isAOCol) |
| { |
| Oid blkdirrelid; |
| Oid blkdiridxid; |
| |
| blockDirectory->aoRel = aoRel; |
| blockDirectory->appendOnlyMetaDataSnapshot = appendOnlyMetaDataSnapshot; |
| |
| GetAppendOnlyEntryAuxOids(aoRel, NULL, &blkdirrelid, &blkdiridxid, NULL, NULL); |
| |
| if (!OidIsValid(blkdirrelid)) |
| { |
| Assert(!OidIsValid(blkdiridxid)); |
| blockDirectory->blkdirRel = NULL; |
| blockDirectory->blkdirIdx = NULL; |
| blockDirectory->numColumnGroups = 0; |
| return; |
| } |
| |
| blockDirectory->segmentFileInfo = NULL; |
| blockDirectory->totalSegfiles = -1; |
| blockDirectory->currentSegmentFileInfo = segmentFileInfo; |
| |
| blockDirectory->currentSegmentFileNum = segno; |
| blockDirectory->numColumnGroups = numColumnGroups; |
| blockDirectory->isAOCol = isAOCol; |
| blockDirectory->proj = NULL; |
| |
| Assert(OidIsValid(blkdirrelid)); |
| |
| /* |
| * TODO: refactor the *_addCol* interface so that opening of |
| * blockdirectory relation and index, init_internal and corresponding |
| * cleanup in *_End_addCol() is called only once during the add-column |
| * operation. Currently, this is being called for every appendonly |
| * segment. |
| */ |
| blockDirectory->blkdirRel = |
| heap_open(blkdirrelid, RowExclusiveLock); |
| |
| Assert(OidIsValid(blkdiridxid)); |
| |
| blockDirectory->blkdirIdx = |
| index_open(blkdiridxid, RowExclusiveLock); |
| |
| blockDirectory->indinfo = CatalogOpenIndexes(blockDirectory->blkdirRel); |
| |
| init_internal(blockDirectory); |
| } |
| |
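| /* |
| * set_directoryentry_range |
| * |
| * Fill in directoryEntry->range from entry entry_no of the in-memory |
| * minipage for the given column group, clamping afterFileOffset to the |
| * segment file's EOF. Returns false if the entry starts beyond the EOF, |
| * i.e. it is out-of-date (e.g. left behind by a crash or cancellation |
| * during an insert). |
| */ |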
| static bool |
| set_directoryentry_range( |
| AppendOnlyBlockDirectory *blockDirectory, |
| int columnGroupNo, |
| int entry_no, |
| AppendOnlyBlockDirectoryEntry *directoryEntry) |
| { |
| MinipagePerColumnGroup *minipageInfo = |
| &blockDirectory->minipages[columnGroupNo]; |
| FileSegInfo *fsInfo; |
| AOCSFileSegInfo *aocsFsInfo = NULL; |
| MinipageEntry *entry; |
| MinipageEntry *next_entry = NULL; |
| |
| Assert(entry_no >= 0 && ((uint32) entry_no) < minipageInfo->numMinipageEntries); |
| |
| fsInfo = blockDirectory->currentSegmentFileInfo; |
| Assert(fsInfo != NULL); |
| |
| if (blockDirectory->isAOCol) |
| { |
| aocsFsInfo = (AOCSFileSegInfo *) fsInfo; |
| } |
| |
| entry = &(minipageInfo->minipage->entry[entry_no]); |
| if (((uint32) entry_no) < minipageInfo->numMinipageEntries - 1) |
| { |
| next_entry = &(minipageInfo->minipage->entry[entry_no + 1]); |
| } |
| |
| directoryEntry->range.fileOffset = entry->fileOffset; |
| directoryEntry->range.firstRowNum = entry->firstRowNum; |
| if (next_entry != NULL) |
| { |
| directoryEntry->range.afterFileOffset = next_entry->fileOffset; |
| } |
| else |
| { |
| if (!blockDirectory->isAOCol) |
| { |
| directoryEntry->range.afterFileOffset = fsInfo->eof; |
| } |
| |
| else |
| { |
| directoryEntry->range.afterFileOffset = |
| aocsFsInfo->vpinfo.entry[columnGroupNo].eof; |
| } |
| } |
| |
| directoryEntry->range.lastRowNum = entry->firstRowNum + entry->rowCount - 1; |
| if (next_entry == NULL && gp_blockdirectory_entry_min_range != 0) |
| { |
| directoryEntry->range.lastRowNum = (~(((int64) 1) << 63)); /* set to the maximal |
| * value */ |
| } |
| |
| /* |
| * If an insert crashed or was cancelled, the block directory may contain |
| * out-of-date entries. Check against the end of file here: if the requested |
| * directory entry starts after the end of file, return false. |
| */ |
| if ((!blockDirectory->isAOCol && |
| directoryEntry->range.fileOffset > fsInfo->eof) || |
| (blockDirectory->isAOCol && |
| directoryEntry->range.fileOffset > |
| aocsFsInfo->vpinfo.entry[columnGroupNo].eof)) |
| return false; |
| |
| if ((!blockDirectory->isAOCol && |
| directoryEntry->range.afterFileOffset > fsInfo->eof)) |
| { |
| directoryEntry->range.afterFileOffset = fsInfo->eof; |
| } |
| |
| if (blockDirectory->isAOCol && |
| directoryEntry->range.afterFileOffset > |
| aocsFsInfo->vpinfo.entry[columnGroupNo].eof) |
| { |
| directoryEntry->range.afterFileOffset = |
| aocsFsInfo->vpinfo.entry[columnGroupNo].eof; |
| } |
| |
| ereportif(Debug_appendonly_print_blockdirectory, LOG, |
| (errmsg("Append-only block directory find entry: " |
| "(columnGroupNo, firstRowNum, fileOffset, lastRowNum, afterFileOffset) = " |
| "(%d, " INT64_FORMAT ", " INT64_FORMAT ", " INT64_FORMAT ", " INT64_FORMAT ")", |
| columnGroupNo, directoryEntry->range.firstRowNum, |
| directoryEntry->range.fileOffset, directoryEntry->range.lastRowNum, |
| directoryEntry->range.afterFileOffset))); |
| |
| return true; |
| } |
| |
| /* |
| * AppendOnlyBlockDirectory_GetEntry |
| * |
| * Find a directory entry for the given AOTupleId in the block directory. |
| * If such an entry is found, return true. Otherwise, return false. |
| * |
| * The range for directoryEntry is assigned accordingly in this function. |
| * |
| * The block directory for the appendonly table should exist before calling |
| * this function. |
| */ |
| bool |
| AppendOnlyBlockDirectory_GetEntry( |
| AppendOnlyBlockDirectory *blockDirectory, |
| AOTupleId *aoTupleId, |
| int columnGroupNo, |
| AppendOnlyBlockDirectoryEntry *directoryEntry) |
| { |
| int segmentFileNum = AOTupleIdGet_segmentFileNum(aoTupleId); |
| int64 rowNum = AOTupleIdGet_rowNum(aoTupleId); |
| int i; |
| Relation blkdirRel = blockDirectory->blkdirRel; |
| Relation blkdirIdx = blockDirectory->blkdirIdx; |
| int numScanKeys = blockDirectory->numScanKeys; |
| ScanKey scanKeys = blockDirectory->scanKeys; |
| |
| TupleDesc heapTupleDesc; |
| FileSegInfo *fsInfo = NULL; |
| SysScanDesc idxScanDesc; |
| HeapTuple tuple = NULL; |
| MinipagePerColumnGroup *minipageInfo = |
| &blockDirectory->minipages[columnGroupNo]; |
| int entry_no = -1; |
| int tmpGroupNo; |
| |
| if (blkdirRel == NULL || blkdirIdx == NULL) |
| { |
| Assert(RelationIsValid(blockDirectory->aoRel)); |
| |
| ereport(ERROR, |
| (errcode(ERRCODE_INTERNAL_ERROR), |
| errmsg("block directory for append-only relation '%s' does not exist", |
| RelationGetRelationName(blockDirectory->aoRel)))); |
| return false; |
| } |
| |
| ereportif(Debug_appendonly_print_blockdirectory, LOG, |
| (errmsg("Append-only block directory get entry: " |
| "(columnGroupNo, segmentFileNum, rowNum) = " |
| "(%d, %d, " INT64_FORMAT ")", |
| columnGroupNo, segmentFileNum, rowNum))); |
| |
| /* |
| * If the segment file number is the same as |
| * blockDirectory->currentSegmentFileNum, the in-memory minipage may |
| * contain such an entry. We search the in-memory minipage first. If such |
| * an entry can not be found, we search for the appropriate minipage by |
| * using the block directory btree index. |
| */ |
| if (segmentFileNum == blockDirectory->currentSegmentFileNum && |
| minipageInfo->numMinipageEntries > 0) |
| { |
| Assert(blockDirectory->currentSegmentFileInfo != NULL); |
| |
| MinipageEntry *firstentry = |
| &minipageInfo->minipage->entry[0]; |
| |
| if (rowNum >= firstentry->firstRowNum) |
| { |
| /* |
| * Check if the existing minipage contains the requested rowNum. |
| * If so, just get it. |
| */ |
| entry_no = find_minipage_entry(minipageInfo->minipage, |
| minipageInfo->numMinipageEntries, |
| rowNum); |
| if (entry_no != -1) |
| { |
| return set_directoryentry_range(blockDirectory, |
| columnGroupNo, |
| entry_no, |
| directoryEntry); |
| |
| } |
| |
| /* |
| * The given rowNum may point to a tuple that does not exist in |
| * the AO table any more, either because of cancellation of an |
| * insert, or due to crashes during an insert. If this is the |
| * case, rowNum is smaller than the highest row number covered |
| * by the in-memory minipage. |
| */ |
| else |
| { |
| MinipageEntry *entry = |
| &minipageInfo->minipage->entry[minipageInfo->numMinipageEntries - 1]; |
| |
| if (rowNum < entry->firstRowNum + entry->rowCount - 1) |
| return false; |
| } |
| } |
| } |
| |
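| /* Locate the FileSegInfo for the segment file that holds this tuple. */ |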
| for (i = 0; i < blockDirectory->totalSegfiles; i++) |
| { |
| fsInfo = blockDirectory->segmentFileInfo[i]; |
| |
| if (!blockDirectory->isAOCol && segmentFileNum == fsInfo->segno) |
| break; |
| else if (blockDirectory->isAOCol && segmentFileNum == |
| ((AOCSFileSegInfo *) fsInfo)->segno) |
| break; |
| } |
| |
| Assert(fsInfo != NULL); |
| |
| /* |
| * Search the btree index to find the minipage that contains the rowNum. |
| * We find the minipages for all column groups, since currently we will |
| * need to access all columns at the same time. |
| */ |
| heapTupleDesc = RelationGetDescr(blkdirRel); |
| |
| Assert(numScanKeys == 3); |
| |
| for (tmpGroupNo = 0; tmpGroupNo < blockDirectory->numColumnGroups; tmpGroupNo++) |
| { |
| if (blockDirectory->proj && !blockDirectory->proj[tmpGroupNo]) |
| { |
| /* Ignore columns that are not projected. */ |
| continue; |
| } |
| /* |
| * Set up the scan keys values. The keys have already been set up in |
| * init_internal() with the following strategy: |
| * (=segmentFileNum, =columnGroupNo, <=rowNum) |
| * See init_internal(). |
| */ |
| Assert(scanKeys != NULL); |
| scanKeys[0].sk_argument = Int32GetDatum(segmentFileNum); |
| scanKeys[1].sk_argument = Int32GetDatum(tmpGroupNo); |
| scanKeys[2].sk_argument = Int64GetDatum(rowNum); |
| |
| idxScanDesc = systable_beginscan_ordered(blkdirRel, blkdirIdx, |
| blockDirectory->appendOnlyMetaDataSnapshot, |
| numScanKeys, scanKeys); |
| |
| tuple = systable_getnext_ordered(idxScanDesc, BackwardScanDirection); |
| |
| if (tuple != NULL) |
| { |
| /* |
| * MPP-17061: we need to update currentSegmentFileNum & |
| * currentSegmentFileInfo at the same time when we load the |
| * minipage for the block directory entry we found, otherwise we |
| * would risk having inconsistency between |
| * currentSegmentFileNum/currentSegmentFileInfo and minipage |
| * contents, which would cause wrong block header offset being |
| * returned in following block directory entry look up. |
| */ |
| blockDirectory->currentSegmentFileNum = segmentFileNum; |
| blockDirectory->currentSegmentFileInfo = fsInfo; |
| |
| extract_minipage(blockDirectory, |
| tuple, |
| heapTupleDesc, |
| tmpGroupNo); |
| } |
| else |
| { |
| /* MPP-17061: index look up failed, row is invisible */ |
| systable_endscan_ordered(idxScanDesc); |
| return false; |
| } |
| |
| systable_endscan_ordered(idxScanDesc); |
| } |
| |
| |
| minipageInfo = &blockDirectory->minipages[columnGroupNo]; |
| |
| /* |
| * Perform a binary search over the minipage to find the entry about |
| * the AO block. |
| */ |
| entry_no = find_minipage_entry(minipageInfo->minipage, |
| minipageInfo->numMinipageEntries, |
| rowNum); |
| |
| /* If there are no entries, return false. */ |
| if (entry_no == -1 && minipageInfo->numMinipageEntries == 0) |
| return false; |
| |
| if (entry_no == -1) |
| { |
| /* |
| * Since the last few blocks may not be logged in the block |
| * directory, we always use the last entry. |
| * |
| * FIXME: If we didn't find a suitable entry, why even use the last |
| * entry? Currently, as it stands we would most likely return |
| * true from this function. This will lead to us having to do a |
| * fetch of the tuple from the physical file in the layer above (see |
| * scanToFetchTuple()), where we would ultimately find the tuple |
| * missing. Would it be correct to set the directory entry here to |
| * be the last one (for caching purposes) and return false, in order |
| * to avoid this physical file read? |
| */ |
| entry_no = minipageInfo->numMinipageEntries - 1; |
| } |
| return set_directoryentry_range(blockDirectory, |
| columnGroupNo, |
| entry_no, |
| directoryEntry); |
| } |
| |
| /* |
| * AppendOnlyBlockDirectory_CoversTuple |
| * |
| * Check if there exists a visible block directory entry that represents a range |
| * in which this tid resides. |
| * |
| * Currently used by index fetches to perform unique constraint validation. A |
| * sysscan of the block directory relation is performed to determine the result. |
| * (see blkdir_entry_exists()) |
| * |
| * Performing a sysscan also has the distinct advantage of setting the xmin/xmax |
| * of the snapshot used to scan, which is a requirement when SNAPSHOT_DIRTY is |
| * used. See _bt_check_unique() and SNAPSHOT_DIRTY for details. |
| * |
| * Note about AOCO tables: |
| * For AOCO tables, there are multiple block directory entries for each tid. |
| * However, it is currently sufficient to check the block directory entry for |
| * just one of these columns. We do so for the 1st non-dropped column. Note that |
| * if we write a placeholder row for the 1st non-dropped column i, there is a |
| * guarantee that if there is a conflict on the placeholder row, the covering |
| * block directory entry will be based on the same column i (as columnar DDL |
| * changes need exclusive locks and placeholder rows can't be seen after tx end) |
| * (We could just have checked the covers condition for column 0, as block |
| * directory entries are inserted even for dropped columns. But, this may change |
| * one day, and we want our code to be future-proof) |
| */ |
| bool |
| AppendOnlyBlockDirectory_CoversTuple( |
| AppendOnlyBlockDirectory *blockDirectory, |
| AOTupleId *aoTupleId) |
| { |
| Relation aoRel = blockDirectory->aoRel; |
| int firstNonDroppedColumn = -1; |
| |
| Assert(RelationIsValid(aoRel)); |
| |
| if (RelationIsAoRows(aoRel)) |
| return blkdir_entry_exists(blockDirectory, aoTupleId, 0); |
| else |
| { |
| for(int i = 0; i < aoRel->rd_att->natts; i++) |
| { |
| if (!aoRel->rd_att->attrs[i].attisdropped) { |
| firstNonDroppedColumn = i; |
| break; |
| } |
| } |
| Assert(firstNonDroppedColumn != -1); |
| |
| return blkdir_entry_exists(blockDirectory, |
| aoTupleId, |
| firstNonDroppedColumn); |
| } |
| } |
| |
| /* |
| * Does a visible block directory entry exist for a given aotid and column no? |
| * Currently used to satisfy unique constraint checks. |
| */ |
| static bool |
| blkdir_entry_exists(AppendOnlyBlockDirectory *blockDirectory, |
| AOTupleId *aoTupleId, |
| int columnGroupNo) |
| { |
| int segmentFileNum = AOTupleIdGet_segmentFileNum(aoTupleId); |
| int64 rowNum = AOTupleIdGet_rowNum(aoTupleId); |
| Relation blkdirRel = blockDirectory->blkdirRel; |
| Relation blkdirIdx = blockDirectory->blkdirIdx; |
| ScanKey scanKeys = blockDirectory->scanKeys; |
| HeapTuple tuple; |
| SysScanDesc idxScanDesc; |
| bool found = false; |
| TupleDesc blkdirTupleDesc; |
| |
| Assert(RelationIsValid(blkdirRel)); |
| |
| ereportif(Debug_appendonly_print_blockdirectory, LOG, |
| (errmsg("Append-only block directory covers tuple check: " |
| "(columnGroupNo, segmentFileNum, rowNum) = " |
| "(%d, %d, " INT64_FORMAT ")", |
| 0, segmentFileNum, rowNum))); |
| |
| blkdirTupleDesc = RelationGetDescr(blkdirRel); |
| |
| /* |
| * Set up the scan keys values. The keys have already been set up in |
| * init_internal() with the following strategy: |
| * (=segmentFileNum, =columnGroupNo, <=rowNum) |
| * See init_internal(). |
| */ |
| Assert(scanKeys != NULL); |
| Assert(blockDirectory->numScanKeys == 3); |
| scanKeys[0].sk_argument = Int32GetDatum(segmentFileNum); |
| scanKeys[1].sk_argument = Int32GetDatum(columnGroupNo); |
| scanKeys[2].sk_argument = Int64GetDatum(rowNum); |
| idxScanDesc = systable_beginscan_ordered(blkdirRel, blkdirIdx, |
| blockDirectory->appendOnlyMetaDataSnapshot, |
| blockDirectory->numScanKeys, |
| scanKeys); |
| |
| /* |
| * |
| * Loop until: |
| * |
| * (1) No rows are returned from the sysscan, as there is no visible row |
| * satisfying the criteria. This is what happens when there is no uniqueness |
| * conflict, when we call this in the context of a uniqueness check. |
| * |
| * (2) We find a row such that: rowNum ∈ [firstRowNum, firstRowNum + rowCount) |
| * (a) The row is a regular block directory row covering the rowNum. |
| * (b) The row is a placeholder block directory row, inserted by |
| * AppendOnlyBlockDirectory_InsertPlaceholder(), which will always |
| * cover the rowNum by virtue of its rowCount = AOTupleId_MaxRowNum. |
| */ |
| while (HeapTupleIsValid(tuple = systable_getnext_ordered(idxScanDesc, BackwardScanDirection))) |
| { |
| /* |
| * Once we have found a matching row, we must also ensure that we check |
| * for a block directory entry, in this row's minipage, that has a range |
| * that covers the rowNum. |
| * |
| * This is necessary for aborted transactions where the index entry |
| * might still be live. In such a case, since our search criteria lack |
| * a lastRowNum, we will match rows where: |
| * firstRowNum < lastRowNum < rowNum |
| * Such rows will obviously not cover the rowNum, thus making inspection |
| * of the row's minipage a necessity. |
| */ |
| MinipagePerColumnGroup *minipageInfo; |
| int entry_no; |
| |
| BlockNumber blockNumber = ItemPointerGetBlockNumberNoCheck(&tuple->t_self); |
| OffsetNumber offsetNumber = ItemPointerGetOffsetNumberNoCheck(&tuple->t_self); |
| elogif(Debug_appendonly_print_blockdirectory, LOG, |
| "For segno = %d, rownum = %ld, tid returned: (%u,%u) " |
| "tuple (xmin, xmax) = (%lu, %lu), snaptype = %d", |
| segmentFileNum, rowNum, blockNumber, offsetNumber, |
| (unsigned long) HeapTupleHeaderGetRawXmin(tuple->t_data), |
| (unsigned long) HeapTupleHeaderGetRawXmax(tuple->t_data), |
| blockDirectory->appendOnlyMetaDataSnapshot->snapshot_type); |
| |
| /* Set this so that we don't blow up in the assert in extract_minipage */ |
| blockDirectory->currentSegmentFileNum = segmentFileNum; |
| extract_minipage(blockDirectory, |
| tuple, |
| blkdirTupleDesc, |
| columnGroupNo); |
| |
| minipageInfo = &blockDirectory->minipages[columnGroupNo]; |
| entry_no = find_minipage_entry(minipageInfo->minipage, |
| minipageInfo->numMinipageEntries, |
| rowNum); |
| if (entry_no != -1) |
| { |
| found = true; |
| break; |
| } |
| } |
| |
| systable_endscan_ordered(idxScanDesc); |
| |
| return found; |
| } |
| |
| /* |
| * AppendOnlyBlockDirectory_InsertEntry |
| * |
| * Insert an entry to the block directory. This entry is appended to the |
| * in-memory minipage. If the minipage is full, it is written to the block |
| * directory relation on disk. After that, the new entry is added to the |
| * new in-memory minipage. |
| * |
| * To reduce the size of a block directory, this function ignores new entries |
| * when the range between the offset value of the latest existing entry and |
| * the offset of the new entry is smaller than gp_blockdirectory_entry_min_range |
| * (if it is set). Otherwise, the latest existing entry is updated with the |
| * new rowCount value, and the given new entry is appended to the in-memory |
| * minipage. |
| * |
| * If the block directory for the appendonly relation does not exist, |
| * this function simply returns false. |
| * |
| * If rowCount is 0, simply return false. |
| */ |
| bool |
| AppendOnlyBlockDirectory_InsertEntry( |
| AppendOnlyBlockDirectory *blockDirectory, |
| int columnGroupNo, |
| int64 firstRowNum, |
| int64 fileOffset, |
| int64 rowCount, |
| bool addColAction) |
| { |
| return insert_new_entry(blockDirectory, columnGroupNo, firstRowNum, |
| fileOffset, rowCount, addColAction); |
| } |
| |
| /* |
| * Helper method used to insert a new minipage entry in the block |
| * directory relation. Refer to AppendOnlyBlockDirectory_InsertEntry() |
| * for more details. |
| * |
| * 1. Checks if the current minipage is full. If yes, it writes the current |
| * minipage to the block directory relation and empties the in-memory area. |
| * This could mean a new block directory tuple is inserted OR an old tuple is |
| * updated. |
| * |
| * 2. "Inserts" the new entry into the current in-memory minipage -> just |
| * fills the next free slot with the supplied function args. |
| * |
| */ |
| static bool |
| insert_new_entry( |
| AppendOnlyBlockDirectory *blockDirectory, |
| int columnGroupNo, |
| int64 firstRowNum, |
| int64 fileOffset, |
| int64 rowCount, |
| bool addColAction) |
| { |
| MinipageEntry *entry = NULL; |
| MinipagePerColumnGroup *minipageInfo; |
| int minipageIndex; |
| |
| if (rowCount == 0) |
| return false; |
| |
| if (blockDirectory->blkdirRel == NULL || |
| blockDirectory->blkdirIdx == NULL) |
| return false; |
| |
| if (addColAction) |
| { |
| /* |
| * columnGroupNo is attribute number of the new column for |
| * addColAction. We need to map it to the right index in the minipage |
| * array. |
| */ |
| int numExistingCols = blockDirectory->aoRel->rd_att->natts - |
| blockDirectory->numColumnGroups; |
| |
| Assert((numExistingCols >= 0) && (numExistingCols - 1 < columnGroupNo)); |
| minipageIndex = columnGroupNo - numExistingCols; |
| } |
| else |
| { |
| minipageIndex = columnGroupNo; |
| } |
| |
| minipageInfo = &blockDirectory->minipages[minipageIndex]; |
| Assert(minipageInfo->numMinipageEntries <= (uint32) NUM_MINIPAGE_ENTRIES); |
| |
| /* |
| * Before we insert the new entry into the current minipage, we should |
| * check if the current minipage is full. If so, we write out the current |
| * minipage to the block directory relation and clear out the last minipage |
| * in-mem, making the current in-mem minipage empty and ready to hold the |
| * new entry (and beyond). |
| */ |
| if (IsMinipageFull(minipageInfo)) |
| { |
| write_minipage(blockDirectory, columnGroupNo, minipageInfo); |
| clear_minipage(minipageInfo); |
| SIMPLE_FAULT_INJECTOR("insert_new_entry_curr_minipage_full"); |
| } |
| |
| /* Now insert the new entry */ |
| Assert(minipageInfo->numMinipageEntries < (uint32) gp_blockdirectory_minipage_size); |
| entry = &(minipageInfo->minipage->entry[minipageInfo->numMinipageEntries]); |
| entry->firstRowNum = firstRowNum; |
| entry->fileOffset = fileOffset; |
| entry->rowCount = rowCount; |
| |
| minipageInfo->numMinipageEntries++; |
| |
| ereportif(Debug_appendonly_print_blockdirectory, LOG, |
| (errmsg("Append-only block directory insert entry: " |
| "(firstRowNum, columnGroupNo, fileOffset, rowCount) = (" INT64_FORMAT |
| ", %d, " INT64_FORMAT ", " INT64_FORMAT ") at index %d", |
| entry->firstRowNum, columnGroupNo, entry->fileOffset, entry->rowCount, |
| minipageInfo->numMinipageEntries - 1))); |
| |
| return true; |
| } |
| |
| /* |
| * AppendOnlyBlockDirectory_DeleteSegmentFile |
| * |
| * Deletes all block directory entries for given segment file of an |
| * append-only relation. |
| */ |
| void |
| AppendOnlyBlockDirectory_DeleteSegmentFile(Relation aoRel, |
| Snapshot snapshot, |
| int segno, |
| int columnGroupNo) |
| { |
| Oid blkdirrelid; |
| Oid blkdiridxid; |
| |
| GetAppendOnlyEntryAuxOids(aoRel, NULL, &blkdirrelid, &blkdiridxid, NULL, NULL); |
| |
| Assert(OidIsValid(blkdirrelid)); |
| Assert(OidIsValid(blkdiridxid)); |
| |
| Relation blkdirRel = table_open(blkdirrelid, RowExclusiveLock); |
| Relation blkdirIdx = index_open(blkdiridxid, RowExclusiveLock); |
| ScanKeyData scanKey; |
| SysScanDesc indexScan; |
| HeapTuple tuple; |
| |
| ScanKeyInit(&scanKey, |
| 1, /* segno */ |
| BTEqualStrategyNumber, |
| F_INT4EQ, |
| Int32GetDatum(segno)); |
| |
| indexScan = systable_beginscan_ordered(blkdirRel, |
| blkdirIdx, |
| snapshot, |
| 1 /* nkeys */, |
| &scanKey); |
| |
| while ((tuple = systable_getnext_ordered(indexScan, ForwardScanDirection)) != NULL) |
| { |
| CatalogTupleDelete(blkdirRel, &tuple->t_self); |
| } |
| systable_endscan_ordered(indexScan); |
| |
| index_close(blkdirIdx, RowExclusiveLock); |
| table_close(blkdirRel, RowExclusiveLock); |
| |
| } |
| |
| /* |
| * init_scankeys |
| * |
| * Initialize the scan keys. |
| */ |
| static void |
| init_scankeys(TupleDesc tupleDesc, |
| int nkeys, ScanKey scanKeys, |
| StrategyNumber *strategyNumbers) |
| { |
| int keyNo; |
| |
| Assert(nkeys <= tupleDesc->natts); |
| |
| for (keyNo = 0; keyNo < nkeys; keyNo++) |
| { |
| Oid atttypid = TupleDescAttr(tupleDesc, keyNo)->atttypid; |
| ScanKey scanKey = (ScanKey) (((char *) scanKeys) + |
| keyNo * sizeof(ScanKeyData)); |
| RegProcedure opfuncid; |
| StrategyNumber strategyNumber = strategyNumbers[keyNo]; |
| |
| Assert(strategyNumber <= BTMaxStrategyNumber && |
| strategyNumber != InvalidStrategy); |
| |
| if (strategyNumber == BTEqualStrategyNumber) |
| { |
| Oid eq_opr; |
| |
| get_sort_group_operators(atttypid, |
| false, true, false, |
| NULL, &eq_opr, NULL, NULL); |
| opfuncid = get_opcode(eq_opr); |
| ScanKeyEntryInitialize(scanKey, |
| 0, /* sk_flag */ |
| keyNo + 1, /* attribute number to scan */ |
| BTEqualStrategyNumber, /* strategy */ |
| InvalidOid, /* strategy subtype */ |
| InvalidOid, /* collation */ |
| opfuncid, /* reg proc to use */ |
| 0 /* constant */ |
| ); |
| } |
| else |
| { |
| Oid gtOid, |
| leOid; |
| |
| get_sort_group_operators(atttypid, |
| false, false, true, |
| NULL, NULL, >Oid, NULL); |
| leOid = get_negator(gtOid); |
| opfuncid = get_opcode(leOid); |
| |
| ScanKeyEntryInitialize(scanKey, |
| 0, /* sk_flag */ |
| keyNo + 1, /* attribute number to scan */ |
| strategyNumber, /* strategy */ |
| InvalidOid, /* strategy subtype */ |
| InvalidOid, /* collation */ |
| opfuncid, /* reg proc to use */ |
| 0 /* constant */ |
| ); |
| } |
| } |
| } |
| |
| |
| /* |
| * extract_minipage |
| * |
| * Extract the minipage info from the given tuple. The tupleTid |
| * is also set here. |
| */ |
| static void |
| extract_minipage(AppendOnlyBlockDirectory *blockDirectory, |
| HeapTuple tuple, |
| TupleDesc tupleDesc, |
| int columnGroupNo) |
| { |
| Datum *values = blockDirectory->values; |
| bool *nulls = blockDirectory->nulls; |
| MinipagePerColumnGroup *minipageInfo = &blockDirectory->minipages[columnGroupNo]; |
| |
| heap_deform_tuple(tuple, tupleDesc, values, nulls); |
| |
| Assert(blockDirectory->currentSegmentFileNum == |
| DatumGetInt32(values[Anum_pg_aoblkdir_segno - 1])); |
| |
| /* |
| * Copy out the minipage |
| */ |
| copy_out_minipage(minipageInfo, |
| values[Anum_pg_aoblkdir_minipage - 1], |
| nulls[Anum_pg_aoblkdir_minipage - 1]); |
| |
| ItemPointerCopy(&tuple->t_self, &minipageInfo->tupleTid); |
| } |
| |
| /* |
| * load_last_minipage |
| * |
| * Search through the block directory btree to find the last row that |
| * contains the last minipage. |
| */ |
| static void |
| load_last_minipage(AppendOnlyBlockDirectory *blockDirectory, |
| int64 lastSequence, |
| int columnGroupNo) |
| { |
| Relation blkdirRel = blockDirectory->blkdirRel; |
| Relation blkdirIdx = blockDirectory->blkdirIdx; |
| TupleDesc heapTupleDesc; |
| SysScanDesc idxScanDesc; |
| HeapTuple tuple = NULL; |
| MemoryContext oldcxt; |
| int numScanKeys = blockDirectory->numScanKeys; |
| ScanKey scanKeys = blockDirectory->scanKeys; |
| |
| #ifdef USE_ASSERT_CHECKING |
| StrategyNumber *strategyNumbers = blockDirectory->strategyNumbers; |
| #endif /* USE_ASSERT_CHECKING */ |
| |
| Assert(blockDirectory->aoRel != NULL); |
| Assert(blockDirectory->blkdirRel != NULL); |
| Assert(blockDirectory->blkdirIdx != NULL); |
| |
| oldcxt = MemoryContextSwitchTo(blockDirectory->memoryContext); |
| |
| heapTupleDesc = RelationGetDescr(blkdirRel); |
| |
| Assert(numScanKeys == 3); |
| Assert(blockDirectory->currentSegmentFileInfo != NULL); |
| |
| /* Setup the scan keys for the scan. */ |
| Assert(scanKeys != NULL); |
| Assert(strategyNumbers != NULL); |
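| /* Row numbers start at 1, so never probe the index with a key below that. */ |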
| if (lastSequence == 0) |
| lastSequence = 1; |
| |
| scanKeys[0].sk_argument = |
| Int32GetDatum(blockDirectory->currentSegmentFileNum); |
| scanKeys[1].sk_argument = Int32GetDatum(columnGroupNo); |
| scanKeys[2].sk_argument = Int64GetDatum(lastSequence); |
| |
| /* |
| * Search the btree to find the entry in the block directory that contains |
| * the last minipage. |
| */ |
| idxScanDesc = systable_beginscan_ordered(blkdirRel, blkdirIdx, |
| blockDirectory->appendOnlyMetaDataSnapshot, |
| numScanKeys, scanKeys); |
| |
| tuple = systable_getnext_ordered(idxScanDesc, BackwardScanDirection); |
| if (tuple != NULL) |
| { |
| extract_minipage(blockDirectory, |
| tuple, |
| heapTupleDesc, |
| columnGroupNo); |
| } |
| |
| systable_endscan_ordered(idxScanDesc); |
| |
| MemoryContextSwitchTo(oldcxt); |
| |
| ereportif(Debug_appendonly_print_blockdirectory, LOG, |
| (errmsg("Append-only block directory load last minipage: " |
| "(columnGroupNo, lastSequence, nEntries) = (%d, " INT64_FORMAT ", %u)", |
| columnGroupNo, lastSequence, |
| blockDirectory->minipages[columnGroupNo].numMinipageEntries))); |
| |
| } |
| |
| /* |
| * find_minipage_entry |
| * |
| * Find the minipage entry that covers the given rowNum. |
| * If such an entry does not exist, -1 is returned. Otherwise |
| * the index to such an entry in the minipage array is returned. |
| */ |
| static int |
| find_minipage_entry(Minipage *minipage, |
| uint32 numEntries, |
| int64 rowNum) |
| { |
| int start_no, |
| end_no; |
| int entry_no; |
| MinipageEntry *entry; |
| |
| start_no = 0; |
| end_no = numEntries - 1; |
| while (start_no <= end_no) |
| { |
| entry_no = start_no + (end_no - start_no + 1) / 2; |
| Assert(entry_no >= start_no && entry_no <= end_no); |
| |
| entry = &(minipage->entry[entry_no]); |
| |
| Assert(entry->firstRowNum > 0); |
| Assert(entry->rowCount > 0); |
| |
| if (entry->firstRowNum <= rowNum && |
| entry->firstRowNum + entry->rowCount > rowNum) |
| break; |
| else if (entry->firstRowNum > rowNum) |
| { |
| end_no = entry_no - 1; |
| } |
| else |
| { |
| start_no = entry_no + 1; |
| } |
| } |
| |
| if (start_no <= end_no) |
| return entry_no; |
| else |
| return -1; |
| } |
| |
| /* |
| * write_minipage |
| * |
| * Write the in-memory minipage to the block directory relation. |
| */ |
| static void |
| write_minipage(AppendOnlyBlockDirectory *blockDirectory, |
| int columnGroupNo, MinipagePerColumnGroup *minipageInfo) |
| { |
| HeapTuple tuple; |
| MemoryContext oldcxt; |
| Datum *values = blockDirectory->values; |
| bool *nulls = blockDirectory->nulls; |
| Relation blkdirRel = blockDirectory->blkdirRel; |
| CatalogIndexState indinfo = blockDirectory->indinfo; |
| TupleDesc heapTupleDesc = RelationGetDescr(blkdirRel); |
| |
| Assert(minipageInfo->numMinipageEntries > 0); |
| |
| oldcxt = MemoryContextSwitchTo(blockDirectory->memoryContext); |
| |
| Assert(blkdirRel != NULL); |
| |
| values[Anum_pg_aoblkdir_segno - 1] = |
| Int32GetDatum(blockDirectory->currentSegmentFileNum); |
| nulls[Anum_pg_aoblkdir_segno - 1] = false; |
| |
| values[Anum_pg_aoblkdir_columngroupno - 1] = |
| Int32GetDatum(columnGroupNo); |
| nulls[Anum_pg_aoblkdir_columngroupno - 1] = false; |
| |
| values[Anum_pg_aoblkdir_firstrownum - 1] = |
| Int64GetDatum(minipageInfo->minipage->entry[0].firstRowNum); |
| nulls[Anum_pg_aoblkdir_firstrownum - 1] = false; |
| |
| SET_VARSIZE(minipageInfo->minipage, |
| minipage_size(minipageInfo->numMinipageEntries)); |
| minipageInfo->minipage->nEntry = minipageInfo->numMinipageEntries; |
| values[Anum_pg_aoblkdir_minipage - 1] = |
| PointerGetDatum(minipageInfo->minipage); |
| nulls[Anum_pg_aoblkdir_minipage - 1] = false; |
| |
| tuple = heaptuple_form_to(heapTupleDesc, |
| values, |
| nulls, |
| NULL, |
| NULL); |
| |
| /* |
| * Write out the minipage to the block directory relation. If this |
| * minipage is already in the relation, we update the row. Otherwise, a |
| * new row is inserted. |
| */ |
| if (ItemPointerIsValid(&minipageInfo->tupleTid)) |
| { |
| ereportif(Debug_appendonly_print_blockdirectory, LOG, |
| (errmsg("Append-only block directory update a minipage: " |
| "(segno, columnGroupNo, nEntries, firstRowNum) = " |
| "(%d, %d, %u, " INT64_FORMAT ")", |
| blockDirectory->currentSegmentFileNum, |
| columnGroupNo, minipageInfo->numMinipageEntries, |
| minipageInfo->minipage->entry[0].firstRowNum))); |
| |
| CatalogTupleUpdateWithInfo(blkdirRel, &minipageInfo->tupleTid, tuple, |
| indinfo); |
| } |
| else |
| { |
| ereportif(Debug_appendonly_print_blockdirectory, LOG, |
| (errmsg("Append-only block directory insert a minipage: " |
| "(segno, columnGroupNo, nEntries, firstRowNum) = " |
| "(%d, %d, %u, " INT64_FORMAT ")", |
| blockDirectory->currentSegmentFileNum, |
| columnGroupNo, minipageInfo->numMinipageEntries, |
| minipageInfo->minipage->entry[0].firstRowNum))); |
| |
| CatalogTupleInsertWithInfo(blkdirRel, tuple, indinfo); |
| } |
| |
| /* remember the TID of the updated/inserted tuple */ |
| ItemPointerCopy(&tuple->t_self, &minipageInfo->tupleTid); |
| |
| heap_freetuple(tuple); |
| |
| MemoryContextSwitchTo(oldcxt); |
| } |
| |
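| /* |
| * clear_minipage |
| * |
| * Reset the in-memory minipage for a column group: zero out its entries and |
| * invalidate the cached tuple TID, so that the next write_minipage() inserts |
| * a new block directory row instead of updating the previous one. |
| */ |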
| static void |
| clear_minipage(MinipagePerColumnGroup *minipagePerColumnGroup) |
| { |
| MemSet(minipagePerColumnGroup->minipage->entry, 0, |
| minipagePerColumnGroup->numMinipageEntries * sizeof(MinipageEntry)); |
| minipagePerColumnGroup->numMinipageEntries = 0; |
| ItemPointerSetInvalid(&minipagePerColumnGroup->tupleTid); |
| } |
| |
| /* |
| * AppendOnlyBlockDirectory_InsertPlaceholder |
| * |
| * We perform uniqueness checks by looking up block directory rows that cover |
| * the rowNum indicated by the aotid obtained from the index. See |
| * AppendOnlyBlockDirectory_CoversTuple() for details. |
| * |
| * However, there are multiple time windows in which there are no covering block |
| * directory entries in the table for already inserted data rows. Such a time |
| * window starts when a data row is inserted and lasts until the block |
| * directory row covering it is written to the block directory table (see |
| * write_minipage()). Block directory rows are written only when: |
| * (i) the current in-memory minipage is full |
| * (ii) at end of command. |
| * |
| * So we insert a placeholder entry in the current block directory row and |
| * persist the row before the first insert to cover rows in the range: |
| * [firstRowNum, lastRowNum], starting at fileOffset in the relfile |
| * corresponding to columnGroupNo. |
| * |
| * firstRowNum is the rowNum assigned to the 1st insert of the insert command. |
| * lastRowNum is the last rowNum that will be entered by the insert command, |
| * which is something unknown to us. So, to cover all such windows during the |
| * insert command's execution, we insert an entry with a placeholder |
| * rowcount = AOTupleId_MaxRowNum into the current minipage and write it to the |
| * relation (by reusing the machinery in write_minipage()). Such a row whose |
| * last entry is a placeholder entry is called a placeholder row. This entry |
| * will cover up to lastRowNum, whatever its value may be, for all such time |
| * windows during the insert command. |
| * |
| * Safety: |
| * (1) The placeholder upper bound is not a concern as this row will be consulted |
| * ONLY by SNAPSHOT_DIRTY (for uniqueness checks) and will be ignored by regular |
| * MVCC processing (for index scans). Eventually, it will be rendered invisible |
| * as it will be updated by a subsequent write_minipage() or by virtue of abort. |
| * |
| * (2) A placeholder row cannot cause spurious conflicts, in spite of its |
| * loose upper bound, in the segment file to which it maps. This is because |
| * no rows can be inserted into a segment file other than by the insert |
| * operation that is currently in progress on the file. |
| */ |
| void |
| AppendOnlyBlockDirectory_InsertPlaceholder(AppendOnlyBlockDirectory *blockDirectory, |
| int64 firstRowNum, |
| int64 fileOffset, |
| int columnGroupNo) |
| { |
| MinipagePerColumnGroup *minipagePerColumnGroup; |
| |
| Assert(firstRowNum > 0); |
| Assert(fileOffset >= 0); |
| Assert(RelationIsValid(blockDirectory->blkdirRel)); |
| Assert(columnGroupNo >= 0 && |
| columnGroupNo < blockDirectory->aoRel->rd_att->natts); |
| |
| minipagePerColumnGroup = &blockDirectory->minipages[columnGroupNo]; |
| |
| /* insert placeholder entry with a max row count */ |
| insert_new_entry(blockDirectory, columnGroupNo, firstRowNum, fileOffset, |
| AOTupleId_MaxRowNum, false); |
| /* insert placeholder row containing placeholder entry */ |
| write_minipage(blockDirectory, columnGroupNo, minipagePerColumnGroup); |
| /* |
| * Delete the placeholder entry as it has no business being in memory. |
| * Removing it from the current minipage will make rest of the processing |
| * for the current command behave as if it never existed. The absence of |
| * this entry will help effectively "update" it once its replacement entry |
| * is created in memory, in a subsequent call to insert_new_entry(), |
| * followed by a write_minipage() which will make this "update" persistent. |
| */ |
| minipagePerColumnGroup->numMinipageEntries--; |
| /* |
| * Increment the command counter, as we will be updating this temp row later |
| * on in write_minipage(). |
| */ |
| CommandCounterIncrement(); |
| } |
| |
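| /* |
| * AppendOnlyBlockDirectory_End_forInsert |
| * |
| * Write out any remaining in-memory minipages, free the per-column-group |
| * state, and close the block directory relation and its index. |
| */ |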
| void |
| AppendOnlyBlockDirectory_End_forInsert( |
| AppendOnlyBlockDirectory *blockDirectory) |
| { |
| int groupNo; |
| |
| if (blockDirectory->blkdirRel == NULL || |
| blockDirectory->blkdirIdx == NULL) |
| return; |
| for (groupNo = 0; groupNo < blockDirectory->numColumnGroups; groupNo++) |
| { |
| MinipagePerColumnGroup *minipageInfo = |
| &blockDirectory->minipages[groupNo]; |
| |
| if (minipageInfo->numMinipageEntries > 0) |
| { |
| write_minipage(blockDirectory, groupNo, minipageInfo); |
| ereportif(Debug_appendonly_print_blockdirectory, LOG, |
| (errmsg("Append-only block directory end of insert write minipage: " |
| "(columnGroupNo, nEntries) = (%d, %u)", |
| groupNo, minipageInfo->numMinipageEntries))); |
| } |
| |
| pfree(minipageInfo->minipage); |
| } |
| |
| ereportif(Debug_appendonly_print_blockdirectory, LOG, |
| (errmsg("Append-only block directory end for insert: " |
| "(segno, numColumnGroups, isAOCol)=" |
| "(%d, %d, %d)", |
| blockDirectory->currentSegmentFileNum, |
| blockDirectory->numColumnGroups, |
| blockDirectory->isAOCol))); |
| |
| pfree(blockDirectory->values); |
| pfree(blockDirectory->nulls); |
| pfree(blockDirectory->minipages); |
| pfree(blockDirectory->scanKeys); |
| pfree(blockDirectory->strategyNumbers); |
| |
| index_close(blockDirectory->blkdirIdx, RowExclusiveLock); |
| heap_close(blockDirectory->blkdirRel, RowExclusiveLock); |
| CatalogCloseIndexes(blockDirectory->indinfo); |
| |
| MemoryContextDelete(blockDirectory->memoryContext); |
| } |
| |
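| /* |
| * AppendOnlyBlockDirectory_End_forSearch |
| * |
| * Free the lookup state set up by AppendOnlyBlockDirectory_Init_forSearch() |
| * and close the block directory relation and its index. |
| */ |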
| void |
| AppendOnlyBlockDirectory_End_forSearch( |
| AppendOnlyBlockDirectory *blockDirectory) |
| { |
| int groupNo; |
| |
| if (blockDirectory->blkdirRel == NULL) |
| return; |
| |
| for (groupNo = 0; groupNo < blockDirectory->numColumnGroups; groupNo++) |
| { |
| if (blockDirectory->minipages[groupNo].minipage != NULL) |
| pfree(blockDirectory->minipages[groupNo].minipage); |
| } |
| |
| ereportif(Debug_appendonly_print_blockdirectory, LOG, |
| (errmsg("Append-only block directory end for search: " |
| "(totalSegfiles, numColumnGroups, isAOCol)=" |
| "(%d, %d, %d)", |
| blockDirectory->totalSegfiles, |
| blockDirectory->numColumnGroups, |
| blockDirectory->isAOCol))); |
| |
| pfree(blockDirectory->values); |
| pfree(blockDirectory->nulls); |
| pfree(blockDirectory->minipages); |
| pfree(blockDirectory->scanKeys); |
| pfree(blockDirectory->strategyNumbers); |
| |
| if (blockDirectory->blkdirIdx) |
| index_close(blockDirectory->blkdirIdx, AccessShareLock); |
| heap_close(blockDirectory->blkdirRel, AccessShareLock); |
| |
| MemoryContextDelete(blockDirectory->memoryContext); |
| } |
| |
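| /* |
| * AppendOnlyBlockDirectory_End_addCol |
| * |
| * Write out any remaining in-memory minipages for the newly added columns |
| * and close the block directory relation and its index, deferring lock |
| * release to the end of the transaction. |
| */ |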
| void |
| AppendOnlyBlockDirectory_End_addCol( |
| AppendOnlyBlockDirectory *blockDirectory) |
| { |
| int groupNo; |
| |
| /* newly added columns have attribute number beginning with this */ |
| AttrNumber colno = blockDirectory->aoRel->rd_att->natts - |
| blockDirectory->numColumnGroups; |
| |
| if (blockDirectory->blkdirRel == NULL || |
| blockDirectory->blkdirIdx == NULL) |
| return; |
| for (groupNo = 0; groupNo < blockDirectory->numColumnGroups; groupNo++) |
| { |
| MinipagePerColumnGroup *minipageInfo = |
| &blockDirectory->minipages[groupNo]; |
| |
| if (minipageInfo->numMinipageEntries > 0) |
| { |
| write_minipage(blockDirectory, groupNo + colno, minipageInfo); |
| ereportif(Debug_appendonly_print_blockdirectory, LOG, |
| (errmsg("Append-only block directory end of insert write" |
| " minipage: (columnGroupNo, nEntries) = (%d, %u)", |
| groupNo, minipageInfo->numMinipageEntries))); |
| } |
| pfree(minipageInfo->minipage); |
| } |
| |
| ereportif(Debug_appendonly_print_blockdirectory, LOG, |
| (errmsg("Append-only block directory end for insert: " |
| "(segno, numColumnGroups, isAOCol)=" |
| "(%d, %d, %d)", |
| blockDirectory->currentSegmentFileNum, |
| blockDirectory->numColumnGroups, |
| blockDirectory->isAOCol))); |
| |
| pfree(blockDirectory->values); |
| pfree(blockDirectory->nulls); |
| pfree(blockDirectory->minipages); |
| pfree(blockDirectory->scanKeys); |
| pfree(blockDirectory->strategyNumbers); |
| |
| /* |
| * We already hold transaction-scope exclusive lock on the AOCS relation. |
| * Let's defer release of locks on block directory as well until the end |
| * of alter-table transaction. |
| */ |
| index_close(blockDirectory->blkdirIdx, NoLock); |
| heap_close(blockDirectory->blkdirRel, NoLock); |
| CatalogCloseIndexes(blockDirectory->indinfo); |
| |
| MemoryContextDelete(blockDirectory->memoryContext); |
| } |
| |
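| /* |
| * AppendOnlyBlockDirectory_End_forUniqueChecks |
| * |
| * Release the state set up by |
| * AppendOnlyBlockDirectory_Init_forUniqueChecks() and close the block |
| * directory relation and its index. |
| */ |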
| void |
| AppendOnlyBlockDirectory_End_forUniqueChecks(AppendOnlyBlockDirectory *blockDirectory) |
| { |
| Assert(RelationIsValid(blockDirectory->blkdirRel)); |
| |
| /* This must have been reset after each uniqueness check */ |
| Assert(blockDirectory->appendOnlyMetaDataSnapshot == InvalidSnapshot); |
| |
| Assert(RelationIsValid(blockDirectory->blkdirIdx)); |
| Assert(RelationIsValid(blockDirectory->blkdirRel)); |
| |
| ereportif(Debug_appendonly_print_blockdirectory, LOG, |
| (errmsg("Append-only block directory end for unique checks"), |
| errdetail("(aoRel = %u, blkdirrel = %u, blkdiridxrel = %u)", |
| blockDirectory->aoRel->rd_id, |
| blockDirectory->blkdirRel->rd_id, |
| blockDirectory->blkdirIdx->rd_id))); |
| |
| index_close(blockDirectory->blkdirIdx, AccessShareLock); |
| heap_close(blockDirectory->blkdirRel, AccessShareLock); |
| |
| MemoryContextDelete(blockDirectory->memoryContext); |
| } |