| /* |
| * brin.c |
| * Implementation of BRIN indexes for Postgres |
| * |
| * See src/backend/access/brin/README for details. |
| * |
| * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * IDENTIFICATION |
| * src/backend/access/brin/brin.c |
| * |
| * TODO |
| * * ScalarArrayOpExpr (amsearcharray -> SK_SEARCHARRAY) |
| */ |
| #include "postgres.h" |
| |
| #include "access/aosegfiles.h" |
| #include "access/aocssegfiles.h" |
| #include "access/brin.h" |
| #include "access/brin_page.h" |
| #include "access/brin_pageops.h" |
| #include "access/brin_xlog.h" |
| #include "access/relation.h" |
| #include "access/reloptions.h" |
| #include "access/relscan.h" |
| #include "access/table.h" |
| #include "access/tableam.h" |
| #include "access/xloginsert.h" |
| #include "catalog/index.h" |
| #include "catalog/pg_am.h" |
| #include "commands/vacuum.h" |
| #include "miscadmin.h" |
| #include "pgstat.h" |
| #include "postmaster/autovacuum.h" |
| #include "storage/bufmgr.h" |
| #include "storage/freespace.h" |
| #include "utils/acl.h" |
| #include "utils/builtins.h" |
| #include "utils/datum.h" |
| #include "utils/fmgrprotos.h" |
| #include "utils/index_selfuncs.h" |
| #include "utils/memutils.h" |
| #include "utils/rel.h" |
| |
| /* GPDB includes */ |
| #include "storage/procarray.h" |
| #include "utils/faultinjector.h" |
| |
| /* |
| * We use a BrinBuildState during initial construction of a BRIN index. |
| * The running state is kept in a BrinMemTuple. |
| */ |
| typedef struct BrinBuildState |
| { |
| Relation bs_irel; |
| int bs_numtuples; |
| Buffer bs_currentInsertBuf; |
| BlockNumber bs_pagesPerRange; |
| BlockNumber bs_currRangeStart; |
| BrinRevmap *bs_rmAccess; |
| BrinDesc *bs_bdesc; |
| BrinMemTuple *bs_dtuple; |
| |
| /* GPDB specific state for AO/CO tables */ |
| |
| bool bs_isAO; |
| /* Have we incorporated even one data tuple into the build state? */ |
| bool bs_aoHasDataTuple; |
| } BrinBuildState; |
| |
| /* |
| * Struct used as "opaque" during index scans |
| */ |
| typedef struct BrinOpaque |
| { |
| BlockNumber bo_pagesPerRange; |
| BrinRevmap *bo_rmAccess; |
| BrinDesc *bo_bdesc; |
| } BrinOpaque; |
| |
| #define BRIN_ALL_BLOCKRANGES InvalidBlockNumber |
| |
| static BrinBuildState * |
| initialize_brin_buildstate(Relation idxRel, |
| BrinRevmap *revmap, |
| BlockNumber pagesPerRange, |
| bool isAO); |
| static void terminate_brin_buildstate(BrinBuildState *state); |
| static void brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange, |
| bool include_partial, double *numSummarized, double *numExisting); |
| static void form_and_insert_tuple(BrinBuildState *state); |
| static void union_tuples(BrinDesc *bdesc, BrinMemTuple *a, |
| BrinTuple *b); |
| static void brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy); |
| static bool add_values_to_range(Relation idxRel, BrinDesc *bdesc, |
| BrinMemTuple *dtup, Datum *values, bool *nulls); |
| static bool check_null_keys(BrinValues *bval, ScanKey *nullkeys, int nnullkeys); |
| |
| /* |
| * BRIN handler function: return IndexAmRoutine with access method parameters |
| * and callbacks. |
| */ |
| Datum |
| brinhandler(PG_FUNCTION_ARGS) |
| { |
| IndexAmRoutine *amroutine = makeNode(IndexAmRoutine); |
| |
| amroutine->amstrategies = 0; |
| amroutine->amsupport = BRIN_LAST_OPTIONAL_PROCNUM; |
| amroutine->amoptsprocnum = BRIN_PROCNUM_OPTIONS; |
| amroutine->amcanorder = false; |
| amroutine->amcanorderbyop = false; |
| amroutine->amcanbackward = false; |
| amroutine->amcanunique = false; |
| amroutine->amcanmulticol = true; |
| amroutine->amoptionalkey = true; |
| amroutine->amsearcharray = false; |
| amroutine->amsearchnulls = true; |
| amroutine->amstorage = true; |
| amroutine->amclusterable = false; |
| amroutine->ampredlocks = false; |
| amroutine->amcanparallel = false; |
| amroutine->amcaninclude = false; |
| amroutine->amusemaintenanceworkmem = false; |
| amroutine->amparallelvacuumoptions = |
| VACUUM_OPTION_PARALLEL_CLEANUP; |
| amroutine->amkeytype = InvalidOid; |
| |
| amroutine->ambuild = brinbuild; |
| amroutine->ambuildempty = brinbuildempty; |
| amroutine->aminsert = brininsert; |
| amroutine->ambulkdelete = brinbulkdelete; |
| amroutine->amvacuumcleanup = brinvacuumcleanup; |
| amroutine->amcanreturn = NULL; |
| amroutine->amcostestimate = brincostestimate; |
| amroutine->amoptions = brinoptions; |
| amroutine->amproperty = NULL; |
| amroutine->ambuildphasename = NULL; |
| amroutine->amvalidate = brinvalidate; |
| amroutine->amadjustmembers = NULL; |
| amroutine->ambeginscan = brinbeginscan; |
| amroutine->amrescan = brinrescan; |
| amroutine->amgettuple = NULL; |
| amroutine->amgetbitmap = bringetbitmap; |
| amroutine->amendscan = brinendscan; |
| amroutine->ammarkpos = NULL; |
| amroutine->amrestrpos = NULL; |
| amroutine->amestimateparallelscan = NULL; |
| amroutine->aminitparallelscan = NULL; |
| amroutine->amparallelrescan = NULL; |
| |
| PG_RETURN_POINTER(amroutine); |
| } |
| |
| /* |
| * A tuple in the heap is being inserted. To keep a brin index up to date, |
| * we need to obtain the relevant index tuple and compare its stored values |
| * with those of the new tuple. If the tuple values are not consistent with |
| * the summary tuple, we need to update the index tuple. |
| * |
| * If autosummarization is enabled, check if we need to summarize the previous |
| * page range. |
| * |
| * If the range is not currently summarized (i.e. the revmap returns NULL for |
| * it), there's nothing to do for this tuple. |
| */ |
| bool |
| brininsert(Relation idxRel, Datum *values, bool *nulls, |
| ItemPointer heaptid, Relation heapRel, |
| IndexUniqueCheck checkUnique, |
| bool indexUnchanged, |
| IndexInfo *indexInfo) |
| { |
| BlockNumber pagesPerRange; |
| BlockNumber origHeapBlk; |
| BlockNumber heapBlk; |
| BrinDesc *bdesc = (BrinDesc *) indexInfo->ii_AmCache; |
| BrinRevmap *revmap; |
| Buffer buf = InvalidBuffer; |
| MemoryContext tupcxt = NULL; |
| MemoryContext oldcxt = CurrentMemoryContext; |
| bool autosummarize = BrinGetAutoSummarize(idxRel); |
| |
| /* |
| * GPDB: XXX: We initialize the revmap per-tuple. This routine has |
| * non-trivial CPU overhead (including a snapshot test and meta-page lock) |
| * Also, there is definitely memory overhead (even more so for GPDB, due to |
| * the added AO/CO specific state) |
| * |
| * Can we cache the access struct somehow, maybe in BrinDesc (as |
| * part of IndexInfo->ii_AmCache)? Both heap tables and AO/CO tables can |
| * definitely benefit from it. There might be concurrency concerns, however. |
| */ |
| revmap = brinRevmapInitialize(idxRel, &pagesPerRange, NULL); |
| |
| /* |
| * origHeapBlk is the block number where the insertion occurred. heapBlk |
| * is the first block in the corresponding page range. |
| */ |
| origHeapBlk = ItemPointerGetBlockNumber(heaptid); |
| heapBlk = brin_range_start_blk(origHeapBlk, |
| RelationIsAppendOptimized(heapRel), |
| pagesPerRange); |
| |
| /* |
| * GPDB: Due to the appendonly nature of AO/CO tables, we would always write |
| * to the last logical heap block within a block sequence (due to |
| * monotonically increasing gp_fastsequence allocations). Thus, unlike for |
| * heap, blocks other than the last block would never be summarized as a |
| * result of an insert. |
| * |
| * This holds true even for INSERTs following a VACUUM on a given segment, |
| * since VACUUM does not reset gp_fastsequence on the VACUUMed segment. |
| * |
| * So, we can safely position the revmap iterator at the end of the chain |
| * (instead of traversing the chain unnecessarily from the front). |
| */ |
| if (RelationIsAppendOptimized(heapRel)) |
| brinRevmapAOPositionAtEnd(revmap, AOSegmentGet_blockSequenceNum(heapBlk)); |
| |
| for (;;) |
| { |
| bool need_insert = false; |
| OffsetNumber off; |
| BrinTuple *brtup; |
| BrinMemTuple *dtup; |
| |
| CHECK_FOR_INTERRUPTS(); |
| |
| /* |
| * If auto-summarization is enabled and we just inserted the first |
| * tuple into the first block of a new non-first page range, request a |
| * summarization run of the previous range. |
| */ |
| if (autosummarize && |
| heapBlk > 0 && |
| heapBlk == origHeapBlk && |
| ItemPointerGetOffsetNumber(heaptid) == FirstOffsetNumber) |
| { |
| BlockNumber lastPageRange = heapBlk - 1; |
| BrinTuple *lastPageTuple; |
| |
| lastPageTuple = |
| brinGetTupleForHeapBlock(revmap, lastPageRange, &buf, &off, |
| NULL, BUFFER_LOCK_SHARE, NULL); |
| if (!lastPageTuple) |
| { |
| bool recorded; |
| |
| recorded = AutoVacuumRequestWork(AVW_BRINSummarizeRange, |
| RelationGetRelid(idxRel), |
| lastPageRange); |
| if (!recorded) |
| ereport(LOG, |
| (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
| errmsg("request for BRIN range summarization for index \"%s\" page %u was not recorded", |
| RelationGetRelationName(idxRel), |
| lastPageRange))); |
| } |
| else |
| LockBuffer(buf, BUFFER_LOCK_UNLOCK); |
| } |
| |
| brtup = brinGetTupleForHeapBlock(revmap, heapBlk, &buf, &off, |
| NULL, BUFFER_LOCK_SHARE, NULL); |
| |
| /* if range is unsummarized, there's nothing to do */ |
| if (!brtup) |
| break; |
| |
| /* First time through in this statement? */ |
| if (bdesc == NULL) |
| { |
| MemoryContextSwitchTo(indexInfo->ii_Context); |
| bdesc = brin_build_desc(idxRel); |
| indexInfo->ii_AmCache = (void *) bdesc; |
| MemoryContextSwitchTo(oldcxt); |
| } |
| /* First time through in this brininsert call? */ |
| if (tupcxt == NULL) |
| { |
| tupcxt = AllocSetContextCreate(CurrentMemoryContext, |
| "brininsert cxt", |
| ALLOCSET_DEFAULT_SIZES); |
| MemoryContextSwitchTo(tupcxt); |
| } |
| |
| dtup = brin_deform_tuple(bdesc, brtup, NULL); |
| |
| need_insert = add_values_to_range(idxRel, bdesc, dtup, values, nulls); |
| |
| if (!need_insert) |
| { |
| /* |
| * The tuple is consistent with the new values, so there's nothing |
| * to do. |
| */ |
| LockBuffer(buf, BUFFER_LOCK_UNLOCK); |
| } |
| else |
| { |
| Page page = BufferGetPage(buf); |
| ItemId lp = PageGetItemId(page, off); |
| Size origsz; |
| BrinTuple *origtup; |
| Size newsz; |
| BrinTuple *newtup; |
| bool samepage; |
| |
| /* |
| * Make a copy of the old tuple, so that we can compare it after |
| * re-acquiring the lock. |
| */ |
| origsz = ItemIdGetLength(lp); |
| origtup = brin_copy_tuple(brtup, origsz, NULL, NULL); |
| |
| /* |
| * Before releasing the lock, check if we can attempt a same-page |
| * update. Another process could insert a tuple concurrently in |
| * the same page though, so downstream we must be prepared to cope |
| * if this turns out to not be possible after all. |
| */ |
| newtup = brin_form_tuple(bdesc, heapBlk, dtup, &newsz); |
| samepage = brin_can_do_samepage_update(buf, origsz, newsz); |
| LockBuffer(buf, BUFFER_LOCK_UNLOCK); |
| |
| /* |
| * Try to update the tuple. If this doesn't work for whatever |
| * reason, we need to restart from the top; the revmap might be |
| * pointing at a different tuple for this block now, so we need to |
| * recompute to ensure both our new heap tuple and the other |
| * inserter's are covered by the combined tuple. It might be that |
| * we don't need to update at all. |
| */ |
| if (!brin_doupdate(idxRel, pagesPerRange, revmap, heapBlk, |
| buf, off, origtup, origsz, newtup, newsz, |
| samepage)) |
| { |
| /* no luck; start over */ |
| MemoryContextResetAndDeleteChildren(tupcxt); |
| continue; |
| } |
| } |
| |
| /* success! */ |
| break; |
| } |
| |
| brinRevmapTerminate(revmap); |
| if (BufferIsValid(buf)) |
| ReleaseBuffer(buf); |
| MemoryContextSwitchTo(oldcxt); |
| if (tupcxt != NULL) |
| MemoryContextDelete(tupcxt); |
| |
| return false; |
| } |
| |
| /* |
| * Initialize state for a BRIN index scan. |
| * |
| * We read the metapage here to determine the pages-per-range number that this |
| * index was built with. Note that since this cannot be changed while we're |
| * holding lock on index, it's not necessary to recompute it during brinrescan. |
| */ |
| IndexScanDesc |
| brinbeginscan(Relation r, int nkeys, int norderbys) |
| { |
| IndexScanDesc scan; |
| BrinOpaque *opaque; |
| |
| scan = RelationGetIndexScan(r, nkeys, norderbys); |
| |
| opaque = (BrinOpaque *) palloc(sizeof(BrinOpaque)); |
| opaque->bo_rmAccess = brinRevmapInitialize(r, &opaque->bo_pagesPerRange, |
| scan->xs_snapshot); |
| opaque->bo_bdesc = brin_build_desc(r); |
| scan->opaque = opaque; |
| |
| return scan; |
| } |
| |
| /* |
| * Execute the index scan. |
| * |
| * This works by reading index TIDs from the revmap, and obtaining the index |
| * tuples pointed to by them; the summary values in the index tuples are |
| * compared to the scan keys. We return into the TID bitmap all the pages in |
| * ranges corresponding to index tuples that match the scan keys. |
| * |
| * If a TID from the revmap is read as InvalidTID, we know that range is |
| * unsummarized. Pages in those ranges need to be returned regardless of scan |
| * keys. |
| */ |
| int64 |
| bringetbitmap(IndexScanDesc scan, Node **bmNodeP) |
| { |
| TIDBitmap *tbm; |
| Relation idxRel = scan->indexRelation; |
| Buffer buf = InvalidBuffer; |
| BrinDesc *bdesc; |
| Oid heapOid; |
| Relation heapRel; |
| BrinOpaque *opaque; |
| BlockNumber heapBlk; |
| int totalpages = 0; |
| FmgrInfo *consistentFn; |
| MemoryContext oldcxt; |
| MemoryContext perRangeCxt; |
| BrinMemTuple *dtup; |
| BrinTuple *btup = NULL; |
| Size btupsz = 0; |
| ScanKey **keys, |
| **nullkeys; |
| int *nkeys, |
| *nnullkeys; |
| int keyno; |
| char *ptr; |
| Size len; |
| char *tmp PG_USED_FOR_ASSERTS_ONLY; |
| |
| /* GPDB: Used for iterating over the revmap */ |
| int numSequences; |
| BlockSequence *sequences; |
| |
| opaque = (BrinOpaque *) scan->opaque; |
| bdesc = opaque->bo_bdesc; |
| pgstat_count_index_scan(idxRel); |
| |
| /* |
| * GPDB specific code. Since GPDB also support StreamBitmap |
| * in bitmap index. So normally we need to create specific bitmap |
| * node in the amgetbitmap AM. |
| */ |
| Assert(bmNodeP); |
| if (*bmNodeP == NULL) |
| { |
| /* XXX should we use less than work_mem for this? */ |
| tbm = tbm_create(work_mem * 1024L, scan->dsa); |
| *bmNodeP = (Node *) tbm; |
| } |
| else if (!IsA(*bmNodeP, TIDBitmap)) |
| elog(ERROR, "non brin bitmap"); |
| else |
| tbm = (TIDBitmap *)*bmNodeP; |
| |
| heapOid = IndexGetRelation(RelationGetRelid(idxRel), false); |
| heapRel = table_open(heapOid, AccessShareLock); |
| sequences = table_relation_get_block_sequences(heapRel, |
| &numSequences); |
| table_close(heapRel, AccessShareLock); |
| |
| /* |
| * Make room for the consistent support procedures of indexed columns. We |
| * don't look them up here; we do that lazily the first time we see a scan |
| * key reference each of them. We rely on zeroing fn_oid to InvalidOid. |
| */ |
| consistentFn = palloc0(sizeof(FmgrInfo) * bdesc->bd_tupdesc->natts); |
| |
| /* |
| * Make room for per-attribute lists of scan keys that we'll pass to the |
| * consistent support procedure. We don't know which attributes have scan |
| * keys, so we allocate space for all attributes. That may use more memory |
| * but it's probably cheaper than determining which attributes are used. |
| * |
| * We keep null and regular keys separate, so that we can pass just the |
| * regular keys to the consistent function easily. |
| * |
| * To reduce the allocation overhead, we allocate one big chunk and then |
| * carve it into smaller arrays ourselves. All the pieces have exactly the |
| * same lifetime, so that's OK. |
| * |
| * XXX The widest index can have 32 attributes, so the amount of wasted |
| * memory is negligible. We could invent a more compact approach (with |
| * just space for used attributes) but that would make the matching more |
| * complex so it's not a good trade-off. |
| */ |
| len = |
| MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts) + /* regular keys */ |
| MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys) * bdesc->bd_tupdesc->natts + |
| MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts) + |
| MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts) + /* NULL keys */ |
| MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys) * bdesc->bd_tupdesc->natts + |
| MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts); |
| |
| ptr = palloc(len); |
| tmp = ptr; |
| |
| keys = (ScanKey **) ptr; |
| ptr += MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts); |
| |
| nullkeys = (ScanKey **) ptr; |
| ptr += MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts); |
| |
| nkeys = (int *) ptr; |
| ptr += MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts); |
| |
| nnullkeys = (int *) ptr; |
| ptr += MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts); |
| |
| for (int i = 0; i < bdesc->bd_tupdesc->natts; i++) |
| { |
| keys[i] = (ScanKey *) ptr; |
| ptr += MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys); |
| |
| nullkeys[i] = (ScanKey *) ptr; |
| ptr += MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys); |
| } |
| |
| Assert(tmp + len == ptr); |
| |
| /* zero the number of keys */ |
| memset(nkeys, 0, sizeof(int) * bdesc->bd_tupdesc->natts); |
| memset(nnullkeys, 0, sizeof(int) * bdesc->bd_tupdesc->natts); |
| |
| /* Preprocess the scan keys - split them into per-attribute arrays. */ |
| for (keyno = 0; keyno < scan->numberOfKeys; keyno++) |
| { |
| ScanKey key = &scan->keyData[keyno]; |
| AttrNumber keyattno = key->sk_attno; |
| |
| /* |
| * The collation of the scan key must match the collation used in the |
| * index column (but only if the search is not IS NULL/ IS NOT NULL). |
| * Otherwise we shouldn't be using this index ... |
| */ |
| Assert((key->sk_flags & SK_ISNULL) || |
| (key->sk_collation == |
| TupleDescAttr(bdesc->bd_tupdesc, |
| keyattno - 1)->attcollation)); |
| |
| /* |
| * First time we see this index attribute, so init as needed. |
| * |
| * This is a bit of an overkill - we don't know how many scan keys are |
| * there for this attribute, so we simply allocate the largest number |
| * possible (as if all keys were for this attribute). This may waste a |
| * bit of memory, but we only expect small number of scan keys in |
| * general, so this should be negligible, and repeated repalloc calls |
| * are not free either. |
| */ |
| if (consistentFn[keyattno - 1].fn_oid == InvalidOid) |
| { |
| FmgrInfo *tmp; |
| |
| /* First time we see this attribute, so no key/null keys. */ |
| Assert(nkeys[keyattno - 1] == 0); |
| Assert(nnullkeys[keyattno - 1] == 0); |
| |
| tmp = index_getprocinfo(idxRel, keyattno, |
| BRIN_PROCNUM_CONSISTENT); |
| fmgr_info_copy(&consistentFn[keyattno - 1], tmp, |
| CurrentMemoryContext); |
| } |
| |
| /* Add key to the proper per-attribute array. */ |
| if (key->sk_flags & SK_ISNULL) |
| { |
| nullkeys[keyattno - 1][nnullkeys[keyattno - 1]] = key; |
| nnullkeys[keyattno - 1]++; |
| } |
| else |
| { |
| keys[keyattno - 1][nkeys[keyattno - 1]] = key; |
| nkeys[keyattno - 1]++; |
| } |
| } |
| |
| /* allocate an initial in-memory tuple, out of the per-range memcxt */ |
| dtup = brin_new_memtuple(bdesc); |
| |
| /* |
| * Setup and use a per-range memory context, which is reset every time we |
| * loop below. This avoids having to free the tuples within the loop. |
| */ |
| perRangeCxt = AllocSetContextCreate(CurrentMemoryContext, |
| "bringetbitmap cxt", |
| ALLOCSET_DEFAULT_SIZES); |
| oldcxt = MemoryContextSwitchTo(perRangeCxt); |
| |
| /* |
| * GPDB: We have the notion of BlockSequences to keep the following code |
| * section unified for AO/CO vs heap tables. Heap tables have only 1 |
| * block sequence, whereas AO/CO tables may have up to AOTupleId_MaxSegmentFileNum |
| * number of such sequences. The outer loop is thus a GPDB addition, whereas |
| * the inner one mostly stays the same (barring offset recalculation) |
| */ |
| for (int i = 0; i < numSequences; i++) |
| { |
| /* code in the loop left unindented to prevent merge conflicts */ |
| |
| /* |
| * Now scan the revmap. We start by querying for the 1st heap page in |
| * the ith block sequence, incrementing by the number of pages per range; |
| * this gives us a full view of each block sequence and ultimately, the |
| * full table. |
| */ |
| BlockNumber startblknum = sequences[i].startblknum; |
| BlockNumber endblknum = sequences[i].startblknum + sequences[i].nblocks; |
| int currseq = AOSegmentGet_blockSequenceNum(startblknum); |
| |
| if (RelationIsAppendOptimized(heapRel)) |
| brinRevmapAOPositionAtStart(opaque->bo_rmAccess, currseq); |
| |
| for (heapBlk = startblknum; heapBlk < endblknum; heapBlk += opaque->bo_pagesPerRange) |
| { |
| bool addrange; |
| bool gottuple = false; |
| BrinTuple *tup; |
| OffsetNumber off; |
| Size size; |
| |
| CHECK_FOR_INTERRUPTS(); |
| |
| MemoryContextResetAndDeleteChildren(perRangeCxt); |
| |
| tup = brinGetTupleForHeapBlock(opaque->bo_rmAccess, heapBlk, &buf, |
| &off, &size, BUFFER_LOCK_SHARE, |
| scan->xs_snapshot); |
| if (tup) |
| { |
| gottuple = true; |
| btup = brin_copy_tuple(tup, size, btup, &btupsz); |
| LockBuffer(buf, BUFFER_LOCK_UNLOCK); |
| } |
| |
| /* |
| * For page ranges with no indexed tuple, we must return the whole |
| * range; otherwise, compare it to the scan keys. |
| */ |
| if (!gottuple) |
| { |
| addrange = true; |
| } |
| else |
| { |
| dtup = brin_deform_tuple(bdesc, btup, dtup); |
| if (dtup->bt_placeholder) |
| { |
| /* |
| * Placeholder tuples are always returned, regardless of the |
| * values stored in them. |
| */ |
| addrange = true; |
| } |
| else |
| { |
| int attno; |
| |
| /* |
| * Compare scan keys with summary values stored for the range. |
| * If scan keys are matched, the page range must be added to |
| * the bitmap. We initially assume the range needs to be |
| * added; in particular this serves the case where there are |
| * no keys. |
| */ |
| addrange = true; |
| for (attno = 1; attno <= bdesc->bd_tupdesc->natts; attno++) |
| { |
| BrinValues *bval; |
| Datum add; |
| Oid collation; |
| |
| /* |
| * skip attributes without any scan keys (both regular and |
| * IS [NOT] NULL) |
| */ |
| if (nkeys[attno - 1] == 0 && nnullkeys[attno - 1] == 0) |
| continue; |
| |
| bval = &dtup->bt_columns[attno - 1]; |
| |
| /* |
| * If the BRIN tuple indicates that this range is empty, |
| * we can skip it: there's nothing to match. We don't |
| * need to examine the next columns. |
| */ |
| if (dtup->bt_empty_range) |
| { |
| addrange = false; |
| break; |
| } |
| |
| /* |
| * First check if there are any IS [NOT] NULL scan keys, |
| * and if we're violating them. In that case we can |
| * terminate early, without invoking the support function. |
| * |
| * As there may be more keys, we can only determine |
| * mismatch within this loop. |
| */ |
| if (bdesc->bd_info[attno - 1]->oi_regular_nulls && |
| !check_null_keys(bval, nullkeys[attno - 1], |
| nnullkeys[attno - 1])) |
| { |
| /* |
| * If any of the IS [NOT] NULL keys failed, the page |
| * range as a whole can't pass. So terminate the loop. |
| */ |
| addrange = false; |
| break; |
| } |
| |
| /* |
| * So either there are no IS [NOT] NULL keys, or all |
| * passed. If there are no regular scan keys, we're done - |
| * the page range matches. If there are regular keys, but |
| * the page range is marked as 'all nulls' it can't |
| * possibly pass (we're assuming the operators are |
| * strict). |
| */ |
| |
| /* No regular scan keys - page range as a whole passes. */ |
| if (!nkeys[attno - 1]) |
| continue; |
| |
| Assert((nkeys[attno - 1] > 0) && |
| (nkeys[attno - 1] <= scan->numberOfKeys)); |
| |
| /* If it is all nulls, it cannot possibly be consistent. */ |
| if (bval->bv_allnulls) |
| { |
| addrange = false; |
| break; |
| } |
| |
| /* |
| * Collation from the first key (has to be the same for |
| * all keys for the same attribute). |
| */ |
| collation = keys[attno - 1][0]->sk_collation; |
| |
| /* |
| * Check whether the scan key is consistent with the page |
| * range values; if so, have the pages in the range added |
| * to the output bitmap. |
| * |
| * The opclass may or may not support processing of |
| * multiple scan keys. We can determine that based on the |
| * number of arguments - functions with extra parameter |
| * (number of scan keys) do support this, otherwise we |
| * have to simply pass the scan keys one by one. |
| */ |
| if (consistentFn[attno - 1].fn_nargs >= 4) |
| { |
| /* Check all keys at once */ |
| add = FunctionCall4Coll(&consistentFn[attno - 1], |
| collation, |
| PointerGetDatum(bdesc), |
| PointerGetDatum(bval), |
| PointerGetDatum(keys[attno - 1]), |
| Int32GetDatum(nkeys[attno - 1])); |
| addrange = DatumGetBool(add); |
| } |
| else |
| { |
| /* |
| * Check keys one by one |
| * |
| * When there are multiple scan keys, failure to meet |
| * the criteria for a single one of them is enough to |
| * discard the range as a whole, so break out of the |
| * loop as soon as a false return value is obtained. |
| */ |
| int keyno; |
| |
| for (keyno = 0; keyno < nkeys[attno - 1]; keyno++) |
| { |
| add = FunctionCall3Coll(&consistentFn[attno - 1], |
| keys[attno - 1][keyno]->sk_collation, |
| PointerGetDatum(bdesc), |
| PointerGetDatum(bval), |
| PointerGetDatum(keys[attno - 1][keyno])); |
| addrange = DatumGetBool(add); |
| if (!addrange) |
| break; |
| } |
| } |
| } |
| } |
| } |
| |
| /* add the pages in the range to the output bitmap, if needed */ |
| if (addrange) |
| { |
| BlockNumber pageno; |
| |
| for (pageno = heapBlk; |
| pageno <= Min(endblknum, heapBlk + opaque->bo_pagesPerRange) - 1; |
| pageno++) |
| { |
| MemoryContextSwitchTo(oldcxt); |
| tbm_add_page(tbm, pageno); |
| totalpages++; |
| MemoryContextSwitchTo(perRangeCxt); |
| |
| SIMPLE_FAULT_INJECTOR("brin_bitmap_page_added"); |
| } |
| } |
| } |
| |
| /* outer loop end */ |
| } |
| |
| MemoryContextSwitchTo(oldcxt); |
| MemoryContextDelete(perRangeCxt); |
| pfree(sequences); |
| |
| if (buf != InvalidBuffer) |
| ReleaseBuffer(buf); |
| |
| /* |
| * XXX We have an approximation of the number of *pages* that our scan |
| * returns, but we don't have a precise idea of the number of heap tuples |
| * involved. |
| */ |
| return totalpages * 10; |
| } |
| |
| /* |
| * Re-initialize state for a BRIN index scan |
| */ |
| void |
| brinrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, |
| ScanKey orderbys, int norderbys) |
| { |
| /* |
| * Other index AMs preprocess the scan keys at this point, or sometime |
| * early during the scan; this lets them optimize by removing redundant |
| * keys, or doing early returns when they are impossible to satisfy; see |
| * _bt_preprocess_keys for an example. Something like that could be added |
| * here someday, too. |
| */ |
| |
| if (scankey && scan->numberOfKeys > 0) |
| memmove(scan->keyData, scankey, |
| scan->numberOfKeys * sizeof(ScanKeyData)); |
| } |
| |
| /* |
| * Close down a BRIN index scan |
| */ |
| void |
| brinendscan(IndexScanDesc scan) |
| { |
| BrinOpaque *opaque = (BrinOpaque *) scan->opaque; |
| |
| brinRevmapTerminate(opaque->bo_rmAccess); |
| brin_free_desc(opaque->bo_bdesc); |
| pfree(opaque); |
| } |
| |
| /* |
| * Per-heap-tuple callback for table_index_build_scan. |
| * |
| * Note we don't worry about the page range at the end of the table here; it is |
| * present in the build state struct after we're called the last time, but not |
| * inserted into the index. Caller must ensure to do so, if appropriate. |
| */ |
| static void |
| brinbuildCallback(Relation index, |
| ItemPointer tid, |
| Datum *values, |
| bool *isnull, |
| bool tupleIsAlive, |
| void *brstate) |
| { |
| BrinBuildState *state = (BrinBuildState *) brstate; |
| BlockNumber thisblock; |
| |
| thisblock = ItemPointerGetBlockNumber(tid); |
| |
| /* |
| * If we're in a block that belongs to a future range, summarize what |
| * we've got and start afresh. Note the scan might have skipped many |
| * pages, if they were devoid of live tuples; make sure to insert index |
| * tuples for those too. |
| */ |
| |
| /* |
| * GPDB: Adjust build state depending on latest logical heap block |
| * |
| * XXX: We can move this out of brinbuildCallback() if we refactor |
| * brinbuild() to loop over BlockSequences, much like we do in |
| * bringetbitmap() and brinsummarize(). |
| * We would also be able to get rid of BrinBuildState.bs_seq_reltuples. |
| */ |
| if (state->bs_isAO) |
| { |
| BlockNumber seqStartBlk = AOHeapBlockGet_startHeapBlock(thisblock); |
| |
| if (state->bs_currRangeStart < seqStartBlk) |
| { |
| /* We are starting a new block sequence */ |
| int seqNum; |
| |
| /* process the final batch in the current block sequence (if any) */ |
| if (state->bs_aoHasDataTuple) |
| form_and_insert_tuple(state); |
| |
| /* adjust the current block sequence */ |
| seqNum = AOSegmentGet_blockSequenceNum(thisblock); |
| brinRevmapAOPositionAtStart(state->bs_rmAccess, seqNum); |
| |
| /* readjust the range lower bound */ |
| state->bs_currRangeStart = seqStartBlk; |
| |
| /* re-initialize state for it */ |
| brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc); |
| } |
| } |
| |
| while (thisblock > state->bs_currRangeStart + state->bs_pagesPerRange - 1) |
| { |
| |
| BRIN_elog((DEBUG2, |
| "brinbuildCallback: completed a range: %u--%u", |
| state->bs_currRangeStart, |
| state->bs_currRangeStart + state->bs_pagesPerRange)); |
| |
| /* create the index tuple and insert it */ |
| form_and_insert_tuple(state); |
| |
| /* set state to correspond to the next range */ |
| /* XXX: This needs clamping for AO/CO tables for seg i full case. */ |
| state->bs_currRangeStart += state->bs_pagesPerRange; |
| |
| /* re-initialize state for it */ |
| brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc); |
| } |
| |
| /* Accumulate the current tuple into the running state */ |
| (void) add_values_to_range(index, state->bs_bdesc, state->bs_dtuple, |
| values, isnull); |
| /* GPDB: Additional accounting in the build state for AO/CO relations */ |
| state->bs_aoHasDataTuple = true; |
| } |
| |
| /* |
| * brinbuild() -- build a new BRIN index. |
| */ |
| IndexBuildResult * |
| brinbuild(Relation heap, Relation index, IndexInfo *indexInfo) |
| { |
| IndexBuildResult *result; |
| double reltuples; |
| double idxtuples; |
| BrinRevmap *revmap; |
| BrinBuildState *state; |
| Buffer meta; |
| BlockNumber pagesPerRange; |
| bool isAO; |
| |
| isAO = RelationStorageIsAO(heap); |
| /* |
| * We expect to be called exactly once for any index relation. |
| */ |
| if (RelationGetNumberOfBlocks(index) != 0) |
| elog(ERROR, "index \"%s\" already contains data", |
| RelationGetRelationName(index)); |
| |
| /* |
| * Critical section not required, because on error the creation of the |
| * whole relation will be rolled back. |
| */ |
| |
| meta = ReadBuffer(index, P_NEW); |
| Assert(BufferGetBlockNumber(meta) == BRIN_METAPAGE_BLKNO); |
| LockBuffer(meta, BUFFER_LOCK_EXCLUSIVE); |
| |
| brin_metapage_init(BufferGetPage(meta), BrinGetPagesPerRange(index), |
| BRIN_CURRENT_VERSION, RelationStorageIsAO(heap)); |
| MarkBufferDirty(meta); |
| |
| if (RelationNeedsWAL(index)) |
| { |
| xl_brin_createidx xlrec; |
| XLogRecPtr recptr; |
| Page page; |
| |
| xlrec.version = BRIN_CURRENT_VERSION; |
| xlrec.pagesPerRange = BrinGetPagesPerRange(index); |
| xlrec.isAO = isAO; |
| |
| XLogBeginInsert(); |
| XLogRegisterData((char *) &xlrec, SizeOfBrinCreateIdx); |
| XLogRegisterBuffer(0, meta, REGBUF_WILL_INIT | REGBUF_STANDARD); |
| |
| recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_CREATE_INDEX); |
| |
| page = BufferGetPage(meta); |
| PageSetLSN(page, recptr); |
| } |
| |
| UnlockReleaseBuffer(meta); |
| |
| /* |
| * Initialize our state, including the deformed tuple state. |
| */ |
| revmap = brinRevmapInitialize(index, &pagesPerRange, NULL); |
| state = initialize_brin_buildstate(index, revmap, pagesPerRange, isAO); |
| |
| /* GPDB: AO/CO tables: position iterator to start of sequence 0's chain. */ |
| brinRevmapAOPositionAtStart(revmap, 0); |
| |
| /* |
| * Now scan the relation. No syncscan allowed here because we want the |
| * heap blocks in physical order. |
| */ |
| reltuples = table_index_build_scan(heap, index, indexInfo, false, true, |
| brinbuildCallback, (void *) state, NULL); |
| |
| /* process the final batch */ |
| /* |
| * GPDB: Avoid this for AO/CO tables with no rows. We opt to not create a |
| * revmap page and data page with a placeholder tuple for empty relations, |
| * as is done for heap. If we did, we would have to do so for all 128 |
| * possible block sequences, creating unnecessary bloat. |
| */ |
| if (!isAO || state->bs_aoHasDataTuple) |
| form_and_insert_tuple(state); |
| |
| /* release resources */ |
| idxtuples = state->bs_numtuples; |
| brinRevmapTerminate(state->bs_rmAccess); |
| terminate_brin_buildstate(state); |
| |
| /* |
| * Return statistics |
| */ |
| result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); |
| |
| result->heap_tuples = reltuples; |
| result->index_tuples = idxtuples; |
| |
| return result; |
| } |
| |
| void |
| brinbuildempty(Relation index) |
| { |
| Buffer metabuf; |
| |
| /* An empty BRIN index has a metapage only. */ |
| metabuf = |
| ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL); |
| LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); |
| |
| /* Initialize and xlog metabuffer. */ |
| START_CRIT_SECTION(); |
| brin_metapage_init(BufferGetPage(metabuf), BrinGetPagesPerRange(index), |
| BRIN_CURRENT_VERSION, false); |
| MarkBufferDirty(metabuf); |
| log_newpage_buffer(metabuf, true); |
| END_CRIT_SECTION(); |
| |
| UnlockReleaseBuffer(metabuf); |
| } |
| |
| /* |
| * brinbulkdelete |
| * Since there are no per-heap-tuple index tuples in BRIN indexes, |
| * there's not a lot we can do here. |
| * |
| * XXX we could mark item tuples as "dirty" (when a minimum or maximum heap |
| * tuple is deleted), meaning the need to re-run summarization on the affected |
| * range. Would need to add an extra flag in brintuples for that. |
| */ |
| IndexBulkDeleteResult * |
| brinbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, |
| IndexBulkDeleteCallback callback, void *callback_state) |
| { |
| /* allocate stats if first time through, else re-use existing struct */ |
| if (stats == NULL) |
| stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); |
| |
| return stats; |
| } |
| |
| /* |
| * This routine is in charge of "vacuuming" a BRIN index: we just summarize |
| * ranges that are currently unsummarized. |
| */ |
| IndexBulkDeleteResult * |
| brinvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) |
| { |
| Relation heapRel; |
| |
| /* No-op in ANALYZE ONLY mode */ |
| if (info->analyze_only) |
| return stats; |
| |
| if (!stats) |
| stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); |
| stats->num_pages = RelationGetNumberOfBlocks(info->index); |
| /* rest of stats is initialized by zeroing */ |
| |
| heapRel = table_open(IndexGetRelation(RelationGetRelid(info->index), false), |
| AccessShareLock); |
| |
| brin_vacuum_scan(info->index, info->strategy); |
| |
| brinsummarize(info->index, heapRel, BRIN_ALL_BLOCKRANGES, false, |
| &stats->num_index_tuples, &stats->num_index_tuples); |
| |
| table_close(heapRel, AccessShareLock); |
| |
| return stats; |
| } |
| |
| /* |
| * reloptions processor for BRIN indexes |
| */ |
| bytea * |
| brinoptions(Datum reloptions, bool validate) |
| { |
| static const relopt_parse_elt tab[] = { |
| {"pages_per_range", RELOPT_TYPE_INT, offsetof(BrinOptions, pagesPerRange)}, |
| {"autosummarize", RELOPT_TYPE_BOOL, offsetof(BrinOptions, autosummarize)} |
| }; |
| |
| return (bytea *) build_reloptions(reloptions, validate, |
| RELOPT_KIND_BRIN, |
| sizeof(BrinOptions), |
| tab, lengthof(tab)); |
| } |
| |
| /* |
| * SQL-callable function to scan through an index and summarize all ranges |
| * that are not currently summarized. |
| */ |
| Datum |
| brin_summarize_new_values_internal(PG_FUNCTION_ARGS) |
| { |
| Datum relation = PG_GETARG_DATUM(0); |
| |
| return DirectFunctionCall2(brin_summarize_range_internal, |
| relation, |
| Int64GetDatum((int64) BRIN_ALL_BLOCKRANGES)); |
| } |
| |
| /* |
| * SQL-callable function to summarize the indicated page range, if not already |
| * summarized. If the second argument is BRIN_ALL_BLOCKRANGES, all |
| * unsummarized ranges are summarized. |
| */ |
| Datum |
| brin_summarize_range_internal(PG_FUNCTION_ARGS) |
| { |
| Oid indexoid = PG_GETARG_OID(0); |
| int64 heapBlk64 = PG_GETARG_INT64(1); |
| BlockNumber heapBlk; |
| Oid heapoid; |
| Relation indexRel; |
| Relation heapRel; |
| Oid save_userid; |
| int save_sec_context; |
| int save_nestlevel; |
| double numSummarized = 0; |
| |
| if (RecoveryInProgress()) |
| ereport(ERROR, |
| (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), |
| errmsg("recovery is in progress"), |
| errhint("BRIN control functions cannot be executed during recovery."))); |
| |
| if (heapBlk64 > BRIN_ALL_BLOCKRANGES || heapBlk64 < 0) |
| { |
| char *blk = psprintf(INT64_FORMAT, heapBlk64); |
| |
| ereport(ERROR, |
| (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), |
| errmsg("block number out of range: %s", blk))); |
| } |
| heapBlk = (BlockNumber) heapBlk64; |
| |
| /* |
| * We must lock table before index to avoid deadlocks. However, if the |
| * passed indexoid isn't an index then IndexGetRelation() will fail. |
| * Rather than emitting a not-very-helpful error message, postpone |
| * complaining, expecting that the is-it-an-index test below will fail. |
| */ |
| heapoid = IndexGetRelation(indexoid, true); |
| if (OidIsValid(heapoid)) |
| { |
| heapRel = table_open(heapoid, ShareUpdateExclusiveLock); |
| |
| /* |
| * Autovacuum calls us. For its benefit, switch to the table owner's |
| * userid, so that any index functions are run as that user. Also |
| * lock down security-restricted operations and arrange to make GUC |
| * variable changes local to this command. This is harmless, albeit |
| * unnecessary, when called from SQL, because we fail shortly if the |
| * user does not own the index. |
| */ |
| GetUserIdAndSecContext(&save_userid, &save_sec_context); |
| SetUserIdAndSecContext(heapRel->rd_rel->relowner, |
| save_sec_context | SECURITY_RESTRICTED_OPERATION); |
| save_nestlevel = NewGUCNestLevel(); |
| } |
| else |
| { |
| heapRel = NULL; |
| /* Set these just to suppress "uninitialized variable" warnings */ |
| save_userid = InvalidOid; |
| save_sec_context = -1; |
| save_nestlevel = -1; |
| } |
| |
| indexRel = index_open(indexoid, ShareUpdateExclusiveLock); |
| |
| /* Must be a BRIN index */ |
| if (indexRel->rd_rel->relkind != RELKIND_INDEX || |
| !IsIndexAccessMethod(indexRel->rd_rel->relam, BRIN_AM_OID)) |
| ereport(ERROR, |
| (errcode(ERRCODE_WRONG_OBJECT_TYPE), |
| errmsg("\"%s\" is not a BRIN index", |
| RelationGetRelationName(indexRel)))); |
| |
| /* User must own the index (comparable to privileges needed for VACUUM) */ |
| if (heapRel != NULL && !pg_class_ownercheck(indexoid, save_userid)) |
| aclcheck_error(ACLCHECK_NOT_OWNER, OBJECT_INDEX, |
| RelationGetRelationName(indexRel)); |
| |
| /* |
| * Since we did the IndexGetRelation call above without any lock, it's |
| * barely possible that a race against an index drop/recreation could have |
| * netted us the wrong table. Recheck. |
| */ |
| if (heapRel == NULL || heapoid != IndexGetRelation(indexoid, false)) |
| ereport(ERROR, |
| (errcode(ERRCODE_UNDEFINED_TABLE), |
| errmsg("could not open parent table of index \"%s\"", |
| RelationGetRelationName(indexRel)))); |
| |
| /* OK, do it */ |
| brinsummarize(indexRel, heapRel, heapBlk, true, &numSummarized, NULL); |
| |
| /* Roll back any GUC changes executed by index functions */ |
| AtEOXact_GUC(false, save_nestlevel); |
| |
| /* Restore userid and security context */ |
| SetUserIdAndSecContext(save_userid, save_sec_context); |
| |
| relation_close(indexRel, ShareUpdateExclusiveLock); |
| relation_close(heapRel, ShareUpdateExclusiveLock); |
| |
| PG_RETURN_INT32((int32) numSummarized); |
| } |
| |
| /* |
| * SQL-callable interface to mark a range as no longer summarized |
| */ |
| Datum |
| brin_desummarize_range(PG_FUNCTION_ARGS) |
| { |
| Oid indexoid = PG_GETARG_OID(0); |
| int64 heapBlk64 = PG_GETARG_INT64(1); |
| BlockNumber heapBlk; |
| Oid heapoid; |
| Relation heapRel; |
| Relation indexRel; |
| bool done; |
| |
| if (RecoveryInProgress()) |
| ereport(ERROR, |
| (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), |
| errmsg("recovery is in progress"), |
| errhint("BRIN control functions cannot be executed during recovery."))); |
| |
| if (heapBlk64 > MaxBlockNumber || heapBlk64 < 0) |
| { |
| char *blk = psprintf(INT64_FORMAT, heapBlk64); |
| |
| ereport(ERROR, |
| (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), |
| errmsg("block number out of range: %s", blk))); |
| } |
| heapBlk = (BlockNumber) heapBlk64; |
| |
| /* |
| * We must lock table before index to avoid deadlocks. However, if the |
| * passed indexoid isn't an index then IndexGetRelation() will fail. |
| * Rather than emitting a not-very-helpful error message, postpone |
| * complaining, expecting that the is-it-an-index test below will fail. |
| * |
| * Unlike brin_summarize_range(), autovacuum never calls this. Hence, we |
| * don't switch userid. |
| */ |
| heapoid = IndexGetRelation(indexoid, true); |
| if (OidIsValid(heapoid)) |
| heapRel = table_open(heapoid, ShareUpdateExclusiveLock); |
| else |
| heapRel = NULL; |
| |
| indexRel = index_open(indexoid, ShareUpdateExclusiveLock); |
| |
| /* Must be a BRIN index */ |
| if (indexRel->rd_rel->relkind != RELKIND_INDEX || |
| !IsIndexAccessMethod(indexRel->rd_rel->relam, BRIN_AM_OID)) |
| ereport(ERROR, |
| (errcode(ERRCODE_WRONG_OBJECT_TYPE), |
| errmsg("\"%s\" is not a BRIN index", |
| RelationGetRelationName(indexRel)))); |
| |
| /* User must own the index (comparable to privileges needed for VACUUM) */ |
| if (!pg_class_ownercheck(indexoid, GetUserId())) |
| aclcheck_error(ACLCHECK_NOT_OWNER, OBJECT_INDEX, |
| RelationGetRelationName(indexRel)); |
| |
| /* |
| * Since we did the IndexGetRelation call above without any lock, it's |
| * barely possible that a race against an index drop/recreation could have |
| * netted us the wrong table. Recheck. |
| */ |
| if (heapRel == NULL || heapoid != IndexGetRelation(indexoid, false)) |
| ereport(ERROR, |
| (errcode(ERRCODE_UNDEFINED_TABLE), |
| errmsg("could not open parent table of index \"%s\"", |
| RelationGetRelationName(indexRel)))); |
| |
| /* the revmap does the hard work */ |
| do |
| { |
| done = brinRevmapDesummarizeRange(indexRel, heapBlk); |
| } |
| while (!done); |
| |
| relation_close(indexRel, ShareUpdateExclusiveLock); |
| relation_close(heapRel, ShareUpdateExclusiveLock); |
| |
| PG_RETURN_VOID(); |
| } |
| |
| /* |
| * Build a BrinDesc used to create or scan a BRIN index |
| */ |
| BrinDesc * |
| brin_build_desc(Relation rel) |
| { |
| BrinOpcInfo **opcinfo; |
| BrinDesc *bdesc; |
| TupleDesc tupdesc; |
| int totalstored = 0; |
| int keyno; |
| long totalsize; |
| MemoryContext cxt; |
| MemoryContext oldcxt; |
| |
| cxt = AllocSetContextCreate(CurrentMemoryContext, |
| "brin desc cxt", |
| ALLOCSET_SMALL_SIZES); |
| oldcxt = MemoryContextSwitchTo(cxt); |
| tupdesc = RelationGetDescr(rel); |
| |
| /* |
| * Obtain BrinOpcInfo for each indexed column. While at it, accumulate |
| * the number of columns stored, since the number is opclass-defined. |
| */ |
| opcinfo = (BrinOpcInfo **) palloc(sizeof(BrinOpcInfo *) * tupdesc->natts); |
| for (keyno = 0; keyno < tupdesc->natts; keyno++) |
| { |
| FmgrInfo *opcInfoFn; |
| Form_pg_attribute attr = TupleDescAttr(tupdesc, keyno); |
| |
| opcInfoFn = index_getprocinfo(rel, keyno + 1, BRIN_PROCNUM_OPCINFO); |
| |
| opcinfo[keyno] = (BrinOpcInfo *) |
| DatumGetPointer(FunctionCall1(opcInfoFn, attr->atttypid)); |
| totalstored += opcinfo[keyno]->oi_nstored; |
| } |
| |
| /* Allocate our result struct and fill it in */ |
| totalsize = offsetof(BrinDesc, bd_info) + |
| sizeof(BrinOpcInfo *) * tupdesc->natts; |
| |
| bdesc = palloc(totalsize); |
| bdesc->bd_context = cxt; |
| bdesc->bd_index = rel; |
| bdesc->bd_tupdesc = tupdesc; |
| bdesc->bd_disktdesc = NULL; /* generated lazily */ |
| bdesc->bd_totalstored = totalstored; |
| |
| for (keyno = 0; keyno < tupdesc->natts; keyno++) |
| bdesc->bd_info[keyno] = opcinfo[keyno]; |
| pfree(opcinfo); |
| |
| MemoryContextSwitchTo(oldcxt); |
| |
| return bdesc; |
| } |
| |
| void |
| brin_free_desc(BrinDesc *bdesc) |
| { |
| /* make sure the tupdesc is still valid */ |
| Assert(bdesc->bd_tupdesc->tdrefcount >= 1); |
| /* no need for retail pfree */ |
| MemoryContextDelete(bdesc->bd_context); |
| } |
| |
| /* |
| * Fetch index's statistical data into *stats |
| */ |
| void |
| brinGetStats(Relation index, BrinStatsData *stats) |
| { |
| Buffer metabuffer; |
| Page metapage; |
| BrinMetaPageData *metadata; |
| |
| metabuffer = ReadBuffer(index, BRIN_METAPAGE_BLKNO); |
| LockBuffer(metabuffer, BUFFER_LOCK_SHARE); |
| metapage = BufferGetPage(metabuffer); |
| metadata = (BrinMetaPageData *) PageGetContents(metapage); |
| |
| stats->pagesPerRange = metadata->pagesPerRange; |
| |
| /* |
| * GPDB: Since planning is done on the QD and since there is no data on the QD, |
| * there are no revmap pages on the QD. So, it is currently not possible to get |
| * an estimate on the number of revmap pages (since we want to avoid dispatching |
| * during planning). |
| * |
| * For AO/CO tables, the following wouldn't be applicable anyway (we would have |
| * to look at the revmap chains etc). |
| * |
| * Even though we are unable to get an estimate on the number of revmap pages, |
| * it works out fine for AO/CO tables as these pages get treated like data pages |
| * (i.e. they are costed as random access), as well as they should be (due to |
| * chaining, please refer to the BRIN README). For heap tables, we end up losing |
| * out a little as we would be costing a BRIN plan higher, due to this limitation. |
| */ |
| #if 0 |
| stats->revmapNumPages = metadata->lastRevmapPage - 1; |
| #endif |
| stats->revmapNumPages = 0; |
| |
| UnlockReleaseBuffer(metabuffer); |
| } |
| |
| /* |
| * Initialize a BrinBuildState appropriate to create tuples on the given index. |
| */ |
| static BrinBuildState * |
| initialize_brin_buildstate(Relation idxRel, BrinRevmap *revmap, |
| BlockNumber pagesPerRange, bool isAO) |
| { |
| BrinBuildState *state; |
| |
| state = palloc(sizeof(BrinBuildState)); |
| |
| state->bs_irel = idxRel; |
| state->bs_numtuples = 0; |
| state->bs_currentInsertBuf = InvalidBuffer; |
| state->bs_pagesPerRange = pagesPerRange; |
| state->bs_currRangeStart = 0; |
| state->bs_rmAccess = revmap; |
| state->bs_bdesc = brin_build_desc(idxRel); |
| state->bs_dtuple = brin_new_memtuple(state->bs_bdesc); |
| |
| /* GPDB specific state for AO/CO tables */ |
| state->bs_isAO = isAO; |
| state->bs_aoHasDataTuple = false; |
| |
| brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc); |
| |
| return state; |
| } |
| |
| /* |
| * Release resources associated with a BrinBuildState. |
| */ |
| static void |
| terminate_brin_buildstate(BrinBuildState *state) |
| { |
| /* |
| * Release the last index buffer used. We might as well ensure that |
| * whatever free space remains in that page is available in FSM, too. |
| */ |
| if (!BufferIsInvalid(state->bs_currentInsertBuf)) |
| { |
| Page page; |
| Size freespace; |
| BlockNumber blk; |
| |
| page = BufferGetPage(state->bs_currentInsertBuf); |
| freespace = PageGetFreeSpace(page); |
| blk = BufferGetBlockNumber(state->bs_currentInsertBuf); |
| ReleaseBuffer(state->bs_currentInsertBuf); |
| RecordPageWithFreeSpace(state->bs_irel, blk, freespace); |
| FreeSpaceMapVacuumRange(state->bs_irel, blk, blk + 1); |
| } |
| |
| brin_free_desc(state->bs_bdesc); |
| pfree(state->bs_dtuple); |
| pfree(state); |
| } |
| |
| /* |
| * On the given BRIN index, summarize the heap page range that corresponds |
| * to the heap block number given. |
| * |
| * This routine can run in parallel with insertions into the heap. To avoid |
| * missing those values from the summary tuple, we first insert a placeholder |
| * index tuple into the index, then execute the heap scan; transactions |
| * concurrent with the scan update the placeholder tuple. After the scan, we |
| * union the placeholder tuple with the one computed by this routine. The |
| * update of the index value happens in a loop, so that if somebody updates |
| * the placeholder tuple after we read it, we detect the case and try again. |
| * This ensures that the concurrently inserted tuples are not lost. |
| * |
| * A further corner case is this routine being asked to summarize the partial |
| * range at the end of the table. heapNumBlocks is the (possibly outdated) |
| * table size; if we notice that the requested range lies beyond that size, |
| * we re-compute the table size after inserting the placeholder tuple, to |
| * avoid missing pages that were appended recently. |
| * |
| * GPDB: Since we have to support the notion of BlockSequences, heapNumBlks |
| * actually behaves as the ending block for the block sequence within which the |
| * supplied range lies, instead of the number of blocks in the relation. We |
| * don't rename the variable to avoid merge conflicts. |
| */ |
| static void |
| summarize_range(IndexInfo *indexInfo, BrinBuildState *state, Relation heapRel, |
| BlockNumber heapBlk, BlockNumber heapNumBlks) |
| { |
| Buffer phbuf; |
| BrinTuple *phtup; |
| Size phsz; |
| OffsetNumber offset; |
| BlockNumber scanNumBlks; |
| |
| /* |
| * Insert the placeholder tuple |
| */ |
| phbuf = InvalidBuffer; |
| phtup = brin_form_placeholder_tuple(state->bs_bdesc, heapBlk, &phsz); |
| offset = brin_doinsert(state->bs_irel, state->bs_pagesPerRange, |
| state->bs_rmAccess, &phbuf, |
| heapBlk, phtup, phsz); |
| |
| /* |
| * Compute range end. We hold ShareUpdateExclusive lock on table, so it |
| * cannot shrink concurrently (but it can grow). |
| * |
| * GPDB: The following assert only applies to heap tables, as for AO/CO |
| * tables, heapBlk need not be a multiple of bs_pagesPerRange. |
| */ |
| AssertImply(RelationIsHeap(heapRel), heapBlk % state->bs_pagesPerRange == 0); |
| if (heapBlk + state->bs_pagesPerRange > heapNumBlks) |
| { |
| BlockSequence blockSequence; |
| BlockNumber endblknum; |
| |
| SIMPLE_FAULT_INJECTOR("summarize_last_partial_range"); |
| |
| table_relation_get_block_sequence(heapRel, heapBlk, &blockSequence); |
| endblknum = blockSequence.startblknum + blockSequence.nblocks; |
| |
| /* |
| * If we're asked to scan what we believe to be the final range on the |
| * table (i.e. a range that might be partial) we need to recompute our |
| * idea of what the latest page is after inserting the placeholder |
| * tuple. Anyone that grows the table later will update the |
| * placeholder tuple, so it doesn't matter that we won't scan these |
| * pages ourselves. Careful: the table might have been extended |
| * beyond the current range, so clamp our result. |
| * |
| * Fortunately, this should occur infrequently. |
| */ |
| |
| if (endblknum != heapNumBlks && RelationStorageIsAO(heapRel)) |
| { |
| /* |
| * GPDB: We bail and don't summarize the final partial range if we |
| * find that the final range was extended (by another inserting |
| * transaction) while we are summarizing here. Currently, we don't |
| * have the support to handle the "any visible" mode described below |
| * in the appendonly AMs. This is why we need to bail. |
| */ |
| brin_free_tuple(phtup); |
| ReleaseBuffer(phbuf); |
| return; |
| } |
| |
| scanNumBlks = Min(endblknum - heapBlk, |
| state->bs_pagesPerRange); |
| } |
| else |
| { |
| /* Easy case: range is known to be complete */ |
| scanNumBlks = state->bs_pagesPerRange; |
| } |
| |
| /* |
| * Execute the partial heap scan covering the heap blocks in the specified |
| * page range, summarizing the heap tuples in it. This scan stops just |
| * short of brinbuildCallback creating the new index entry. |
| * |
| * Note that it is critical we use the "any visible" mode of |
| * table_index_build_range_scan here: otherwise, we would miss tuples |
| * inserted by transactions that are still in progress, among other corner |
| * cases. |
| */ |
| state->bs_currRangeStart = heapBlk; |
| table_index_build_range_scan(heapRel, state->bs_irel, indexInfo, false, true, false, |
| heapBlk, scanNumBlks, |
| brinbuildCallback, (void *) state, NULL); |
| |
| /* |
| * Now we update the values obtained by the scan with the placeholder |
| * tuple. We do this in a loop which only terminates if we're able to |
| * update the placeholder tuple successfully; if we are not, this means |
| * somebody else modified the placeholder tuple after we read it. |
| */ |
| for (;;) |
| { |
| BrinTuple *newtup; |
| Size newsize; |
| bool didupdate; |
| bool samepage; |
| |
| CHECK_FOR_INTERRUPTS(); |
| |
| /* |
| * Update the summary tuple and try to update. |
| */ |
| newtup = brin_form_tuple(state->bs_bdesc, |
| heapBlk, state->bs_dtuple, &newsize); |
| samepage = brin_can_do_samepage_update(phbuf, phsz, newsize); |
| didupdate = |
| brin_doupdate(state->bs_irel, state->bs_pagesPerRange, |
| state->bs_rmAccess, heapBlk, phbuf, offset, |
| phtup, phsz, newtup, newsize, samepage); |
| brin_free_tuple(phtup); |
| brin_free_tuple(newtup); |
| |
| /* If the update succeeded, we're done. */ |
| if (didupdate) |
| break; |
| |
| /* |
| * If the update didn't work, it might be because somebody updated the |
| * placeholder tuple concurrently. Extract the new version, union it |
| * with the values we have from the scan, and start over. (There are |
| * other reasons for the update to fail, but it's simple to treat them |
| * the same.) |
| */ |
| phtup = brinGetTupleForHeapBlock(state->bs_rmAccess, heapBlk, &phbuf, |
| &offset, &phsz, BUFFER_LOCK_SHARE, |
| NULL); |
| /* the placeholder tuple must exist */ |
| if (phtup == NULL) |
| elog(ERROR, "missing placeholder tuple"); |
| phtup = brin_copy_tuple(phtup, phsz, NULL, NULL); |
| LockBuffer(phbuf, BUFFER_LOCK_UNLOCK); |
| |
| /* merge it into the tuple from the heap scan */ |
| union_tuples(state->bs_bdesc, state->bs_dtuple, phtup); |
| } |
| |
| ReleaseBuffer(phbuf); |
| } |
| |
| /* |
| * Summarize page ranges that are not already summarized. If pageRange is |
| * BRIN_ALL_BLOCKRANGES then the whole table is scanned; otherwise, only the |
| * page range containing the given heap page number is scanned. |
| * If include_partial is true, then the partial range at the end of the table |
| * is summarized, otherwise not. |
| * |
| * For each new index tuple inserted, *numSummarized (if not NULL) is |
| * incremented; for each existing tuple, *numExisting (if not NULL) is |
| * incremented. |
| */ |
| static void |
| brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange, |
| bool include_partial, double *numSummarized, double *numExisting) |
| { |
| BrinRevmap *revmap; |
| BrinBuildState *state = NULL; |
| IndexInfo *indexInfo = NULL; |
| BlockNumber pagesPerRange; |
| Buffer buf; |
| |
| /* GPDB: Used for iterating over the revmap */ |
| int numSequences; |
| BlockSequence sequence; |
| BlockSequence *sequences = NULL; |
| BlockNumber startBlk = InvalidBlockNumber; |
| BlockNumber endBlk = InvalidBlockNumber; |
| |
| revmap = brinRevmapInitialize(index, &pagesPerRange, NULL); |
| |
| /* determine sequence(s) of pages to process */ |
| if (pageRange == BRIN_ALL_BLOCKRANGES) |
| sequences = table_relation_get_block_sequences(heapRel, |
| &numSequences); |
| else |
| { |
| /* For specific range summarization, use targeted API for efficiency */ |
| table_relation_get_block_sequence(heapRel, pageRange, &sequence); |
| numSequences = 1; |
| } |
| buf = InvalidBuffer; |
| |
| /* |
| * GPDB: We have the notion of BlockSequences to keep the following code |
| * section unified for AO/CO vs heap tables. Heap tables have only 1 |
| * block sequence, whereas AO/CO tables may have up to AOTupleId_MaxSegmentFileNum |
| * number of such sequences. The outer loop is thus a GPDB addition, whereas |
| * the inner one mostly stays the same (barring offset recalculation for |
| * both the all ranges case and specific range case) |
| */ |
| |
| for (int i = 0; i < numSequences; i++) |
| { |
| /* code in the loop left unindented to prevent merge conflicts */ |
| |
| if (pageRange == BRIN_ALL_BLOCKRANGES) |
| { |
| /* set up the start and end blocks for the next block sequence */ |
| startBlk = sequences[i].startblknum; |
| endBlk = sequences[i].startblknum + sequences[i].nblocks; |
| } |
| else |
| { |
| /* we have to scan the supplied heap block in its specified range */ |
| BlockNumber seqEndBlk; |
| |
| Assert(numSequences == 1); |
| |
| seqEndBlk = sequence.startblknum + sequence.nblocks; |
| startBlk = brin_range_start_blk(pageRange, |
| RelationStorageIsAO(heapRel), |
| pagesPerRange); |
| endBlk = Min(seqEndBlk, startBlk + pagesPerRange); |
| if (startBlk > endBlk) |
| { |
| /* Nothing to do if start point is beyond end of block sequence */ |
| brinRevmapTerminate(revmap); |
| return; |
| } |
| } |
| |
| if (RelationIsAppendOptimized(heapRel)) |
| brinRevmapAOPositionAtStart(revmap, |
| AOSegmentGet_blockSequenceNum(startBlk)); |
| |
| /* |
| * Scan the revmap to find unsummarized items for each block sequence |
| * involved. |
| */ |
| for (; startBlk < endBlk; startBlk += pagesPerRange) |
| { |
| BrinTuple *tup; |
| OffsetNumber off; |
| |
| /* |
| * Unless requested to summarize even a partial range, go away now if |
| * we think the next range is partial. Caller would pass true when it |
| * is typically run once bulk data loading is done |
| * (brin_summarize_new_values), and false when it is typically the |
| * result of arbitrarily-scheduled maintenance command (vacuuming). |
| */ |
| if (!include_partial && |
| (startBlk + pagesPerRange > endBlk)) |
| break; |
| |
| CHECK_FOR_INTERRUPTS(); |
| |
| tup = brinGetTupleForHeapBlock(revmap, startBlk, &buf, &off, NULL, |
| BUFFER_LOCK_SHARE, NULL); |
| if (tup == NULL) |
| { |
| /* no revmap entry for this heap range. Summarize it. */ |
| if (state == NULL) |
| { |
| /* first time through */ |
| Assert(!indexInfo); |
| state = initialize_brin_buildstate(index, revmap, |
| pagesPerRange, |
| RelationIsAppendOptimized(heapRel)); |
| indexInfo = BuildIndexInfo(index); |
| } |
| summarize_range(indexInfo, state, heapRel, startBlk, endBlk); |
| |
| /* and re-initialize state for the next range */ |
| brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc); |
| |
| if (numSummarized) |
| *numSummarized += 1.0; |
| } |
| else |
| { |
| if (numExisting) |
| *numExisting += 1.0; |
| LockBuffer(buf, BUFFER_LOCK_UNLOCK); |
| } |
| } |
| |
| /* outer loop end */ |
| } |
| |
| if (BufferIsValid(buf)) |
| ReleaseBuffer(buf); |
| |
| /* free resources */ |
| brinRevmapTerminate(revmap); |
| if (state) |
| { |
| terminate_brin_buildstate(state); |
| pfree(indexInfo); |
| } |
| if (sequences) |
| pfree(sequences); |
| } |
| |
| /* |
| * Given a deformed tuple in the build state, convert it into the on-disk |
| * format and insert it into the index, making the revmap point to it. |
| */ |
| static void |
| form_and_insert_tuple(BrinBuildState *state) |
| { |
| BrinTuple *tup; |
| Size size; |
| |
| tup = brin_form_tuple(state->bs_bdesc, state->bs_currRangeStart, |
| state->bs_dtuple, &size); |
| brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess, |
| &state->bs_currentInsertBuf, state->bs_currRangeStart, |
| tup, size); |
| state->bs_numtuples++; |
| |
| pfree(tup); |
| } |
| |
| /* |
| * Given two deformed tuples, adjust the first one so that it's consistent |
| * with the summary values in both. |
| */ |
| static void |
| union_tuples(BrinDesc *bdesc, BrinMemTuple *a, BrinTuple *b) |
| { |
| int keyno; |
| BrinMemTuple *db; |
| MemoryContext cxt; |
| MemoryContext oldcxt; |
| |
| /* Use our own memory context to avoid retail pfree */ |
| cxt = AllocSetContextCreate(CurrentMemoryContext, |
| "brin union", |
| ALLOCSET_DEFAULT_SIZES); |
| oldcxt = MemoryContextSwitchTo(cxt); |
| db = brin_deform_tuple(bdesc, b, NULL); |
| MemoryContextSwitchTo(oldcxt); |
| |
| /* |
| * Check if the ranges are empty. |
| * |
| * If at least one of them is empty, we don't need to call per-key union |
| * functions at all. If "b" is empty, we just use "a" as the result (it |
| * might be empty fine, but that's fine). If "a" is empty but "b" is not, |
| * we use "b" as the result (but we have to copy the data into "a" first). |
| * |
| * Only when both ranges are non-empty, we actually do the per-key merge. |
| */ |
| |
| /* If "b" is empty - ignore it and just use "a" (even if it's empty etc.). */ |
| if (db->bt_empty_range) |
| { |
| /* skip the per-key merge */ |
| MemoryContextDelete(cxt); |
| return; |
| } |
| |
| /* |
| * Now we know "b" is not empty. If "a" is empty, then "b" is the result. |
| * But we need to copy the data from "b" to "a" first, because that's how |
| * we pass result out. |
| * |
| * We have to copy all the global/per-key flags etc. too. |
| */ |
| if (a->bt_empty_range) |
| { |
| for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++) |
| { |
| int i; |
| BrinValues *col_a = &a->bt_columns[keyno]; |
| BrinValues *col_b = &db->bt_columns[keyno]; |
| BrinOpcInfo *opcinfo = bdesc->bd_info[keyno]; |
| |
| col_a->bv_allnulls = col_b->bv_allnulls; |
| col_a->bv_hasnulls = col_b->bv_hasnulls; |
| |
| /* If "b" has no data, we're done. */ |
| if (col_b->bv_allnulls) |
| continue; |
| |
| for (i = 0; i < opcinfo->oi_nstored; i++) |
| col_a->bv_values[i] = |
| datumCopy(col_b->bv_values[i], |
| opcinfo->oi_typcache[i]->typbyval, |
| opcinfo->oi_typcache[i]->typlen); |
| } |
| |
| /* "a" started empty, but "b" was not empty, so remember that */ |
| a->bt_empty_range = false; |
| |
| /* skip the per-key merge */ |
| MemoryContextDelete(cxt); |
| return; |
| } |
| |
| /* Now we know neither range is empty. */ |
| for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++) |
| { |
| FmgrInfo *unionFn; |
| BrinValues *col_a = &a->bt_columns[keyno]; |
| BrinValues *col_b = &db->bt_columns[keyno]; |
| BrinOpcInfo *opcinfo = bdesc->bd_info[keyno]; |
| |
| if (opcinfo->oi_regular_nulls) |
| { |
| /* Adjust "hasnulls". */ |
| if (!col_a->bv_hasnulls && col_b->bv_hasnulls) |
| col_a->bv_hasnulls = true; |
| |
| /* If there are no values in B, there's nothing left to do. */ |
| if (col_b->bv_allnulls) |
| continue; |
| |
| /* |
| * Adjust "allnulls". If A doesn't have values, just copy the |
| * values from B into A, and we're done. We cannot run the |
| * operators in this case, because values in A might contain |
| * garbage. Note we already established that B contains values. |
| */ |
| if (col_a->bv_allnulls) |
| { |
| int i; |
| |
| col_a->bv_allnulls = false; |
| |
| for (i = 0; i < opcinfo->oi_nstored; i++) |
| col_a->bv_values[i] = |
| datumCopy(col_b->bv_values[i], |
| opcinfo->oi_typcache[i]->typbyval, |
| opcinfo->oi_typcache[i]->typlen); |
| |
| continue; |
| } |
| } |
| |
| unionFn = index_getprocinfo(bdesc->bd_index, keyno + 1, |
| BRIN_PROCNUM_UNION); |
| FunctionCall3Coll(unionFn, |
| bdesc->bd_index->rd_indcollation[keyno], |
| PointerGetDatum(bdesc), |
| PointerGetDatum(col_a), |
| PointerGetDatum(col_b)); |
| } |
| |
| MemoryContextDelete(cxt); |
| } |
| |
| /* |
| * brin_vacuum_scan |
| * Do a complete scan of the index during VACUUM. |
| * |
| * This routine scans the complete index looking for uncatalogued index pages, |
| * i.e. those that might have been lost due to a crash after index extension |
| * and such. |
| */ |
| static void |
| brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy) |
| { |
| BlockNumber nblocks; |
| BlockNumber blkno; |
| |
| /* |
| * Scan the index in physical order, and clean up any possible mess in |
| * each page. |
| */ |
| nblocks = RelationGetNumberOfBlocks(idxrel); |
| for (blkno = 0; blkno < nblocks; blkno++) |
| { |
| Buffer buf; |
| |
| CHECK_FOR_INTERRUPTS(); |
| |
| buf = ReadBufferExtended(idxrel, MAIN_FORKNUM, blkno, |
| RBM_NORMAL, strategy); |
| |
| brin_page_cleanup(idxrel, buf); |
| |
| ReleaseBuffer(buf); |
| } |
| |
| /* |
| * Update all upper pages in the index's FSM, as well. This ensures not |
| * only that we propagate leaf-page FSM updates made by brin_page_cleanup, |
| * but also that any pre-existing damage or out-of-dateness is repaired. |
| */ |
| FreeSpaceMapVacuum(idxrel); |
| } |
| |
| static bool |
| add_values_to_range(Relation idxRel, BrinDesc *bdesc, BrinMemTuple *dtup, |
| Datum *values, bool *nulls) |
| { |
| int keyno; |
| |
| /* If the range starts empty, we're certainly going to modify it. */ |
| bool modified = dtup->bt_empty_range; |
| |
| /* |
| * Compare the key values of the new tuple to the stored index values; our |
| * deformed tuple will get updated if the new tuple doesn't fit the |
| * original range (note this means we can't break out of the loop early). |
| * Make a note of whether this happens, so that we know to insert the |
| * modified tuple later. |
| */ |
| for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++) |
| { |
| Datum result; |
| BrinValues *bval; |
| FmgrInfo *addValue; |
| bool has_nulls; |
| |
| bval = &dtup->bt_columns[keyno]; |
| |
| /* |
| * Does the range have actual NULL values? Either of the flags can |
| * be set, but we ignore the state before adding first row. |
| * |
| * We have to remember this, because we'll modify the flags and we |
| * need to know if the range started as empty. |
| */ |
| has_nulls = ((!dtup->bt_empty_range) && |
| (bval->bv_hasnulls || bval->bv_allnulls)); |
| |
| /* |
| * If the value we're adding is NULL, handle it locally. Otherwise |
| * call the BRIN_PROCNUM_ADDVALUE procedure. |
| */ |
| if (bdesc->bd_info[keyno]->oi_regular_nulls && nulls[keyno]) |
| { |
| /* |
| * If the new value is null, we record that we saw it if it's the |
| * first one; otherwise, there's nothing to do. |
| */ |
| if (!bval->bv_hasnulls) |
| { |
| bval->bv_hasnulls = true; |
| modified = true; |
| } |
| |
| continue; |
| } |
| |
| addValue = index_getprocinfo(idxRel, keyno + 1, |
| BRIN_PROCNUM_ADDVALUE); |
| result = FunctionCall4Coll(addValue, |
| idxRel->rd_indcollation[keyno], |
| PointerGetDatum(bdesc), |
| PointerGetDatum(bval), |
| values[keyno], |
| nulls[keyno]); |
| /* if that returned true, we need to insert the updated tuple */ |
| modified |= DatumGetBool(result); |
| |
| /* |
| * If the range was had actual NULL values (i.e. did not start empty), |
| * make sure we don't forget about the NULL values. Either the allnulls |
| * flag is still set to true, or (if the opclass cleared it) we need to |
| * set hasnulls=true. |
| * |
| * XXX This can only happen when the opclass modified the tuple, so the |
| * modified flag should be set. |
| */ |
| if (has_nulls && !(bval->bv_hasnulls || bval->bv_allnulls)) |
| { |
| Assert(modified); |
| bval->bv_hasnulls = true; |
| } |
| } |
| |
| /* |
| * After updating summaries for all the keys, mark it as not empty. |
| * |
| * If we're actually changing the flag value (i.e. tuple started as empty), |
| * we should have modified the tuple. So we should not see empty range that |
| * was not modified. |
| */ |
| Assert(!dtup->bt_empty_range || modified); |
| dtup->bt_empty_range = false; |
| |
| return modified; |
| } |
| |
| static bool |
| check_null_keys(BrinValues *bval, ScanKey *nullkeys, int nnullkeys) |
| { |
| int keyno; |
| |
| /* |
| * First check if there are any IS [NOT] NULL scan keys, and if we're |
| * violating them. |
| */ |
| for (keyno = 0; keyno < nnullkeys; keyno++) |
| { |
| ScanKey key = nullkeys[keyno]; |
| |
| Assert(key->sk_attno == bval->bv_attno); |
| |
| /* Handle only IS NULL/IS NOT NULL tests */ |
| if (!(key->sk_flags & SK_ISNULL)) |
| continue; |
| |
| if (key->sk_flags & SK_SEARCHNULL) |
| { |
| /* IS NULL scan key, but range has no NULLs */ |
| if (!bval->bv_allnulls && !bval->bv_hasnulls) |
| return false; |
| } |
| else if (key->sk_flags & SK_SEARCHNOTNULL) |
| { |
| /* |
| * For IS NOT NULL, we can only skip ranges that are known to have |
| * only nulls. |
| */ |
| if (bval->bv_allnulls) |
| return false; |
| } |
| else |
| { |
| /* |
| * Neither IS NULL nor IS NOT NULL was used; assume all indexable |
| * operators are strict and thus return false with NULL value in |
| * the scan key. |
| */ |
| return false; |
| } |
| } |
| |
| return true; |
| } |