| /*------------------------------------------------------------------------- |
| * |
| * gistbuild.c |
| * implementation of the build algorithms for GiST indexes. |
| * |
| * There are two different strategies: |
| * |
| * 1. Sort all input tuples, pack them into GiST leaf pages in the sorted |
| * order, and create downlinks and internal pages as we go. This builds |
| * the index from the bottom up, similar to how B-tree index build |
| * works. |
| * |
| * 2. Start with an empty index, and insert all tuples one by one. |
| * |
| * The sorted method is used if the operator classes for all columns have |
| * a 'sortsupport' defined. Otherwise, we resort to the second strategy. |
| * |
| * The second strategy can optionally use buffers at different levels of |
| * the tree to reduce I/O, see "Buffering build algorithm" in the README |
| * for a more detailed explanation. It initially calls insert over and |
| * over, but switches to the buffered algorithm after a certain number of |
| * tuples (unless buffering mode is disabled). |
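| * |
| * For example (hypothetical table and column names, but the 'buffering' |
| * reloption itself is real GiST syntax), a buffered build can be forced |
| * with: |
| * |
| * CREATE INDEX ON points_tbl USING gist (p) WITH (buffering = on); |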
| * |
| * |
| * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * IDENTIFICATION |
| * src/backend/access/gist/gistbuild.c |
| * |
| *------------------------------------------------------------------------- |
| */ |
| #include "postgres.h" |
| |
| #include <math.h> |
| |
| #include "access/genam.h" |
| #include "access/gist_private.h" |
| #include "access/gistxlog.h" |
| #include "access/tableam.h" |
| #include "access/xloginsert.h" |
| #include "catalog/index.h" |
| #include "miscadmin.h" |
| #include "optimizer/optimizer.h" |
| #include "storage/bufmgr.h" |
| #include "storage/smgr.h" |
| #include "utils/memutils.h" |
| #include "utils/rel.h" |
| #include "utils/tuplesort.h" |
| |
| extern XLogRecPtr LSNForEncryption(bool use_wal_lsn); |
| |
| /* Interval, in index tuples, between checks whether to switch to buffering build mode */ |
| #define BUFFERING_MODE_SWITCH_CHECK_STEP 256 |
| |
| /* |
| * Number of tuples to process in the slow way before switching to buffering |
| * mode, when buffering is explicitly turned on. Also, the number of tuples |
| * to process between readjusting the buffer size parameter, while in |
| * buffering mode. |
| */ |
| #define BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET 4096 |
| |
| /* |
| * Strategy used to build the index. It can change between the |
| * GIST_BUFFERING_* modes on the fly, but if the Sorted method is used, |
| * that needs to be decided up-front and cannot be changed afterwards. |
| */ |
| typedef enum |
| { |
| GIST_SORTED_BUILD, /* bottom-up build by sorting */ |
| GIST_BUFFERING_DISABLED, /* in regular build mode and aren't going to |
| * switch */ |
| GIST_BUFFERING_AUTO, /* in regular build mode, but will switch to |
| * buffering build mode if the index grows too |
| * big */ |
| GIST_BUFFERING_STATS, /* gathering statistics of index tuple size |
| * before switching to the buffering build |
| * mode */ |
| GIST_BUFFERING_ACTIVE /* in buffering build mode */ |
| } GistBuildMode; |
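| |
| /* |
| * A sketch of the possible transitions, as implemented by gistBuildCallback() |
| * and gistInitBuffering() below: GIST_BUFFERING_AUTO switches once the index |
| * outgrows effective_cache_size, and GIST_BUFFERING_STATS switches once |
| * enough tuples have been seen; in both cases gistInitBuffering() lands in |
| * GIST_BUFFERING_ACTIVE on success, or in GIST_BUFFERING_DISABLED if there |
| * is not enough memory for buffering. GIST_SORTED_BUILD is decided up-front |
| * and never changes. |
| */ |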
| |
| /* Working state for gistbuild and its callback */ |
| typedef struct |
| { |
| Relation indexrel; |
| Relation heaprel; |
| GISTSTATE *giststate; |
| |
| Size freespace; /* amount of free space to leave on pages */ |
| |
| GistBuildMode buildMode; |
| |
| int64 indtuples; /* number of tuples indexed */ |
| |
| /* |
| * Extra data structures used during a buffering build. 'gfbb' contains |
| * information related to managing the build buffers. 'parentMap' is a |
| * lookup table of the parent of each internal page. |
| */ |
| int64 indtuplesSize; /* total size of all indexed tuples */ |
| GISTBuildBuffers *gfbb; |
| HTAB *parentMap; |
| |
| /* |
| * Extra data structures used during a sorting build. |
| */ |
| Tuplesortstate *sortstate; /* state data for tuplesort.c */ |
| |
| BlockNumber pages_allocated; |
| BlockNumber pages_written; |
| |
| int ready_num_pages; |
| BlockNumber ready_blknos[XLR_MAX_BLOCK_ID]; |
| Page ready_pages[XLR_MAX_BLOCK_ID]; |
| } GISTBuildState; |
| |
| #define GIST_SORTED_BUILD_PAGE_NUM 4 |
| |
| /* |
| * In a sorted build, we use a stack of these structs, one for each level, |
| * to hold an in-memory buffer of the last pages at that level. |
| * |
| * A sorted GiST build requires the sort opclass to provide a good |
| * linearization of the data, which is not always possible for |
| * multidimensional data. To mitigate the resulting anomalies, we buffer |
| * index tuples and apply a picksplit function that can be |
| * multidimension-aware. |
| */ |
| typedef struct GistSortedBuildLevelState |
| { |
| int current_page; |
| BlockNumber last_blkno; |
| struct GistSortedBuildLevelState *parent; /* Upper level, if any */ |
| Page pages[GIST_SORTED_BUILD_PAGE_NUM]; |
| } GistSortedBuildLevelState; |
| |
| /* prototypes for private functions */ |
| |
| static void gistSortedBuildCallback(Relation index, ItemPointer tid, |
| Datum *values, bool *isnull, |
| bool tupleIsAlive, void *state); |
| static void gist_indexsortbuild(GISTBuildState *state); |
| static void gist_indexsortbuild_levelstate_add(GISTBuildState *state, |
| GistSortedBuildLevelState *levelstate, |
| IndexTuple itup); |
| static void gist_indexsortbuild_levelstate_flush(GISTBuildState *state, |
| GistSortedBuildLevelState *levelstate); |
| static void gist_indexsortbuild_flush_ready_pages(GISTBuildState *state); |
| |
| static void gistInitBuffering(GISTBuildState *buildstate); |
| static int calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep); |
| static void gistBuildCallback(Relation index, |
| ItemPointer tid, |
| Datum *values, |
| bool *isnull, |
| bool tupleIsAlive, |
| void *state); |
| static void gistBufferingBuildInsert(GISTBuildState *buildstate, |
| IndexTuple itup); |
| static bool gistProcessItup(GISTBuildState *buildstate, IndexTuple itup, |
| BlockNumber startblkno, int startlevel); |
| static BlockNumber gistbufferinginserttuples(GISTBuildState *buildstate, |
| Buffer buffer, int level, |
| IndexTuple *itup, int ntup, OffsetNumber oldoffnum, |
| BlockNumber parentblk, OffsetNumber downlinkoffnum); |
| static Buffer gistBufferingFindCorrectParent(GISTBuildState *buildstate, |
| BlockNumber childblkno, int level, |
| BlockNumber *parentblkno, |
| OffsetNumber *downlinkoffnum); |
| static void gistProcessEmptyingQueue(GISTBuildState *buildstate); |
| static void gistEmptyAllBuffers(GISTBuildState *buildstate); |
| static int gistGetMaxLevel(Relation index); |
| |
| static void gistInitParentMap(GISTBuildState *buildstate); |
| static void gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child, |
| BlockNumber parent); |
| static void gistMemorizeAllDownlinks(GISTBuildState *buildstate, |
| Buffer parentbuf); |
| static BlockNumber gistGetParent(GISTBuildState *buildstate, BlockNumber child); |
| |
| |
| /* |
| * Main entry point to GiST index build. |
| */ |
| IndexBuildResult * |
| gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) |
| { |
| IndexBuildResult *result; |
| double reltuples; |
| GISTBuildState buildstate; |
| MemoryContext oldcxt = CurrentMemoryContext; |
| int fillfactor; |
| Oid SortSupportFnOids[INDEX_MAX_KEYS]; |
| GiSTOptions *options = (GiSTOptions *) index->rd_options; |
| |
| /* |
| * We expect to be called exactly once for any index relation. If that's |
| * not the case, big trouble's what we have. |
| */ |
| if (RelationGetNumberOfBlocks(index) != 0) |
| elog(ERROR, "index \"%s\" already contains data", |
| RelationGetRelationName(index)); |
| |
| buildstate.indexrel = index; |
| buildstate.heaprel = heap; |
| buildstate.sortstate = NULL; |
| buildstate.giststate = initGISTstate(index); |
| |
| /* |
| * Create a temporary memory context that is reset once for each tuple |
| * processed. (Note: we don't bother to make this a child of the |
| * giststate's scanCxt, so we have to delete it separately at the end.) |
| */ |
| buildstate.giststate->tempCxt = createTempGistContext(); |
| |
| /* |
| * Choose build strategy. First check whether the user specified to use |
| * buffering mode. (The use case for that in the field is perhaps |
| * questionable, but it's important for testing purposes.) |
| */ |
| if (options) |
| { |
| if (options->buffering_mode == GIST_OPTION_BUFFERING_ON) |
| buildstate.buildMode = GIST_BUFFERING_STATS; |
| else if (options->buffering_mode == GIST_OPTION_BUFFERING_OFF) |
| buildstate.buildMode = GIST_BUFFERING_DISABLED; |
| else /* must be "auto" */ |
| buildstate.buildMode = GIST_BUFFERING_AUTO; |
| } |
| else |
| { |
| buildstate.buildMode = GIST_BUFFERING_AUTO; |
| } |
| |
| /* |
| * Unless buffering mode was forced, see if we can use sorting instead. |
| */ |
| if (buildstate.buildMode != GIST_BUFFERING_STATS) |
| { |
| bool hasallsortsupports = true; |
| int keyscount = IndexRelationGetNumberOfKeyAttributes(index); |
| |
| for (int i = 0; i < keyscount; i++) |
| { |
| SortSupportFnOids[i] = index_getprocid(index, i + 1, |
| GIST_SORTSUPPORT_PROC); |
| if (!OidIsValid(SortSupportFnOids[i])) |
| { |
| hasallsortsupports = false; |
| break; |
| } |
| } |
| if (hasallsortsupports) |
| buildstate.buildMode = GIST_SORTED_BUILD; |
| } |
| |
| /* |
| * Calculate target amount of free space to leave on pages. |
| */ |
| fillfactor = options ? options->fillfactor : GIST_DEFAULT_FILLFACTOR; |
| buildstate.freespace = BLCKSZ * (100 - fillfactor) / 100; |
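| |
| /* |
| * For example, with the default 8 kB block size and the default GiST |
| * fillfactor of 90, this leaves 8192 * 10 / 100 = 819 bytes unused on |
| * each page. |
| */ |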
| |
| /* |
| * Build the index using the chosen strategy. |
| */ |
| buildstate.indtuples = 0; |
| buildstate.indtuplesSize = 0; |
| |
| if (buildstate.buildMode == GIST_SORTED_BUILD) |
| { |
| /* |
| * Sort all data, build the index from bottom up. |
| */ |
| buildstate.sortstate = tuplesort_begin_index_gist(heap, |
| index, |
| maintenance_work_mem, |
| NULL, |
| TUPLESORT_NONE); |
| |
| /* Scan the table, adding all tuples to the tuplesort */ |
| reltuples = table_index_build_scan(heap, index, indexInfo, true, true, |
| gistSortedBuildCallback, |
| (void *) &buildstate, NULL); |
| |
| /* |
| * Perform the sort and build index pages. |
| */ |
| tuplesort_performsort(buildstate.sortstate); |
| |
| gist_indexsortbuild(&buildstate); |
| |
| tuplesort_end(buildstate.sortstate); |
| } |
| else |
| { |
| /* |
| * Initialize an empty index and insert all tuples, possibly using |
| * buffers on intermediate levels. |
| */ |
| Buffer buffer; |
| Page page; |
| |
| /* initialize the root page */ |
| buffer = gistNewBuffer(index, heap); |
| Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO); |
| page = BufferGetPage(buffer); |
| |
| START_CRIT_SECTION(); |
| |
| GISTInitBuffer(buffer, F_LEAF); |
| |
| MarkBufferDirty(buffer); |
| PageSetLSN(page, GistBuildLSN); |
| |
| UnlockReleaseBuffer(buffer); |
| |
| END_CRIT_SECTION(); |
| |
| /* Scan the table, inserting all the tuples into the index. */ |
| reltuples = table_index_build_scan(heap, index, indexInfo, true, true, |
| gistBuildCallback, |
| (void *) &buildstate, NULL); |
| |
| /* |
| * If buffering was used, flush out all the tuples that are still in |
| * the buffers. |
| */ |
| if (buildstate.buildMode == GIST_BUFFERING_ACTIVE) |
| { |
| elog(DEBUG1, "all tuples processed, emptying buffers"); |
| gistEmptyAllBuffers(&buildstate); |
| gistFreeBuildBuffers(buildstate.gfbb); |
| } |
| |
| /* |
| * We didn't write WAL records as we built the index, so if |
| * WAL-logging is required, write all pages to the WAL now. |
| */ |
| if (RelationNeedsWAL(index)) |
| { |
| log_newpage_range(index, MAIN_FORKNUM, |
| 0, RelationGetNumberOfBlocks(index), |
| true); |
| } |
| } |
| |
| /* okay, all heap tuples are indexed */ |
| MemoryContextSwitchTo(oldcxt); |
| MemoryContextDelete(buildstate.giststate->tempCxt); |
| |
| freeGISTstate(buildstate.giststate); |
| |
| /* |
| * Return statistics |
| */ |
| result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); |
| |
| result->heap_tuples = reltuples; |
| result->index_tuples = (double) buildstate.indtuples; |
| |
| return result; |
| } |
| |
| /*------------------------------------------------------------------------- |
| * Routines for sorted build |
| *------------------------------------------------------------------------- |
| */ |
| |
| /* |
| * Per-tuple callback for table_index_build_scan. |
| */ |
| static void |
| gistSortedBuildCallback(Relation index, |
| ItemPointer tid, |
| Datum *values, |
| bool *isnull, |
| bool tupleIsAlive, |
| void *state) |
| { |
| GISTBuildState *buildstate = (GISTBuildState *) state; |
| MemoryContext oldCtx; |
| Datum compressed_values[INDEX_MAX_KEYS]; |
| |
| oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt); |
| |
| /* Form an index tuple and point it at the heap tuple */ |
| gistCompressValues(buildstate->giststate, index, |
| values, isnull, |
| true, compressed_values); |
| |
| tuplesort_putindextuplevalues(buildstate->sortstate, |
| buildstate->indexrel, |
| tid, |
| compressed_values, isnull); |
| |
| MemoryContextSwitchTo(oldCtx); |
| MemoryContextReset(buildstate->giststate->tempCxt); |
| |
| /* Update tuple count. */ |
| buildstate->indtuples += 1; |
| } |
| |
| /* |
| * Build the GiST index bottom-up from pre-sorted tuples. |
| */ |
| static void |
| gist_indexsortbuild(GISTBuildState *state) |
| { |
| IndexTuple itup; |
| GistSortedBuildLevelState *levelstate; |
| Page page; |
| |
| state->pages_allocated = 0; |
| state->pages_written = 0; |
| state->ready_num_pages = 0; |
| |
| /* |
| * Write an empty page as a placeholder for the root page. It will be |
| * replaced with the real root page at the end. |
| */ |
| page = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, MCXT_ALLOC_ZERO); |
| smgrextend(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, GIST_ROOT_BLKNO, |
| page, true); |
| state->pages_allocated++; |
| state->pages_written++; |
| |
| /* Allocate a temporary buffer for the first leaf page batch. */ |
| levelstate = palloc0(sizeof(GistSortedBuildLevelState)); |
| levelstate->pages[0] = page; |
| levelstate->parent = NULL; |
| gistinitpage(page, F_LEAF); |
| |
| /* |
| * Fill index pages with tuples in the sorted order. |
| */ |
| while ((itup = tuplesort_getindextuple(state->sortstate, true)) != NULL) |
| { |
| gist_indexsortbuild_levelstate_add(state, levelstate, itup); |
| MemoryContextReset(state->giststate->tempCxt); |
| } |
| |
| /* |
| * Write out the partially full non-root pages. |
| * |
| * Keep in mind that a flush can build a new root: if a level ends up with |
| * more than one page, a new root page is required above it. |
| */ |
| while (levelstate->parent != NULL || levelstate->current_page != 0) |
| { |
| GistSortedBuildLevelState *parent; |
| |
| gist_indexsortbuild_levelstate_flush(state, levelstate); |
| parent = levelstate->parent; |
| for (int i = 0; i < GIST_SORTED_BUILD_PAGE_NUM; i++) |
| if (levelstate->pages[i]) |
| pfree(levelstate->pages[i]); |
| pfree(levelstate); |
| levelstate = parent; |
| } |
| |
| gist_indexsortbuild_flush_ready_pages(state); |
| |
| /* Write out the root */ |
| PageSetLSN(levelstate->pages[0], GistBuildLSN); |
| PageEncryptInplace(levelstate->pages[0], MAIN_FORKNUM, |
| GIST_ROOT_BLKNO); |
| PageSetChecksumInplace(levelstate->pages[0], GIST_ROOT_BLKNO); |
| smgrwrite(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, GIST_ROOT_BLKNO, |
| levelstate->pages[0], true); |
| if (RelationNeedsWAL(state->indexrel)) |
| log_newpage(&state->indexrel->rd_locator, MAIN_FORKNUM, GIST_ROOT_BLKNO, |
| levelstate->pages[0], true); |
| |
| pfree(levelstate->pages[0]); |
| pfree(levelstate); |
| |
| /* |
| * Even though we WAL-logged the index pages, we must still fsync the index files. |
| * Since we're building outside shared buffers, a CHECKPOINT occurring |
| * during the build has no way to flush the previously written data to |
| * disk (indeed it won't know the index even exists). A crash later on |
| * would replay WAL from the checkpoint, therefore it wouldn't replay our |
| * earlier WAL entries. If we do not fsync those pages here, they might |
| * still not be on disk when the crash occurs. |
| */ |
| if (RelationNeedsWAL(state->indexrel)) |
| smgrimmedsync(RelationGetSmgr(state->indexrel), MAIN_FORKNUM); |
| } |
| |
| /* |
| * Add a tuple to the current page. If the page is full, advance to the next |
| * page in the batch, flushing the whole batch out first if all of its pages |
| * are full. |
| */ |
| static void |
| gist_indexsortbuild_levelstate_add(GISTBuildState *state, |
| GistSortedBuildLevelState *levelstate, |
| IndexTuple itup) |
| { |
| Size sizeNeeded; |
| |
| /* Check if tuple can be added to the current page */ |
| sizeNeeded = IndexTupleSize(itup) + sizeof(ItemIdData); /* fillfactor ignored */ |
| if (PageGetFreeSpace(levelstate->pages[levelstate->current_page]) < sizeNeeded) |
| { |
| Page newPage; |
| Page old_page = levelstate->pages[levelstate->current_page]; |
| uint16 old_page_flags = GistPageGetOpaque(old_page)->flags; |
| |
| if (levelstate->current_page + 1 == GIST_SORTED_BUILD_PAGE_NUM) |
| { |
| gist_indexsortbuild_levelstate_flush(state, levelstate); |
| } |
| else |
| levelstate->current_page++; |
| |
| if (levelstate->pages[levelstate->current_page] == NULL) |
| levelstate->pages[levelstate->current_page] = |
| palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); |
| |
| newPage = levelstate->pages[levelstate->current_page]; |
| gistinitpage(newPage, old_page_flags); |
| } |
| |
| gistfillbuffer(levelstate->pages[levelstate->current_page], &itup, 1, InvalidOffsetNumber); |
| } |
| |
| static void |
| gist_indexsortbuild_levelstate_flush(GISTBuildState *state, |
| GistSortedBuildLevelState *levelstate) |
| { |
| GistSortedBuildLevelState *parent; |
| BlockNumber blkno; |
| MemoryContext oldCtx; |
| IndexTuple union_tuple; |
| SplitedPageLayout *dist; |
| IndexTuple *itvec; |
| int vect_len; |
| bool isleaf = GistPageIsLeaf(levelstate->pages[0]); |
| |
| CHECK_FOR_INTERRUPTS(); |
| |
| oldCtx = MemoryContextSwitchTo(state->giststate->tempCxt); |
| |
| /* Get index tuples from first page */ |
| itvec = gistextractpage(levelstate->pages[0], &vect_len); |
| if (levelstate->current_page > 0) |
| { |
| /* Append tuples from each page */ |
| for (int i = 1; i < levelstate->current_page + 1; i++) |
| { |
| int len_local; |
| IndexTuple *itvec_local = gistextractpage(levelstate->pages[i], &len_local); |
| |
| itvec = gistjoinvector(itvec, &vect_len, itvec_local, len_local); |
| pfree(itvec_local); |
| } |
| |
| /* Apply picksplit to list of all collected tuples */ |
| dist = gistSplit(state->indexrel, levelstate->pages[0], itvec, vect_len, state->giststate); |
| } |
| else |
| { |
| /* Create a split layout from a single page */ |
| dist = (SplitedPageLayout *) palloc0(sizeof(SplitedPageLayout)); |
| union_tuple = gistunion(state->indexrel, itvec, vect_len, |
| state->giststate); |
| dist->itup = union_tuple; |
| dist->list = gistfillitupvec(itvec, vect_len, &(dist->lenlist)); |
| dist->block.num = vect_len; |
| } |
| |
| MemoryContextSwitchTo(oldCtx); |
| |
| /* Reset page counter */ |
| levelstate->current_page = 0; |
| |
| /* Create pages for all partitions in split result */ |
| for (; dist != NULL; dist = dist->next) |
| { |
| char *data; |
| Page target; |
| |
| /* check once per page */ |
| CHECK_FOR_INTERRUPTS(); |
| |
| /* Create page and copy data */ |
| data = (char *) (dist->list); |
| target = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, MCXT_ALLOC_ZERO); |
| gistinitpage(target, isleaf ? F_LEAF : 0); |
| for (int i = 0; i < dist->block.num; i++) |
| { |
| IndexTuple thistup = (IndexTuple) data; |
| |
| if (PageAddItem(target, (Item) data, IndexTupleSize(thistup), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber) |
| elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(state->indexrel)); |
| |
| data += IndexTupleSize(thistup); |
| } |
| union_tuple = dist->itup; |
| |
| if (state->ready_num_pages == XLR_MAX_BLOCK_ID) |
| gist_indexsortbuild_flush_ready_pages(state); |
| |
| /* |
| * The page is now complete. Assign a block number to it, and add it |
| * to the list of finished pages. (We don't write it out immediately, |
| * because we want to WAL-log the pages in batches.) |
| */ |
| blkno = state->pages_allocated++; |
| state->ready_blknos[state->ready_num_pages] = blkno; |
| state->ready_pages[state->ready_num_pages] = target; |
| state->ready_num_pages++; |
| ItemPointerSetBlockNumber(&(union_tuple->t_tid), blkno); |
| |
| /* |
| * Set the right link to point to the previous page. This is just for |
| * debugging purposes: GiST only follows the right link if a page is |
| * split concurrently to a scan, and that cannot happen during index |
| * build. |
| * |
| * It's a bit counterintuitive that we set the right link on the new |
| * page to point to the previous page, not the other way around. But |
| * GiST pages are not ordered like B-tree pages are, so as long as the |
| * right-links form a chain through all the pages at the same level, |
| * the order doesn't matter. |
| */ |
| if (levelstate->last_blkno) |
| GistPageGetOpaque(target)->rightlink = levelstate->last_blkno; |
| levelstate->last_blkno = blkno; |
| |
| /* |
| * Insert the downlink into the parent page. If this was the root, |
| * create a new page as the parent, which becomes the new root. |
| */ |
| parent = levelstate->parent; |
| if (parent == NULL) |
| { |
| parent = palloc0(sizeof(GistSortedBuildLevelState)); |
| parent->pages[0] = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); |
| parent->parent = NULL; |
| gistinitpage(parent->pages[0], 0); |
| |
| levelstate->parent = parent; |
| } |
| gist_indexsortbuild_levelstate_add(state, parent, union_tuple); |
| } |
| } |
| |
| static void |
| gist_indexsortbuild_flush_ready_pages(GISTBuildState *state) |
| { |
| if (state->ready_num_pages == 0) |
| return; |
| |
| for (int i = 0; i < state->ready_num_pages; i++) |
| { |
| Page page = state->ready_pages[i]; |
| BlockNumber blkno = state->ready_blknos[i]; |
| |
| /* Currently, the blocks must be buffered in order. */ |
| if (blkno != state->pages_written) |
| elog(ERROR, "unexpected block number to flush GiST sorting build"); |
| |
| PageSetLSN(page, !FileEncryptionEnabled ? GistBuildLSN : |
| LSNForEncryption(RelationIsPermanent(state->indexrel))); |
| /* Make sure the LSN is valid and, if encryption is enabled, not constant. */ |
| Assert(!XLogRecPtrIsInvalid(PageGetLSN(page)) && |
| (!FileEncryptionEnabled || |
| PageGetLSN(page) != GistBuildLSN)); |
| PageEncryptInplace(page, MAIN_FORKNUM, |
| blkno); |
| PageSetChecksumInplace(page, blkno); |
| smgrextend(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, blkno, page, |
| true); |
| |
| state->pages_written++; |
| } |
| |
| if (RelationNeedsWAL(state->indexrel)) |
| log_newpages(&state->indexrel->rd_locator, MAIN_FORKNUM, state->ready_num_pages, |
| state->ready_blknos, state->ready_pages, true); |
| |
| for (int i = 0; i < state->ready_num_pages; i++) |
| pfree(state->ready_pages[i]); |
| |
| state->ready_num_pages = 0; |
| } |
| |
| |
| /*------------------------------------------------------------------------- |
| * Routines for non-sorted build |
| *------------------------------------------------------------------------- |
| */ |
| |
| /* |
| * Attempt to switch to buffering mode. |
| * |
| * If there is not enough memory for a buffering build, sets buildMode to |
| * GIST_BUFFERING_DISABLED, so that we don't bother to attempt the switch |
| * again. Otherwise initializes the build buffers, and sets buildMode to |
| * GIST_BUFFERING_ACTIVE. |
| */ |
| static void |
| gistInitBuffering(GISTBuildState *buildstate) |
| { |
| Relation index = buildstate->indexrel; |
| int pagesPerBuffer; |
| Size pageFreeSpace; |
| Size itupAvgSize, |
| itupMinSize; |
| double avgIndexTuplesPerPage, |
| maxIndexTuplesPerPage; |
| int i; |
| int levelStep; |
| |
| /* Calculate the space on an index page that is available for index tuples */ |
| pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData) |
| - sizeof(ItemIdData) |
| - buildstate->freespace; |
| |
| /* |
| * Calculate average size of already inserted index tuples using gathered |
| * statistics. |
| */ |
| itupAvgSize = (double) buildstate->indtuplesSize / |
| (double) buildstate->indtuples; |
| |
| /* |
| * Calculate the minimal possible size of an index tuple from the index |
| * metadata, taking the minimal possible size of a varlena to be VARHDRSZ. |
| * |
| * XXX: that's not actually true, as a short varlena can be just 2 bytes. |
| * And we should take padding into account here. |
| */ |
| itupMinSize = (Size) MAXALIGN(sizeof(IndexTupleData)); |
| for (i = 0; i < index->rd_att->natts; i++) |
| { |
| if (TupleDescAttr(index->rd_att, i)->attlen < 0) |
| itupMinSize += VARHDRSZ; |
| else |
| itupMinSize += TupleDescAttr(index->rd_att, i)->attlen; |
| } |
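| |
| /* |
| * For example, for a single-column index over a fixed-width 16-byte type, |
| * this yields MAXALIGN(sizeof(IndexTupleData)) + 16 = 8 + 16 = 24 bytes |
| * on typical 64-bit platforms. |
| */ |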
| |
| /* Calculate the average and maximal number of index tuples that fit on a page */ |
| avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize; |
| maxIndexTuplesPerPage = pageFreeSpace / itupMinSize; |
| |
| /* |
| * We need to calculate two parameters for the buffering algorithm: |
| * levelStep and pagesPerBuffer. |
| * |
| * levelStep determines the size of the subtree that we operate on while |
| * emptying a buffer. A higher value is better, as you need fewer buffer |
| * emptying steps to build the index. However, if you set it too high, the |
| * subtree doesn't fit in cache anymore, and you quickly lose the benefit |
| * of the buffers. |
| * |
| * In Arge et al's paper, levelStep is chosen as logB(M/4B), where B is |
| * the number of tuples per page (i.e. fanout), and M is the amount of |
| * internal memory available. Curiously, they don't explain *why* that |
| * setting is optimal. We calculate it by taking the highest levelStep so |
| * that a subtree still fits in cache. For a small B, our way of |
| * calculating levelStep is very close to Arge et al's formula. For a |
| * large B, our formula gives a value that is 2x higher. |
| * |
| * The average size (in pages) of a subtree of depth n can be calculated |
| * as a geometric series: |
| * |
| * B^0 + B^1 + B^2 + ... + B^n = (1 - B^(n + 1)) / (1 - B) |
| * |
| * where B is the average number of index tuples on page. The subtree is |
| * cached in the shared buffer cache and the OS cache, so we choose |
| * levelStep so that the subtree size is comfortably smaller than |
| * effective_cache_size, with a safety factor of 4. |
| * |
| * The estimate of the average number of index tuples per page is based on |
| * the average tuple sizes observed before switching to the buffered build, |
| * so the real subtree size can be somewhat larger. Also, it would be |
| * selfish to gobble up the whole cache for our index build. The safety |
| * factor of 4 should account for those effects. |
| * |
| * The other limiting factor for setting levelStep is that while |
| * processing a subtree, we need to hold one page for each buffer at the |
| * next lower buffered level. The max. number of buffers needed for that |
| * is maxIndexTuplesPerPage^levelStep. This is very conservative, but |
| * hopefully maintenance_work_mem is set high enough that you're |
| * constrained by effective_cache_size rather than maintenance_work_mem. |
| * |
| * XXX: the buffer hash table consumes a fair amount of memory too per |
| * buffer, but that is not currently taken into account. That scales with |
| * the total number of buffers used, i.e. with the index size and levelStep. |
| * Note that a higher levelStep *reduces* the amount of memory needed for |
| * the hash table. |
| */ |
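| |
| /* |
| * As a worked example (assuming, for simplicity, that the maximal fanout |
| * equals the average fanout of 100 tuples per page): levelStep = 2 implies |
| * subtrees of (1 - 100^3) / (1 - 100) = 10101 pages, or about 79 MB with |
| * 8 kB blocks, so that levelStep is acceptable only if |
| * effective_cache_size / 4 exceeds 10101 pages and maintenance_work_mem |
| * can hold 100^2 = 10000 lowest-level pages. |
| */ |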
| levelStep = 1; |
| for (;;) |
| { |
| double subtreesize; |
| double maxlowestlevelpages; |
| |
| /* size of an average subtree at this levelStep (in pages). */ |
| subtreesize = |
| (1 - pow(avgIndexTuplesPerPage, (double) (levelStep + 1))) / |
| (1 - avgIndexTuplesPerPage); |
| |
| /* max number of pages at the lowest level of a subtree */ |
| maxlowestlevelpages = pow(maxIndexTuplesPerPage, (double) levelStep); |
| |
| /* subtree must fit in cache (with safety factor of 4) */ |
| if (subtreesize > effective_cache_size / 4) |
| break; |
| |
| /* each node in the lowest level of a subtree has one page in memory */ |
| if (maxlowestlevelpages > ((double) maintenance_work_mem * 1024) / BLCKSZ) |
| break; |
| |
| /* Good, we can handle this levelStep. See if we can go one higher. */ |
| levelStep++; |
| } |
| |
| /* |
| * The loop above exited on the first unacceptable value of levelStep, so |
| * back off by one to get the last acceptable value. |
| */ |
| levelStep--; |
| |
| /* |
| * If there's not enough cache or maintenance_work_mem, fall back to plain |
| * inserts. |
| */ |
| if (levelStep <= 0) |
| { |
| elog(DEBUG1, "failed to switch to buffered GiST build"); |
| buildstate->buildMode = GIST_BUFFERING_DISABLED; |
| return; |
| } |
| |
| /* |
| * The second parameter to set is pagesPerBuffer, which determines the |
| * size of each buffer. We adjust pagesPerBuffer also during the build, |
| * which is why this calculation is in a separate function. |
| */ |
| pagesPerBuffer = calculatePagesPerBuffer(buildstate, levelStep); |
| |
| /* Initialize GISTBuildBuffers with these parameters */ |
| buildstate->gfbb = gistInitBuildBuffers(pagesPerBuffer, levelStep, |
| gistGetMaxLevel(index)); |
| |
| gistInitParentMap(buildstate); |
| |
| buildstate->buildMode = GIST_BUFFERING_ACTIVE; |
| |
| elog(DEBUG1, "switched to buffered GiST build; level step = %d, pagesPerBuffer = %d", |
| levelStep, pagesPerBuffer); |
| } |
| |
| /* |
| * Calculate pagesPerBuffer parameter for the buffering algorithm. |
| * |
| * The buffer size is chosen so that, assuming tuples are distributed |
| * randomly, emptying half a buffer fills on average one page in every buffer |
| * at the next lower level. |
| */ |
| static int |
| calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep) |
| { |
| double pagesPerBuffer; |
| double avgIndexTuplesPerPage; |
| double itupAvgSize; |
| Size pageFreeSpace; |
| |
| /* Calculate the space on an index page that is available for index tuples */ |
| pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData) |
| - sizeof(ItemIdData) |
| - buildstate->freespace; |
| |
| /* |
| * Calculate average size of already inserted index tuples using gathered |
| * statistics. |
| */ |
| itupAvgSize = (double) buildstate->indtuplesSize / |
| (double) buildstate->indtuples; |
| |
| avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize; |
| |
| /* |
| * Recalculate the required buffer size. |
| */ |
| pagesPerBuffer = 2 * pow(avgIndexTuplesPerPage, levelStep); |
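| |
| /* |
| * For example, with an average fanout of 100 tuples per page and |
| * levelStep = 1, each buffer is sized at 2 * 100^1 = 200 pages; emptying |
| * half of it spreads roughly 100 pages' worth of tuples over the ~100 |
| * child buffers below, one page each on average. |
| */ |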
| |
| return (int) rint(pagesPerBuffer); |
| } |
| |
| /* |
| * Per-tuple callback for table_index_build_scan. |
| */ |
| static void |
| gistBuildCallback(Relation index, |
| ItemPointer tid, |
| Datum *values, |
| bool *isnull, |
| bool tupleIsAlive, |
| void *state) |
| { |
| GISTBuildState *buildstate = (GISTBuildState *) state; |
| IndexTuple itup; |
| MemoryContext oldCtx; |
| |
| oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt); |
| |
| /* form an index tuple and point it at the heap tuple */ |
| itup = gistFormTuple(buildstate->giststate, index, |
| values, isnull, |
| true); |
| itup->t_tid = *tid; |
| |
| /* Update tuple count and total size. */ |
| buildstate->indtuples += 1; |
| buildstate->indtuplesSize += IndexTupleSize(itup); |
| |
| /* |
| * XXX In buffering builds, the tempCxt is also reset down inside |
| * gistProcessEmptyingQueue(). This is not great because it risks |
| * confusion and possible use of dangling pointers (for example, itup |
| * might be already freed when control returns here). It's generally |
| * better that a memory context be "owned" by only one function. However, |
| * currently this isn't causing issues so it doesn't seem worth the amount |
| * of refactoring that would be needed to avoid it. |
| */ |
| if (buildstate->buildMode == GIST_BUFFERING_ACTIVE) |
| { |
| /* We have buffers, so use them. */ |
| gistBufferingBuildInsert(buildstate, itup); |
| } |
| else |
| { |
| /* |
| * There are no buffers (yet). Since we already have the index relation |
| * locked, we call gistdoinsert directly. |
| */ |
| gistdoinsert(index, itup, buildstate->freespace, |
| buildstate->giststate, buildstate->heaprel, true); |
| } |
| |
| MemoryContextSwitchTo(oldCtx); |
| MemoryContextReset(buildstate->giststate->tempCxt); |
| |
| if (buildstate->buildMode == GIST_BUFFERING_ACTIVE && |
| buildstate->indtuples % BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET == 0) |
| { |
| /* Adjust the target buffer size now */ |
| buildstate->gfbb->pagesPerBuffer = |
| calculatePagesPerBuffer(buildstate, buildstate->gfbb->levelStep); |
| } |
| |
| /* |
| * In 'auto' mode, check if the index has grown too large to fit in cache, |
| * and switch to buffering mode if it has. |
| * |
| * To avoid excessive calls to smgrnblocks(), only check this every |
| * BUFFERING_MODE_SWITCH_CHECK_STEP index tuples. |
| * |
| * In 'stats' state, switch as soon as we have seen enough tuples to have |
| * some idea of the average tuple size. |
| */ |
| if ((buildstate->buildMode == GIST_BUFFERING_AUTO && |
| buildstate->indtuples % BUFFERING_MODE_SWITCH_CHECK_STEP == 0 && |
| effective_cache_size < smgrnblocks(RelationGetSmgr(index), |
| MAIN_FORKNUM)) || |
| (buildstate->buildMode == GIST_BUFFERING_STATS && |
| buildstate->indtuples >= BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET)) |
| { |
| /* |
| * The index no longer fits in the effective cache, or we have gathered |
| * enough statistics. Try to switch to buffering build mode. |
| */ |
| gistInitBuffering(buildstate); |
| } |
| } |
| |
| /* |
| * Insert function for buffering index build. |
| */ |
| static void |
| gistBufferingBuildInsert(GISTBuildState *buildstate, IndexTuple itup) |
| { |
| /* Insert the tuple to buffers. */ |
| gistProcessItup(buildstate, itup, 0, buildstate->gfbb->rootlevel); |
| |
| /* If we filled up (half of a) buffer, process buffer emptying. */ |
| gistProcessEmptyingQueue(buildstate); |
| } |
| |
| /* |
| * Process an index tuple. Runs the tuple down the tree until we reach a leaf |
| * page or node buffer, and inserts the tuple there. Returns true if we have |
| * to stop the buffer emptying process (because one of the child buffers |
| * can't accept any more index tuples). |
| */ |
| static bool |
| gistProcessItup(GISTBuildState *buildstate, IndexTuple itup, |
| BlockNumber startblkno, int startlevel) |
| { |
| GISTSTATE *giststate = buildstate->giststate; |
| GISTBuildBuffers *gfbb = buildstate->gfbb; |
| Relation indexrel = buildstate->indexrel; |
| BlockNumber childblkno; |
| Buffer buffer; |
| bool result = false; |
| BlockNumber blkno; |
| int level; |
| OffsetNumber downlinkoffnum = InvalidOffsetNumber; |
| BlockNumber parentblkno = InvalidBlockNumber; |
| |
| CHECK_FOR_INTERRUPTS(); |
| |
| /* |
| * Loop until we reach a leaf page (level == 0) or a level with buffers |
| * (not including the level we start at, because we would otherwise make |
| * no progress). |
| */ |
| blkno = startblkno; |
| level = startlevel; |
| for (;;) |
| { |
| ItemId iid; |
| IndexTuple idxtuple, |
| newtup; |
| Page page; |
| OffsetNumber childoffnum; |
| |
| /* Have we reached a level with buffers? */ |
| if (LEVEL_HAS_BUFFERS(level, gfbb) && level != startlevel) |
| break; |
| |
| /* Have we reached a leaf page? */ |
| if (level == 0) |
| break; |
| |
| /* |
| * Nope. Descend down to the next level then. Choose a child to |
| * descend down to. |
| */ |
| |
| buffer = ReadBuffer(indexrel, blkno); |
| LockBuffer(buffer, GIST_EXCLUSIVE); |
| |
| page = (Page) BufferGetPage(buffer); |
| childoffnum = gistchoose(indexrel, page, itup, giststate); |
| iid = PageGetItemId(page, childoffnum); |
| idxtuple = (IndexTuple) PageGetItem(page, iid); |
| childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); |
| |
| if (level > 1) |
| gistMemorizeParent(buildstate, childblkno, blkno); |
| |
| /* |
| * Check that the key representing the target child node is consistent |
| * with the key we're inserting. Update it if it's not. |
| */ |
| newtup = gistgetadjusted(indexrel, idxtuple, itup, giststate); |
| if (newtup) |
| { |
| blkno = gistbufferinginserttuples(buildstate, |
| buffer, |
| level, |
| &newtup, |
| 1, |
| childoffnum, |
| InvalidBlockNumber, |
| InvalidOffsetNumber); |
| /* gistbufferinginserttuples() released the buffer */ |
| } |
| else |
| UnlockReleaseBuffer(buffer); |
| |
| /* Descend to the child */ |
| parentblkno = blkno; |
| blkno = childblkno; |
| downlinkoffnum = childoffnum; |
| Assert(level > 0); |
| level--; |
| } |
| |
| if (LEVEL_HAS_BUFFERS(level, gfbb)) |
| { |
| /* |
| * We've reached a level with buffers. Place the index tuple into the |
| * buffer, and add the buffer to the emptying queue if it overflows. |
| */ |
| GISTNodeBuffer *childNodeBuffer; |
| |
| /* Find the buffer or create a new one */ |
| childNodeBuffer = gistGetNodeBuffer(gfbb, giststate, blkno, level); |
| |
| /* Add index tuple to it */ |
| gistPushItupToNodeBuffer(gfbb, childNodeBuffer, itup); |
| |
| if (BUFFER_OVERFLOWED(childNodeBuffer, gfbb)) |
| result = true; |
| } |
| else |
| { |
| /* |
| * We've reached a leaf page. Place the tuple here. |
| */ |
| Assert(level == 0); |
| buffer = ReadBuffer(indexrel, blkno); |
| LockBuffer(buffer, GIST_EXCLUSIVE); |
| gistbufferinginserttuples(buildstate, buffer, level, |
| &itup, 1, InvalidOffsetNumber, |
| parentblkno, downlinkoffnum); |
| /* gistbufferinginserttuples() released the buffer */ |
| } |
| |
| return result; |
| } |
| |
| /* |
| * Insert tuples to a given page. |
| * |
| * This is analogous to gistinserttuples() in the regular insertion code. |
| * |
| * Returns the block number of the page where the (first) new or updated tuple |
| * was inserted. Usually that's the original page, but might be a sibling page |
| * if the original page was split. |
| * |
| * Caller should hold a lock on 'buffer' on entry. This function will unlock |
| * and unpin it. |
| */ |
| static BlockNumber |
| gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer, int level, |
| IndexTuple *itup, int ntup, OffsetNumber oldoffnum, |
| BlockNumber parentblk, OffsetNumber downlinkoffnum) |
| { |
| GISTBuildBuffers *gfbb = buildstate->gfbb; |
| List *splitinfo; |
| bool is_split; |
| BlockNumber placed_to_blk = InvalidBlockNumber; |
| |
| is_split = gistplacetopage(buildstate->indexrel, |
| buildstate->freespace, |
| buildstate->giststate, |
| buffer, |
| itup, ntup, oldoffnum, &placed_to_blk, |
| InvalidBuffer, |
| &splitinfo, |
| false, |
| buildstate->heaprel, true); |
| |
| /* |
| * If this is a root split, update the root path item kept in memory. This |
| * ensures that all path stacks are always complete, including all parent |
| * nodes up to the root. That simplifies the algorithm for re-finding the |
| * correct parent. |
| */ |
| if (is_split && BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO) |
| { |
| Page page = BufferGetPage(buffer); |
| OffsetNumber off; |
| OffsetNumber maxoff; |
| |
| Assert(level == gfbb->rootlevel); |
| gfbb->rootlevel++; |
| |
| elog(DEBUG2, "splitting GiST root page, now %d levels deep", gfbb->rootlevel); |
| |
| /* |
| * All the downlinks on the old root page are now on one of the child |
| * pages. Visit all the new child pages to memorize the parents of the |
| * grandchildren. |
| */ |
| if (gfbb->rootlevel > 1) |
| { |
| maxoff = PageGetMaxOffsetNumber(page); |
| for (off = FirstOffsetNumber; off <= maxoff; off++) |
| { |
| ItemId iid = PageGetItemId(page, off); |
| IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid); |
| BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); |
| Buffer childbuf = ReadBuffer(buildstate->indexrel, childblkno); |
| |
| LockBuffer(childbuf, GIST_SHARE); |
| gistMemorizeAllDownlinks(buildstate, childbuf); |
| UnlockReleaseBuffer(childbuf); |
| |
| /* |
| * Also remember that the parent of the new child page is the |
| * root block. |
| */ |
| gistMemorizeParent(buildstate, childblkno, GIST_ROOT_BLKNO); |
| } |
| } |
| } |
| |
| if (splitinfo) |
| { |
| /* |
| * Insert the downlinks into the parent. This is analogous to |
| * gistfinishsplit() in the regular insertion code, but the locking is |
| * simpler, and we have to maintain the buffers on internal nodes and |
| * the parent map. |
| */ |
| IndexTuple *downlinks; |
| int ndownlinks, |
| i; |
| Buffer parentBuffer; |
| ListCell *lc; |
| |
| /* Parent may have changed since we memorized this path. */ |
| parentBuffer = |
| gistBufferingFindCorrectParent(buildstate, |
| BufferGetBlockNumber(buffer), |
| level, |
| &parentblk, |
| &downlinkoffnum); |
| |
| /* |
| * If there's a buffer associated with this page, that needs to be |
| * split too. gistRelocateBuildBuffersOnSplit() will also adjust the |
| * downlinks in 'splitinfo', to make sure they're consistent not only |
| * with the tuples already on the pages, but also the tuples in the |
| * buffers that will eventually be inserted to them. |
| */ |
| gistRelocateBuildBuffersOnSplit(gfbb, |
| buildstate->giststate, |
| buildstate->indexrel, |
| level, |
| buffer, splitinfo); |
| |
| /* Create an array of all the downlink tuples */ |
| ndownlinks = list_length(splitinfo); |
| downlinks = (IndexTuple *) palloc(sizeof(IndexTuple) * ndownlinks); |
| i = 0; |
| foreach(lc, splitinfo) |
| { |
| GISTPageSplitInfo *splitinfo = lfirst(lc); |
| |
| /* |
| * Remember the parent of each new child page in our parent map. |
| * This assumes that the downlinks fit on the parent page. If the |
| * parent page is split, too, when we recurse up to insert the |
| * downlinks, the recursive gistbufferinginserttuples() call will |
| * update the map again. |
| */ |
| if (level > 0) |
| gistMemorizeParent(buildstate, |
| BufferGetBlockNumber(splitinfo->buf), |
| BufferGetBlockNumber(parentBuffer)); |
| |
| /* |
| * Also update the parent map for all the downlinks that got moved |
| * to a different page. (actually this also loops through the |
| * downlinks that stayed on the original page, but it does no |
| * harm). |
| */ |
| if (level > 1) |
| gistMemorizeAllDownlinks(buildstate, splitinfo->buf); |
| |
| /* |
| * Since there's no concurrent access, we can release the lower |
| * level buffers immediately. This includes the original page. |
| */ |
| UnlockReleaseBuffer(splitinfo->buf); |
| downlinks[i++] = splitinfo->downlink; |
| } |
| |
| /* Insert them into parent. */ |
| gistbufferinginserttuples(buildstate, parentBuffer, level + 1, |
| downlinks, ndownlinks, downlinkoffnum, |
| InvalidBlockNumber, InvalidOffsetNumber); |
| |
| list_free_deep(splitinfo); /* we don't need this anymore */ |
| } |
| else |
| UnlockReleaseBuffer(buffer); |
| |
| return placed_to_blk; |
| } |
| |
| /* |
| * Find the downlink pointing to a child page. |
| * |
| * 'childblkno' indicates the child page to find the parent for. 'level' is |
| * the level of the child. On entry, *parentblkno and *downlinkoffnum can |
| * point to a location where the downlink used to be - we will check that |
| * location first, and save some cycles if it hasn't moved. The function |
| * returns a buffer containing the downlink, exclusively-locked, and |
| * *parentblkno and *downlinkoffnum are set to the real location of the |
| * downlink. |
| * |
| * If the child page is a leaf (level == 0), the caller must supply a correct |
| * parentblkno. Otherwise we use the parent map hash table to find the parent |
| * block. |
| * |
| * This function serves the same purpose as gistFindCorrectParent() during |
| * normal index inserts, but this is simpler because we don't need to deal |
| * with concurrent inserts. |
| */ |
| static Buffer |
| gistBufferingFindCorrectParent(GISTBuildState *buildstate, |
| BlockNumber childblkno, int level, |
| BlockNumber *parentblkno, |
| OffsetNumber *downlinkoffnum) |
| { |
| BlockNumber parent; |
| Buffer buffer; |
| Page page; |
| OffsetNumber maxoff; |
| OffsetNumber off; |
| |
| if (level > 0) |
| parent = gistGetParent(buildstate, childblkno); |
| else |
| { |
| /* |
| * For a leaf page, the caller must supply a correct parent block |
| * number. |
| */ |
| if (*parentblkno == InvalidBlockNumber) |
| elog(ERROR, "no parent buffer provided of child %u", childblkno); |
| parent = *parentblkno; |
| } |
| |
| buffer = ReadBuffer(buildstate->indexrel, parent); |
| page = BufferGetPage(buffer); |
| LockBuffer(buffer, GIST_EXCLUSIVE); |
| gistcheckpage(buildstate->indexrel, buffer); |
| maxoff = PageGetMaxOffsetNumber(page); |
| |
| /* Check whether the downlink is still where it used to be */ |
| if (parent == *parentblkno && *parentblkno != InvalidBlockNumber && |
| *downlinkoffnum != InvalidOffsetNumber && *downlinkoffnum <= maxoff) |
| { |
| ItemId iid = PageGetItemId(page, *downlinkoffnum); |
| IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid); |
| |
| if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno) |
| { |
| /* Still there */ |
| return buffer; |
| } |
| } |
| |
| /* |
| * Downlink was not at the offset where it used to be. Scan the page to |
| * find it. During normal gist insertions, it might've moved to another |
| * page, to the right, but during a buffering build, we keep track of the |
| * parent of each page in the lookup table so we should always know what |
| * page it's on. |
| */ |
| for (off = FirstOffsetNumber; off <= maxoff; off = OffsetNumberNext(off)) |
| { |
| ItemId iid = PageGetItemId(page, off); |
| IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid); |
| |
| if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno) |
| { |
| /* yes!!, found it */ |
| *downlinkoffnum = off; |
| return buffer; |
| } |
| } |
| |
| elog(ERROR, "failed to re-find parent for block %u", childblkno); |
| return InvalidBuffer; /* keep compiler quiet */ |
| } |
| |
| /* |
| * Process the buffer emptying queue. Emptying one buffer can cause emptying |
| * of other buffers. This function iterates until this cascading emptying |
| * process has finished, i.e. until the buffer emptying queue is empty. |
| */ |
| static void |
| gistProcessEmptyingQueue(GISTBuildState *buildstate) |
| { |
| GISTBuildBuffers *gfbb = buildstate->gfbb; |
| |
| /* Iterate while there are elements in the buffer emptying queue. */ |
| while (gfbb->bufferEmptyingQueue != NIL) |
| { |
| GISTNodeBuffer *emptyingNodeBuffer; |
| |
| /* Get the next node buffer from the emptying queue. */ |
| emptyingNodeBuffer = (GISTNodeBuffer *) linitial(gfbb->bufferEmptyingQueue); |
| gfbb->bufferEmptyingQueue = list_delete_first(gfbb->bufferEmptyingQueue); |
| emptyingNodeBuffer->queuedForEmptying = false; |
| |
| /* |
| * We are going to load the last pages of the buffers that the emptying |
| * will write to, so unload any previously loaded buffers first. |
| */ |
| gistUnloadNodeBuffers(gfbb); |
| |
| /* |
| * Pop tuples from the buffer and run them down to the buffers at |
| * lower level, or leaf pages. We continue until one of the lower |
| * level buffers fills up, or this buffer runs empty. |
| * |
| * In Arge et al's paper, the buffer emptying is stopped after |
| * processing 1/2 node buffer worth of tuples, to avoid overfilling |
| * any of the lower level buffers. However, it's more efficient to |
| * keep going until one of the lower level buffers actually fills up, |
| * so that's what we do. This doesn't need to be exact, if a buffer |
| * overfills by a few tuples, there's no harm done. |
| */ |
| while (true) |
| { |
| IndexTuple itup; |
| |
| /* Get next index tuple from the buffer */ |
| if (!gistPopItupFromNodeBuffer(gfbb, emptyingNodeBuffer, &itup)) |
| break; |
| |
| /* |
| * Run it down to the underlying node buffer or leaf page. |
| * |
| * Note: it's possible that the buffer we're emptying splits as a |
| * result of this call. If that happens, our emptyingNodeBuffer |
| * points to the left half of the split. After split, it's very |
| * likely that the new left buffer is no longer over the half-full |
| * threshold, but we might as well keep flushing tuples from it |
| * until we fill a lower-level buffer. |
| */ |
| if (gistProcessItup(buildstate, itup, emptyingNodeBuffer->nodeBlocknum, emptyingNodeBuffer->level)) |
| { |
| /* |
| * A lower level buffer filled up. Stop emptying this buffer, |
| * to avoid overflowing the lower level buffer. |
| */ |
| break; |
| } |
| |
| /* Free all the memory allocated during index tuple processing */ |
| MemoryContextReset(buildstate->giststate->tempCxt); |
| } |
| } |
| } |
| |
| /* |
| * Empty all node buffers, from top to bottom. This is done at the end of |
| * index build to flush all remaining tuples to the index. |
| * |
| * Note: this destroys the buffersOnLevels lists, so no more tuples should |
| * be inserted into the buffers after this call. |
| */ |
| static void |
| gistEmptyAllBuffers(GISTBuildState *buildstate) |
| { |
| GISTBuildBuffers *gfbb = buildstate->gfbb; |
| MemoryContext oldCtx; |
| int i; |
| |
| oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt); |
| |
| /* |
| * Iterate through the levels from top to bottom. |
| */ |
| for (i = gfbb->buffersOnLevelsLen - 1; i >= 0; i--) |
| { |
| /* |
| * Empty all buffers on this level. Note that new buffers can pop up |
| * in the list during the processing, as a result of page splits, so a |
| * simple walk through the list won't work. We remove buffers from the |
| * list when we see them empty; a buffer can't become non-empty once |
| * it's been fully emptied. |
| */ |
| while (gfbb->buffersOnLevels[i] != NIL) |
| { |
| GISTNodeBuffer *nodeBuffer; |
| |
| nodeBuffer = (GISTNodeBuffer *) linitial(gfbb->buffersOnLevels[i]); |
| |
| if (nodeBuffer->blocksCount != 0) |
| { |
| /* |
| * Add this buffer to the emptying queue, and proceed to empty |
| * the queue. |
| */ |
| if (!nodeBuffer->queuedForEmptying) |
| { |
| MemoryContextSwitchTo(gfbb->context); |
| nodeBuffer->queuedForEmptying = true; |
| gfbb->bufferEmptyingQueue = |
| lcons(nodeBuffer, gfbb->bufferEmptyingQueue); |
| MemoryContextSwitchTo(buildstate->giststate->tempCxt); |
| } |
| gistProcessEmptyingQueue(buildstate); |
| } |
| else |
| gfbb->buffersOnLevels[i] = |
| list_delete_first(gfbb->buffersOnLevels[i]); |
| } |
| elog(DEBUG2, "emptied all buffers at level %d", i); |
| } |
| MemoryContextSwitchTo(oldCtx); |
| } |
| |
| /* |
| * Get the depth of the GiST index. |
| */ |
| static int |
| gistGetMaxLevel(Relation index) |
| { |
| int maxLevel; |
| BlockNumber blkno; |
| |
| /* |
| * Traverse down the tree, starting from the root, until we hit the leaf |
| * level. |
| */ |
| maxLevel = 0; |
| blkno = GIST_ROOT_BLKNO; |
| while (true) |
| { |
| Buffer buffer; |
| Page page; |
| IndexTuple itup; |
| |
| buffer = ReadBuffer(index, blkno); |
| |
| /* |
| * There's no concurrent access during index build, so locking is just |
| * pro forma. |
| */ |
| LockBuffer(buffer, GIST_SHARE); |
| page = (Page) BufferGetPage(buffer); |
| |
| if (GistPageIsLeaf(page)) |
| { |
| /* We hit the bottom, so we're done. */ |
| UnlockReleaseBuffer(buffer); |
| break; |
| } |
| |
| /* |
| * Pick the first downlink on the page, and follow it. It doesn't |
| * matter which downlink we choose, the tree has the same depth |
| * everywhere, so we just pick the first one. |
| */ |
| itup = (IndexTuple) PageGetItem(page, |
| PageGetItemId(page, FirstOffsetNumber)); |
| blkno = ItemPointerGetBlockNumber(&(itup->t_tid)); |
| UnlockReleaseBuffer(buffer); |
| |
| /* |
| * We're descending the tree, which means there is yet one more |
| * level below this one. |
| */ |
| maxLevel++; |
| } |
| return maxLevel; |
| } |
| |
| |
| /* |
| * Routines for managing the parent map. |
| * |
| * Whenever a page is split, we need to insert the downlinks into the parent. |
| * We need to somehow find the parent page to do that. In normal insertions, |
| * we keep a stack of nodes visited when we descend the tree. However, in |
| * buffering build, we can start descending the tree from any internal node, |
| * when we empty a buffer by cascading tuples to its children. So we don't |
| * have a full stack up to the root available at that time. |
| * |
| * So instead, we maintain a hash table to track the parent of every internal |
| * page. We don't need to track the parents of leaf nodes, however. Whenever |
| * we insert into a leaf, we've just descended from its parent, so we know |
| * its immediate parent already. This helps a lot to limit the memory used |
| * by this hash table. |
| * |
| * Whenever an internal node is split, the parent map needs to be updated: |
| * the parent of the new child page needs to be recorded, and the entries |
| * for all pages whose downlinks are moved to the new page in the split |
| * need to be updated as well. |
| * |
| * We also update the parent map whenever we descend the tree. That might seem |
| * unnecessary, because we maintain the map whenever a downlink is moved or |
| * created, but it is needed because we switch to buffering mode after |
| * creating a tree with regular index inserts. Any pages created before |
| * switching to buffering mode will not be present in the parent map initially, |
| * but will be added there the first time we visit them. |
| */ |
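| |
| /* |
| * In outline: gistProcessItup() records parents via gistMemorizeParent() as |
| * it descends, gistbufferinginserttuples() updates the map through |
| * gistMemorizeParent() and gistMemorizeAllDownlinks() when pages split, and |
| * gistBufferingFindCorrectParent() consults it via gistGetParent(). |
| */ |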
| |
| typedef struct |
| { |
| BlockNumber childblkno; /* hash key */ |
| BlockNumber parentblkno; |
| } ParentMapEntry; |
| |
| static void |
| gistInitParentMap(GISTBuildState *buildstate) |
| { |
| HASHCTL hashCtl; |
| |
| hashCtl.keysize = sizeof(BlockNumber); |
| hashCtl.entrysize = sizeof(ParentMapEntry); |
| hashCtl.hcxt = CurrentMemoryContext; |
| buildstate->parentMap = hash_create("gistbuild parent map", |
| 1024, |
| &hashCtl, |
| HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); |
| } |
| |
| static void |
| gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child, BlockNumber parent) |
| { |
| ParentMapEntry *entry; |
| bool found; |
| |
| entry = (ParentMapEntry *) hash_search(buildstate->parentMap, |
| &child, |
| HASH_ENTER, |
| &found); |
| entry->parentblkno = parent; |
| } |
| |
| /* |
| * Scan all downlinks on a page, and memorize their parent. |
| */ |
| static void |
| gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parentbuf) |
| { |
| OffsetNumber maxoff; |
| OffsetNumber off; |
| BlockNumber parentblkno = BufferGetBlockNumber(parentbuf); |
| Page page = BufferGetPage(parentbuf); |
| |
| Assert(!GistPageIsLeaf(page)); |
| |
| maxoff = PageGetMaxOffsetNumber(page); |
| for (off = FirstOffsetNumber; off <= maxoff; off++) |
| { |
| ItemId iid = PageGetItemId(page, off); |
| IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid); |
| BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); |
| |
| gistMemorizeParent(buildstate, childblkno, parentblkno); |
| } |
| } |
| |
| static BlockNumber |
| gistGetParent(GISTBuildState *buildstate, BlockNumber child) |
| { |
| ParentMapEntry *entry; |
| bool found; |
| |
| /* Look up the parent block in the hash table */ |
| entry = (ParentMapEntry *) hash_search(buildstate->parentMap, |
| &child, |
| HASH_FIND, |
| &found); |
| if (!found) |
| elog(ERROR, "could not find parent of block %u in lookup table", child); |
| |
| return entry->parentblkno; |
| } |