| /*------------------------------------------------------------------------- |
| * |
| * nbtsort.c |
| * Build a btree from sorted input by loading leaf pages sequentially. |
| * |
| * NOTES |
| * |
| * We use tuplesort.c to sort the given index tuples into order. |
| * Then we scan the index tuples in order and build the btree pages |
| * for each level. We load source tuples into leaf-level pages. |
| * Whenever we fill a page at one level, we add a link to it to its |
| * parent level (starting a new parent level if necessary). When |
| * done, we write out each final page on each level, adding it to |
| * its parent level. When we have only one page on a level, it must be |
| * the root -- it can be attached to the btree metapage and we are done. |
| * |
| * It is not wise to pack the pages entirely full, since then *any* |
| * insertion would cause a split (and not only of the leaf page; the need |
| * for a split would cascade right up the tree). The steady-state load |
| * factor for btrees is usually estimated at 70%. We choose to pack leaf |
| * pages to the user-controllable fill factor (default 90%) while upper pages |
| * are always packed to 70%. This gives us reasonable density (there aren't |
| * many upper pages if the keys are reasonable-size) without risking a lot of |
| * cascading splits during early insertions. |
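| * |
| * As a rough illustration (assuming the default BLCKSZ of 8192, and the |
| * thresholds computed by _bt_pagestate() below): a leaf page built with |
| * the default fillfactor of 90 counts as "full" once less than about |
| * BLCKSZ * 10 / 100 = 819 bytes remain free, while an upper page counts |
| * as "full" once less than about BLCKSZ * 30 / 100 = 2457 bytes remain |
| * free. |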
| * |
| * Formerly the index pages being built were kept in shared buffers, but |
| * that is of no value (since other backends have no interest in them yet) |
| * and it created locking problems for CHECKPOINT, because the upper-level |
| * pages were held exclusive-locked for long periods. Now we just build |
| * the pages in local memory and smgrwrite or smgrextend them as we finish |
| * them. They will need to be re-read into shared buffers on first use after |
| * the build finishes. |
| * |
| * This code isn't concerned about the FSM at all. The caller is responsible |
| * for initializing that. |
| * |
| * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * IDENTIFICATION |
| * src/backend/access/nbtree/nbtsort.c |
| * |
| *------------------------------------------------------------------------- |
| */ |
| |
| #include "postgres.h" |
| |
| #include "access/nbtree.h" |
| #include "access/parallel.h" |
| #include "access/relscan.h" |
| #include "access/table.h" |
| #include "access/xact.h" |
| #include "access/xlog.h" |
| #include "access/xloginsert.h" |
| #include "catalog/index.h" |
| #include "commands/progress.h" |
| #include "executor/instrument.h" |
| #include "miscadmin.h" |
| #include "pgstat.h" |
| #include "storage/smgr.h" |
| #include "tcop/tcopprot.h" /* pgrminclude ignore */ |
| #include "utils/rel.h" |
| #include "utils/sortsupport.h" |
| #include "utils/tuplesort.h" |
| |
| |
| /* Magic numbers for parallel state sharing */ |
| #define PARALLEL_KEY_BTREE_SHARED UINT64CONST(0xA000000000000001) |
| #define PARALLEL_KEY_TUPLESORT UINT64CONST(0xA000000000000002) |
| #define PARALLEL_KEY_TUPLESORT_SPOOL2 UINT64CONST(0xA000000000000003) |
| #define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xA000000000000004) |
| #define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xA000000000000005) |
| #define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xA000000000000006) |
| |
| /* |
| * DISABLE_LEADER_PARTICIPATION disables the leader's participation in |
| * parallel index builds. This may be useful as a debugging aid. |
| #undef DISABLE_LEADER_PARTICIPATION |
| */ |
| |
| /* |
| * Status record for spooling/sorting phase. (Note we may have two of |
| * these due to the special requirements for uniqueness-checking with |
| * dead tuples.) |
| */ |
| typedef struct BTSpool |
| { |
| Tuplesortstate *sortstate; /* state data for tuplesort.c */ |
| Relation heap; |
| Relation index; |
| bool isunique; |
| bool nulls_not_distinct; |
| } BTSpool; |
| |
| /* |
| * Status for index builds performed in parallel. This is allocated in a |
| * dynamic shared memory segment. Note that there is a separate tuplesort TOC |
| * entry, private to tuplesort.c but allocated by this module on its behalf. |
| */ |
| typedef struct BTShared |
| { |
| /* |
| * These fields are not modified during the sort. They primarily exist |
| * for the benefit of worker processes that need to create BTSpool state |
| * corresponding to that used by the leader. |
| */ |
| Oid heaprelid; |
| Oid indexrelid; |
| bool isunique; |
| bool nulls_not_distinct; |
| bool isconcurrent; |
| int scantuplesortstates; |
| |
| /* |
| * workersdonecv is used to monitor the progress of workers. All parallel |
| * participants must indicate that they are done before leader can use |
| * mutable state that workers maintain during scan (and before leader can |
| * proceed to tuplesort_performsort()). |
| */ |
| ConditionVariable workersdonecv; |
| |
| /* |
| * mutex protects the mutable fields that follow it, up to (but not |
| * including) the trailing parallel table scan descriptor data. |
| * |
| * These fields contain status information of interest to B-Tree index |
| * builds that must work just the same when an index is built in parallel. |
| */ |
| slock_t mutex; |
| |
| /* |
| * Mutable state that is maintained by workers, and reported back to |
| * leader at end of parallel scan. |
| * |
| * nparticipantsdone is number of worker processes finished. |
| * |
| * reltuples is the total number of input heap tuples. |
| * |
| * havedead indicates if RECENTLY_DEAD tuples were encountered during |
| * build. |
| * |
| * indtuples is the total number of tuples that made it into the index. |
| * |
| * brokenhotchain indicates if any worker detected a broken HOT chain |
| * during build. |
| */ |
| int nparticipantsdone; |
| double reltuples; |
| bool havedead; |
| double indtuples; |
| bool brokenhotchain; |
| |
| /* |
| * ParallelTableScanDescData data follows. Can't directly embed here, as |
| * implementations of the parallel table scan desc interface might need |
| * stronger alignment. |
| */ |
| } BTShared; |
| |
| /* |
| * Return pointer to a BTShared's parallel table scan. |
| * |
| * c.f. shm_toc_allocate as to why BUFFERALIGN is used, rather than just |
| * MAXALIGN. |
| */ |
| #define ParallelTableScanFromBTShared(shared) \ |
| (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BTShared))) |
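| |
| /* |
| * To illustrate the layout implied above (a sketch, not additional code): |
| * |
| * +----------------------------+ <- start of shm_toc_allocate() chunk |
| * | BTShared                   | |
| * +----------------------------+ <- BUFFERALIGN(sizeof(BTShared)) |
| * | ParallelTableScanDescData  | |
| * | (the table AM's scan state | |
| * | may need alignment beyond  | |
| * | MAXALIGN)                  | |
| * +----------------------------+ |
| */ |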
| |
| /* |
| * Status for leader in parallel index build. |
| */ |
| typedef struct BTLeader |
| { |
| /* parallel context itself */ |
| ParallelContext *pcxt; |
| |
| /* |
| * nparticipanttuplesorts is the exact number of worker processes |
| * successfully launched, plus one leader process if it participates as a |
| * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader |
| * participating as a worker). |
| */ |
| int nparticipanttuplesorts; |
| |
| /* |
| * Leader process convenience pointers to shared state (leader avoids TOC |
| * lookups). |
| * |
| * btshared is the shared state for entire build. sharedsort is the |
| * shared, tuplesort-managed state passed to each process tuplesort. |
| * sharedsort2 is the corresponding btspool2 shared state, used only when |
| * building unique indexes. snapshot is the snapshot used by the scan iff |
| * an MVCC snapshot is required. |
| */ |
| BTShared *btshared; |
| Sharedsort *sharedsort; |
| Sharedsort *sharedsort2; |
| Snapshot snapshot; |
| WalUsage *walusage; |
| BufferUsage *bufferusage; |
| } BTLeader; |
| |
| /* |
| * Working state for btbuild and its callback. |
| * |
| * When parallel CREATE INDEX is used, there is a BTBuildState for each |
| * participant. |
| */ |
| typedef struct BTBuildState |
| { |
| bool isunique; |
| bool nulls_not_distinct; |
| bool havedead; |
| Relation heap; |
| BTSpool *spool; |
| |
| /* |
| * spool2 is needed only when the index is a unique index. Dead tuples are |
| * put into spool2 instead of spool in order to avoid the uniqueness check. |
| */ |
| BTSpool *spool2; |
| double indtuples; |
| |
| /* |
| * btleader is only present when a parallel index build is performed, and |
| * only in the leader process. (Actually, only the leader has a |
| * BTBuildState. Workers have their own spool and spool2, though.) |
| */ |
| BTLeader *btleader; |
| } BTBuildState; |
| |
| /* |
| * Status record for a btree page being built. We have one of these |
| * for each active tree level. |
| */ |
| typedef struct BTPageState |
| { |
| Page btps_page; /* workspace for page building */ |
| BlockNumber btps_blkno; /* block # to write this page at */ |
| IndexTuple btps_lowkey; /* page's strict lower bound pivot tuple */ |
| OffsetNumber btps_lastoff; /* last item offset loaded */ |
| Size btps_lastextra; /* last item's extra posting list space */ |
| uint32 btps_level; /* tree level (0 = leaf) */ |
| Size btps_full; /* "full" if less than this much free space */ |
| struct BTPageState *btps_next; /* link to parent level, if any */ |
| } BTPageState; |
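| |
| /* |
| * For example, while loading, the btps_next links chain each level's |
| * in-progress page state to the state for its parent level (a sketch): |
| * |
| * level 0 (leaf) -> level 1 -> level 2 -> NULL (no parent level yet) |
| */ |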
| |
| /* |
| * Overall status record for index writing phase. |
| */ |
| typedef struct BTWriteState |
| { |
| Relation heap; |
| Relation index; |
| BTScanInsert inskey; /* generic insertion scankey */ |
| bool btws_use_wal; /* dump pages to WAL? */ |
| BlockNumber btws_pages_alloced; /* # pages allocated */ |
| BlockNumber btws_pages_written; /* # pages written out */ |
| Page btws_zeropage; /* workspace for filling zeroes */ |
| } BTWriteState; |
| |
| |
| static double _bt_spools_heapscan(Relation heap, Relation index, |
| BTBuildState *buildstate, IndexInfo *indexInfo); |
| static void _bt_spooldestroy(BTSpool *btspool); |
| static void _bt_spool(BTSpool *btspool, ItemPointer self, |
| Datum *values, bool *isnull); |
| static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2); |
| static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values, |
| bool *isnull, bool tupleIsAlive, void *state); |
| static Page _bt_blnewpage(uint32 level); |
| static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level); |
| static void _bt_slideleft(Page rightmostpage); |
| static void _bt_sortaddtup(Page page, Size itemsize, |
| IndexTuple itup, OffsetNumber itup_off, |
| bool newfirstdataitem); |
| static void _bt_buildadd(BTWriteState *wstate, BTPageState *state, |
| IndexTuple itup, Size truncextra); |
| static void _bt_sort_dedup_finish_pending(BTWriteState *wstate, |
| BTPageState *state, |
| BTDedupState dstate); |
| static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state); |
| static void _bt_load(BTWriteState *wstate, |
| BTSpool *btspool, BTSpool *btspool2); |
| static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, |
| int request); |
| static void _bt_end_parallel(BTLeader *btleader); |
| static Size _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot); |
| static double _bt_parallel_heapscan(BTBuildState *buildstate, |
| bool *brokenhotchain); |
| static void _bt_leader_participate_as_worker(BTBuildState *buildstate); |
| static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, |
| BTShared *btshared, Sharedsort *sharedsort, |
| Sharedsort *sharedsort2, int sortmem, |
| bool progress); |
| |
| |
| /* |
| * btbuild() -- build a new btree index. |
| */ |
| IndexBuildResult * |
| btbuild(Relation heap, Relation index, IndexInfo *indexInfo) |
| { |
| IndexBuildResult *result; |
| BTBuildState buildstate; |
| double reltuples; |
| |
| #ifdef BTREE_BUILD_STATS |
| if (log_btree_build_stats) |
| ResetUsage(); |
| #endif /* BTREE_BUILD_STATS */ |
| |
| buildstate.isunique = indexInfo->ii_Unique; |
| buildstate.nulls_not_distinct = indexInfo->ii_NullsNotDistinct; |
| buildstate.havedead = false; |
| buildstate.heap = heap; |
| buildstate.spool = NULL; |
| buildstate.spool2 = NULL; |
| buildstate.indtuples = 0; |
| buildstate.btleader = NULL; |
| |
| /* |
| * We expect to be called exactly once for any index relation. If that's |
| * not the case, big trouble's what we have. |
| */ |
| if (RelationGetNumberOfBlocks(index) != 0) |
| elog(ERROR, "index \"%s\" already contains data", |
| RelationGetRelationName(index)); |
| |
| reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo); |
| |
| /* |
| * Finish the build by (1) completing the sort of the spool file, (2) |
| * inserting the sorted tuples into btree pages and (3) building the upper |
| * levels. Finally, it may also be necessary to end use of parallelism. |
| */ |
| _bt_leafbuild(buildstate.spool, buildstate.spool2); |
| _bt_spooldestroy(buildstate.spool); |
| if (buildstate.spool2) |
| _bt_spooldestroy(buildstate.spool2); |
| if (buildstate.btleader) |
| _bt_end_parallel(buildstate.btleader); |
| |
| result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); |
| |
| result->heap_tuples = reltuples; |
| result->index_tuples = buildstate.indtuples; |
| |
| #ifdef BTREE_BUILD_STATS |
| if (log_btree_build_stats) |
| { |
| ShowUsage("BTREE BUILD STATS"); |
| ResetUsage(); |
| } |
| #endif /* BTREE_BUILD_STATS */ |
| |
| return result; |
| } |
| |
| /* |
| * Create and initialize one or two spool structures, and save them in caller's |
| * buildstate argument. May also fill in fields within indexInfo used by index |
| * builds. |
| * |
| * Scans the heap, possibly in parallel, filling spools with IndexTuples. This |
| * routine encapsulates all aspects of managing parallelism. Caller need only |
| * call _bt_end_parallel() in parallel case after it is done with spool/spool2. |
| * |
| * Returns the total number of heap tuples scanned. |
| */ |
| static double |
| _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, |
| IndexInfo *indexInfo) |
| { |
| BTSpool *btspool = (BTSpool *) palloc0(sizeof(BTSpool)); |
| SortCoordinate coordinate = NULL; |
| double reltuples = 0; |
| |
| /* |
| * We size the sort area as maintenance_work_mem rather than work_mem to |
| * speed index creation. This should be OK since a single backend can't |
| * run multiple index creations in parallel (see also: notes on |
| * parallelism and maintenance_work_mem below). |
| */ |
| btspool->heap = heap; |
| btspool->index = index; |
| btspool->isunique = indexInfo->ii_Unique; |
| btspool->nulls_not_distinct = indexInfo->ii_NullsNotDistinct; |
| |
| /* Save as primary spool */ |
| buildstate->spool = btspool; |
| |
| /* Report table scan phase started */ |
| pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, |
| PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN); |
| |
| /* Attempt to launch parallel worker scan when required */ |
| if (indexInfo->ii_ParallelWorkers > 0) |
| _bt_begin_parallel(buildstate, indexInfo->ii_Concurrent, |
| indexInfo->ii_ParallelWorkers); |
| |
| /* |
| * If parallel build requested and at least one worker process was |
| * successfully launched, set up coordination state |
| */ |
| if (buildstate->btleader) |
| { |
| coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData)); |
| coordinate->isWorker = false; |
| coordinate->nParticipants = |
| buildstate->btleader->nparticipanttuplesorts; |
| coordinate->sharedsort = buildstate->btleader->sharedsort; |
| } |
| |
| /* |
| * Begin serial/leader tuplesort. |
| * |
| * In cases where parallelism is involved, the leader receives the same |
| * share of maintenance_work_mem as a serial sort (it is generally treated |
| * in the same way as a serial sort once we return). Parallel worker |
| * Tuplesortstates will have received only a fraction of |
| * maintenance_work_mem, though. |
| * |
| * We rely on the lifetime of the Leader Tuplesortstate almost not |
| * overlapping with any worker Tuplesortstate's lifetime. There may be |
| * some small overlap, but that's okay because we rely on leader |
| * Tuplesortstate only allocating a small, fixed amount of memory here. |
| * When its tuplesort_performsort() is called (by our caller), and |
| * significant amounts of memory are likely to be used, all workers must |
| * have already freed almost all memory held by their Tuplesortstates |
| * (they are about to go away completely, too). The overall effect is |
| * that maintenance_work_mem always represents an absolute high watermark |
| * on the amount of memory used by a CREATE INDEX operation, regardless of |
| * the use of parallelism or any other factor. |
| */ |
| buildstate->spool->sortstate = |
| tuplesort_begin_index_btree(heap, index, buildstate->isunique, |
| buildstate->nulls_not_distinct, |
| maintenance_work_mem, coordinate, |
| TUPLESORT_NONE); |
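| |
| /* |
| * To make the budget concrete (an illustration, not exact accounting): |
| * with maintenance_work_mem = 64MB and 3 participant tuplesorts, each |
| * participant's scan-phase sort is limited to roughly 64MB / 3, while |
| * the leader Tuplesortstate begun above may use the full 64MB during |
| * its later merge, once the workers have released their memory. |
| */ |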
| |
| /* |
| * If building a unique index, put dead tuples in a second spool to keep |
| * them out of the uniqueness check. We expect that the second spool (for |
| * dead tuples) won't get very full, so we give it only work_mem. |
| */ |
| if (indexInfo->ii_Unique) |
| { |
| BTSpool *btspool2 = (BTSpool *) palloc0(sizeof(BTSpool)); |
| SortCoordinate coordinate2 = NULL; |
| |
| /* Initialize secondary spool */ |
| btspool2->heap = heap; |
| btspool2->index = index; |
| btspool2->isunique = false; |
| /* Save as secondary spool */ |
| buildstate->spool2 = btspool2; |
| |
| if (buildstate->btleader) |
| { |
| /* |
| * Set up non-private state that is passed to |
| * tuplesort_begin_index_btree() about the basic high level |
| * coordination of a parallel sort. |
| */ |
| coordinate2 = (SortCoordinate) palloc0(sizeof(SortCoordinateData)); |
| coordinate2->isWorker = false; |
| coordinate2->nParticipants = |
| buildstate->btleader->nparticipanttuplesorts; |
| coordinate2->sharedsort = buildstate->btleader->sharedsort2; |
| } |
| |
| /* |
| * We expect that the second one (for dead tuples) won't get very |
| * full, so we give it only work_mem |
| */ |
| buildstate->spool2->sortstate = |
| tuplesort_begin_index_btree(heap, index, false, false, work_mem, |
| coordinate2, TUPLESORT_NONE); |
| } |
| |
| /* Fill spool using either serial or parallel heap scan */ |
| if (!buildstate->btleader) |
| reltuples = table_index_build_scan(heap, index, indexInfo, true, true, |
| _bt_build_callback, (void *) buildstate, |
| NULL); |
| else |
| reltuples = _bt_parallel_heapscan(buildstate, |
| &indexInfo->ii_BrokenHotChain); |
| |
| /* |
| * Set the progress target for the next phase. Reset the block number |
| * values set by table_index_build_scan |
| */ |
| { |
| const int progress_index[] = { |
| PROGRESS_CREATEIDX_TUPLES_TOTAL, |
| PROGRESS_SCAN_BLOCKS_TOTAL, |
| PROGRESS_SCAN_BLOCKS_DONE |
| }; |
| const int64 progress_vals[] = { |
| buildstate->indtuples, |
| 0, 0 |
| }; |
| |
| pgstat_progress_update_multi_param(3, progress_index, progress_vals); |
| } |
| |
| /* okay, all heap tuples are spooled */ |
| if (buildstate->spool2 && !buildstate->havedead) |
| { |
| /* spool2 turns out to be unnecessary */ |
| _bt_spooldestroy(buildstate->spool2); |
| buildstate->spool2 = NULL; |
| } |
| |
| return reltuples; |
| } |
| |
| /* |
| * clean up a spool structure and its substructures. |
| */ |
| static void |
| _bt_spooldestroy(BTSpool *btspool) |
| { |
| tuplesort_end(btspool->sortstate); |
| pfree(btspool); |
| } |
| |
| /* |
| * spool an index entry into the sort file. |
| */ |
| static void |
| _bt_spool(BTSpool *btspool, ItemPointer self, Datum *values, bool *isnull) |
| { |
| tuplesort_putindextuplevalues(btspool->sortstate, btspool->index, |
| self, values, isnull); |
| } |
| |
| /* |
| * given a spool loaded by successive calls to _bt_spool, |
| * create an entire btree. |
| */ |
| static void |
| _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2) |
| { |
| BTWriteState wstate; |
| |
| #ifdef BTREE_BUILD_STATS |
| if (log_btree_build_stats) |
| { |
| ShowUsage("BTREE BUILD (Spool) STATISTICS"); |
| ResetUsage(); |
| } |
| #endif /* BTREE_BUILD_STATS */ |
| |
| /* Execute the sort */ |
| pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, |
| PROGRESS_BTREE_PHASE_PERFORMSORT_1); |
| tuplesort_performsort(btspool->sortstate); |
| if (btspool2) |
| { |
| pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, |
| PROGRESS_BTREE_PHASE_PERFORMSORT_2); |
| tuplesort_performsort(btspool2->sortstate); |
| } |
| |
| wstate.heap = btspool->heap; |
| wstate.index = btspool->index; |
| wstate.inskey = _bt_mkscankey(wstate.index, NULL); |
| /* _bt_mkscankey() won't set allequalimage without metapage */ |
| wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true); |
| wstate.btws_use_wal = RelationNeedsWAL(wstate.index); |
| |
| /* reserve the metapage */ |
| wstate.btws_pages_alloced = BTREE_METAPAGE + 1; |
| wstate.btws_pages_written = 0; |
| wstate.btws_zeropage = NULL; /* until needed */ |
| |
| pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, |
| PROGRESS_BTREE_PHASE_LEAF_LOAD); |
| _bt_load(&wstate, btspool, btspool2); |
| } |
| |
| /* |
| * Per-tuple callback for table_index_build_scan |
| */ |
| static void |
| _bt_build_callback(Relation index, |
| ItemPointer tid, |
| Datum *values, |
| bool *isnull, |
| bool tupleIsAlive, |
| void *state) |
| { |
| BTBuildState *buildstate = (BTBuildState *) state; |
| |
| /* |
| * insert the index tuple into the appropriate spool file for subsequent |
| * processing |
| */ |
| if (tupleIsAlive || buildstate->spool2 == NULL) |
| _bt_spool(buildstate->spool, tid, values, isnull); |
| else |
| { |
| /* dead tuples are put into spool2 */ |
| buildstate->havedead = true; |
| _bt_spool(buildstate->spool2, tid, values, isnull); |
| } |
| |
| buildstate->indtuples += 1; |
| } |
| |
| /* |
| * allocate workspace for a new, clean btree page, not linked to any siblings. |
| */ |
| static Page |
| _bt_blnewpage(uint32 level) |
| { |
| Page page; |
| BTPageOpaque opaque; |
| |
| page = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); |
| |
| /* Zero the page and set up standard page header info */ |
| _bt_pageinit(page, BLCKSZ); |
| |
| /* Initialize BT opaque state */ |
| opaque = BTPageGetOpaque(page); |
| opaque->btpo_prev = opaque->btpo_next = P_NONE; |
| opaque->btpo_level = level; |
| opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF; |
| opaque->btpo_cycleid = 0; |
| |
| /* Make the P_HIKEY line pointer appear allocated */ |
| ((PageHeader) page)->pd_lower += sizeof(ItemIdData); |
| |
| return page; |
| } |
| |
| /* |
| * emit a completed btree page, and release the working storage. |
| */ |
| static void |
| _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno) |
| { |
| /* XLOG stuff */ |
| if (wstate->btws_use_wal) |
| { |
| /* We use the XLOG_FPI record type for this */ |
| log_newpage(&wstate->index->rd_locator, MAIN_FORKNUM, blkno, page, true); |
| } |
| |
| /* |
| * If we have to write pages nonsequentially, fill in the space with |
| * zeroes until we come back and overwrite. This is not logically |
| * necessary on standard Unix filesystems (unwritten space will read as |
| * zeroes anyway), but it should help to avoid fragmentation. The dummy |
| * pages aren't WAL-logged though. |
| */ |
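| /* |
| * For example: block 0 (the metapage) is reserved from the start, but |
| * is only written at the very end of the build by _bt_uppershutdown(). |
| * The first completed leaf page (block 1) therefore causes block 0 to |
| * be zero-filled here, before the real metapage contents overwrite it. |
| */ |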
| while (blkno > wstate->btws_pages_written) |
| { |
| if (!wstate->btws_zeropage) |
| wstate->btws_zeropage = (Page) palloc_aligned(BLCKSZ, |
| PG_IO_ALIGN_SIZE, |
| MCXT_ALLOC_ZERO); |
| /* don't set checksum for all-zero page */ |
| smgrextend(RelationGetSmgr(wstate->index), MAIN_FORKNUM, |
| wstate->btws_pages_written++, |
| wstate->btws_zeropage, |
| true); |
| } |
| |
| PageEncryptInplace(page, MAIN_FORKNUM, blkno); |
| PageSetChecksumInplace(page, blkno); |
| |
| /* |
| * Now write the page. There's no need for smgr to schedule an fsync for |
| * this write; we'll do it ourselves before ending the build. |
| */ |
| if (blkno == wstate->btws_pages_written) |
| { |
| /* extending the file... */ |
| smgrextend(RelationGetSmgr(wstate->index), MAIN_FORKNUM, blkno, |
| page, true); |
| wstate->btws_pages_written++; |
| } |
| else |
| { |
| /* overwriting a block we zero-filled before */ |
| smgrwrite(RelationGetSmgr(wstate->index), MAIN_FORKNUM, blkno, |
| page, true); |
| } |
| |
| pfree(page); |
| } |
| |
| /* |
| * allocate and initialize a new BTPageState. the returned structure |
| * is suitable for immediate use by _bt_buildadd. |
| */ |
| static BTPageState * |
| _bt_pagestate(BTWriteState *wstate, uint32 level) |
| { |
| BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState)); |
| |
| /* create initial page for level */ |
| state->btps_page = _bt_blnewpage(level); |
| |
| /* and assign it a page position */ |
| state->btps_blkno = wstate->btws_pages_alloced++; |
| |
| state->btps_lowkey = NULL; |
| /* initialize lastoff so first item goes into P_FIRSTKEY */ |
| state->btps_lastoff = P_HIKEY; |
| state->btps_lastextra = 0; |
| state->btps_level = level; |
| /* set "full" threshold based on level. See notes at head of file. */ |
| if (level > 0) |
| state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100); |
| else |
| state->btps_full = BTGetTargetPageFreeSpace(wstate->index); |
| |
| /* no parent level, yet */ |
| state->btps_next = NULL; |
| |
| return state; |
| } |
| |
| /* |
| * Slide the array of ItemIds from the page back one slot (from P_FIRSTKEY to |
| * P_HIKEY, overwriting P_HIKEY). |
| * |
| * _bt_blnewpage() makes the P_HIKEY line pointer appear allocated, but the |
| * rightmost page on its level is not supposed to get a high key. Now that |
| * it's clear that this page is a rightmost page, remove the unneeded empty |
| * P_HIKEY line pointer space. |
| */ |
| static void |
| _bt_slideleft(Page rightmostpage) |
| { |
| OffsetNumber off; |
| OffsetNumber maxoff; |
| ItemId previi; |
| |
| maxoff = PageGetMaxOffsetNumber(rightmostpage); |
| Assert(maxoff >= P_FIRSTKEY); |
| previi = PageGetItemId(rightmostpage, P_HIKEY); |
| for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off)) |
| { |
| ItemId thisii = PageGetItemId(rightmostpage, off); |
| |
| *previi = *thisii; |
| previi = thisii; |
| } |
| ((PageHeader) rightmostpage)->pd_lower -= sizeof(ItemIdData); |
| } |
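| |
| /* |
| * To illustrate _bt_slideleft() (a sketch): on the rightmost page, the |
| * line pointer array goes from |
| * |
| * [P_HIKEY: unused] [P_FIRSTKEY: item1] [item2] ... [itemN] |
| * |
| * to |
| * |
| * [P_HIKEY: item1] [item2] ... [itemN] |
| * |
| * with pd_lower shrunk by one ItemIdData. Only the line pointers move; |
| * the tuples themselves stay where they are. |
| */ |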
| |
| /* |
| * Add an item to a page being built. |
| * |
| * This is very similar to nbtinsert.c's _bt_pgaddtup(), but this variant |
| * raises an error directly. |
| * |
| * Note that our nbtsort.c caller does not know yet if the page will be |
| * rightmost. Offset P_FIRSTKEY is always assumed to be the first data key by |
| * caller. A page that turns out to be the rightmost on its level is fixed by |
| * calling _bt_slideleft(). |
| */ |
| static void |
| _bt_sortaddtup(Page page, |
| Size itemsize, |
| IndexTuple itup, |
| OffsetNumber itup_off, |
| bool newfirstdataitem) |
| { |
| IndexTupleData trunctuple; |
| |
| if (newfirstdataitem) |
| { |
| trunctuple = *itup; |
| trunctuple.t_info = sizeof(IndexTupleData); |
| BTreeTupleSetNAtts(&trunctuple, 0, false); |
| itup = &trunctuple; |
| itemsize = sizeof(IndexTupleData); |
| } |
| |
| if (PageAddItem(page, (Item) itup, itemsize, itup_off, |
| false, false) == InvalidOffsetNumber) |
| elog(ERROR, "failed to add item to the index page"); |
| } |
| |
| /*---------- |
| * Add an item to a disk page from the sort output (or add a posting list |
| * item formed from the sort output). |
| * |
| * We must be careful to observe the page layout conventions of nbtsearch.c: |
| * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY. |
| * - on non-leaf pages, the key portion of the first item need not be |
| * stored, we should store only the link. |
| * |
| * A leaf page being built looks like: |
| * |
| * +----------------+---------------------------------+ |
| * | PageHeaderData | linp0 linp1 linp2 ...           | |
| * +-----------+----+---------------------------------+ |
| * | ... linpN |                                      | |
| * +-----------+--------------------------------------+ |
| * |     ^ last                                       | |
| * |                                                  | |
| * +-------------+------------------------------------+ |
| * |             | itemN ...                          | |
| * +-------------+------------------+-----------------+ |
| * |          ... item3 item2 item1 | "special space" | |
| * +--------------------------------+-----------------+ |
| * |
| * Contrast this with the diagram in bufpage.h; note the mismatch |
| * between linps and items. This is because we reserve linp0 as a |
| * placeholder for the pointer to the "high key" item; when we have |
| * filled up the page, we will set linp0 to point to itemN and clear |
| * linpN. On the other hand, if we find this is the last (rightmost) |
| * page, we leave the items alone and slide the linp array over. If |
| * the high key is to be truncated, offset 1 is deleted, and we insert |
| * the truncated high key at offset 1. |
| * |
| * 'last' pointer indicates the last offset added to the page. |
| * |
| * 'truncextra' is the size of the posting list in itup, if any. This |
| * information is stashed for the next call here, when we may benefit |
| * from considering the impact of truncating away the posting list on |
| * the page before deciding to finish the page off. Posting lists are |
| * often relatively large, so it is worth going to the trouble of |
| * accounting for the saving from truncating away the posting list of |
| * the tuple that becomes the high key (that may be the only way to |
| * get close to target free space on the page). Note that this is |
| * only used for the soft fillfactor-wise limit, not the critical hard |
| * limit. |
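| * |
| * For example (illustrative numbers only): if itup is a 200-byte posting |
| * list tuple whose posting list takes up 150 bytes, truncextra is 150. |
| * Should that tuple later become the high key, suffix truncation strips |
| * the posting list, so the soft "full" test credits those 150 bytes back |
| * to the page's free space. |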
| *---------- |
| */ |
| static void |
| _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup, |
| Size truncextra) |
| { |
| Page npage; |
| BlockNumber nblkno; |
| OffsetNumber last_off; |
| Size last_truncextra; |
| Size pgspc; |
| Size itupsz; |
| bool isleaf; |
| |
| /* |
| * This is a handy place to check for cancel interrupts during the btree |
| * load phase of index creation. |
| */ |
| CHECK_FOR_INTERRUPTS(); |
| |
| npage = state->btps_page; |
| nblkno = state->btps_blkno; |
| last_off = state->btps_lastoff; |
| last_truncextra = state->btps_lastextra; |
| state->btps_lastextra = truncextra; |
| |
| pgspc = PageGetFreeSpace(npage); |
| itupsz = IndexTupleSize(itup); |
| itupsz = MAXALIGN(itupsz); |
| /* Leaf case has slightly different rules due to suffix truncation */ |
| isleaf = (state->btps_level == 0); |
| |
| /* |
| * Check whether the new item can fit on a btree page on current level at |
| * all. |
| * |
| * Every newly built index will treat heap TID as part of the keyspace, |
| * which imposes the requirement that new high keys must occasionally have |
| * a heap TID appended within _bt_truncate(). That may leave a new pivot |
| * tuple one or two MAXALIGN() quantums larger than the original |
| * firstright tuple it's derived from. v4 deals with the problem by |
| * decreasing the limit on the size of tuples inserted on the leaf level |
| * by the same small amount. Enforce the new v4+ limit on the leaf level, |
| * and the old limit on internal levels, since pivot tuples may need to |
| * make use of the reserved space. This should never fail on internal |
| * pages. |
| */ |
| if (unlikely(itupsz > BTMaxItemSize(npage))) |
| _bt_check_third_page(wstate->index, wstate->heap, isleaf, npage, |
| itup); |
| |
| /* |
| * Check to see if current page will fit new item, with space left over to |
| * append a heap TID during suffix truncation when page is a leaf page. |
| * |
| * It is guaranteed that we can fit at least 2 non-pivot tuples plus a |
| * high key with heap TID when finishing off a leaf page, since we rely on |
| * _bt_check_third_page() rejecting oversized non-pivot tuples. On |
| * internal pages we can always fit 3 pivot tuples with larger internal |
| * page tuple limit (includes page high key). |
| * |
| * Most of the time, a page is only "full" in the sense that the soft |
| * fillfactor-wise limit has been exceeded. However, we must always leave |
| * at least two items plus a high key on each page before starting a new |
| * page. Disregard fillfactor and insert on "full" current page if we |
| * don't have the minimum number of items yet. (Note that we deliberately |
| * assume that suffix truncation neither enlarges nor shrinks new high key |
| * when applying soft limit, except when last tuple has a posting list.) |
| */ |
| Assert(last_truncextra == 0 || isleaf); |
| if (pgspc < itupsz + (isleaf ? MAXALIGN(sizeof(ItemPointerData)) : 0) || |
| (pgspc + last_truncextra < state->btps_full && last_off > P_FIRSTKEY)) |
| { |
| /* |
| * Finish off the page and write it out. |
| */ |
| Page opage = npage; |
| BlockNumber oblkno = nblkno; |
| ItemId ii; |
| ItemId hii; |
| IndexTuple oitup; |
| |
| /* Create new page of same level */ |
| npage = _bt_blnewpage(state->btps_level); |
| |
| /* and assign it a page position */ |
| nblkno = wstate->btws_pages_alloced++; |
| |
| /* |
| * We copy the last item on the page into the new page, and then |
| * rearrange the old page so that the 'last item' becomes its high key |
| * rather than a true data item. There had better be at least two |
| * items on the page already, else the page would be empty of useful |
| * data. |
| */ |
| Assert(last_off > P_FIRSTKEY); |
| ii = PageGetItemId(opage, last_off); |
| oitup = (IndexTuple) PageGetItem(opage, ii); |
| _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY, |
| !isleaf); |
| |
| /* |
| * Move 'last' into the high key position on opage. _bt_blnewpage() |
| * allocated empty space for a line pointer when opage was first |
| * created, so this is a matter of rearranging already-allocated space |
| * on page, and initializing high key line pointer. (Actually, leaf |
| * pages must also swap oitup with a truncated version of oitup, which |
| * is sometimes larger than oitup, though never by more than the space |
| * needed to append a heap TID.) |
| */ |
| hii = PageGetItemId(opage, P_HIKEY); |
| *hii = *ii; |
| ItemIdSetUnused(ii); /* redundant */ |
| ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData); |
| |
| if (isleaf) |
| { |
| IndexTuple lastleft; |
| IndexTuple truncated; |
| |
| /* |
| * Truncate away any unneeded attributes from high key on leaf |
| * level. This is only done at the leaf level because downlinks |
| * in internal pages are either negative infinity items, or get |
| * their contents from copying from one level down. See also: |
| * _bt_split(). |
| * |
| * We don't try to bias our choice of split point to make it more |
| * likely that _bt_truncate() can truncate away more attributes, |
| * whereas the split point used within _bt_split() is chosen much |
| * more delicately. Even still, the lastleft and firstright |
| * tuples passed to _bt_truncate() here are at least not fully |
| * equal to each other when deduplication is used, unless there is |
| * a large group of duplicates (also, unique index builds usually |
| * have few or no spool2 duplicates). When the split point is |
| * between two unequal tuples, _bt_truncate() will avoid including |
| * a heap TID in the new high key, which is the most important |
| * benefit of suffix truncation. |
| * |
| * Overwrite the old item with new truncated high key directly. |
| * oitup is already located at the physical beginning of tuple |
| * space, so this should directly reuse the existing tuple space. |
| */ |
| ii = PageGetItemId(opage, OffsetNumberPrev(last_off)); |
| lastleft = (IndexTuple) PageGetItem(opage, ii); |
| |
| Assert(IndexTupleSize(oitup) > last_truncextra); |
| truncated = _bt_truncate(wstate->index, lastleft, oitup, |
| wstate->inskey); |
| if (!PageIndexTupleOverwrite(opage, P_HIKEY, (Item) truncated, |
| IndexTupleSize(truncated))) |
| elog(ERROR, "failed to add high key to the index page"); |
| pfree(truncated); |
| |
| /* oitup should continue to point to the page's high key */ |
| hii = PageGetItemId(opage, P_HIKEY); |
| oitup = (IndexTuple) PageGetItem(opage, hii); |
| } |
| |
| /* |
| * Link the old page into its parent, using its low key. If we don't |
| * have a parent, we have to create one; this adds a new btree level. |
| */ |
| if (state->btps_next == NULL) |
| state->btps_next = _bt_pagestate(wstate, state->btps_level + 1); |
| |
| Assert((BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) <= |
| IndexRelationGetNumberOfKeyAttributes(wstate->index) && |
| BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) > 0) || |
| P_LEFTMOST(BTPageGetOpaque(opage))); |
| Assert(BTreeTupleGetNAtts(state->btps_lowkey, wstate->index) == 0 || |
| !P_LEFTMOST(BTPageGetOpaque(opage))); |
| BTreeTupleSetDownLink(state->btps_lowkey, oblkno); |
| _bt_buildadd(wstate, state->btps_next, state->btps_lowkey, 0); |
| pfree(state->btps_lowkey); |
| |
| /* |
| * Save a copy of the high key from the old page. It is also the low |
| * key for the new page. |
| */ |
| state->btps_lowkey = CopyIndexTuple(oitup); |
| |
| /* |
| * Set the sibling links for both pages. |
| */ |
| { |
| BTPageOpaque oopaque = BTPageGetOpaque(opage); |
| BTPageOpaque nopaque = BTPageGetOpaque(npage); |
| |
| oopaque->btpo_next = nblkno; |
| nopaque->btpo_prev = oblkno; |
| nopaque->btpo_next = P_NONE; /* redundant */ |
| } |
| |
| /* |
| * Write out the old page. We never need to touch it again, so we can |
| * free the opage workspace too. |
| */ |
| _bt_blwritepage(wstate, opage, oblkno); |
| |
| /* |
| * Reset last_off to point to new page |
| */ |
| last_off = P_FIRSTKEY; |
| } |
| |
| /* |
| * By here, either the original page is still the current page, or a new |
| * page was created that became the current page. Either way, the current |
| * page definitely has space for the new item. |
| * |
| * If the new item is the first for its page, it must also be the first |
| * item on its entire level. On later same-level pages, a low key for a |
| * page will be copied from the prior page in the code above. Generate a |
| * minus infinity low key here instead. |
| */ |
| if (last_off == P_HIKEY) |
| { |
| Assert(state->btps_lowkey == NULL); |
| state->btps_lowkey = palloc0(sizeof(IndexTupleData)); |
| state->btps_lowkey->t_info = sizeof(IndexTupleData); |
| BTreeTupleSetNAtts(state->btps_lowkey, 0, false); |
| } |
| |
| /* |
| * Add the new item into the current page. |
| */ |
| last_off = OffsetNumberNext(last_off); |
| _bt_sortaddtup(npage, itupsz, itup, last_off, |
| !isleaf && last_off == P_FIRSTKEY); |
| |
| state->btps_page = npage; |
| state->btps_blkno = nblkno; |
| state->btps_lastoff = last_off; |
| } |
| |
| /* |
| * Finalize pending posting list tuple, and add it to the index. Final tuple |
| * is based on saved base tuple, and saved list of heap TIDs. |
| * |
| * This is almost like _bt_dedup_finish_pending(), but it adds a new tuple |
| * using _bt_buildadd(). |
| */ |
| static void |
| _bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state, |
| BTDedupState dstate) |
| { |
| Assert(dstate->nitems > 0); |
| |
| if (dstate->nitems == 1) |
| _bt_buildadd(wstate, state, dstate->base, 0); |
| else |
| { |
| IndexTuple postingtuple; |
| Size truncextra; |
| |
| /* form a tuple with a posting list */ |
| postingtuple = _bt_form_posting(dstate->base, |
| dstate->htids, |
| dstate->nhtids); |
| /* Calculate posting list overhead */ |
| truncextra = IndexTupleSize(postingtuple) - |
| BTreeTupleGetPostingOffset(postingtuple); |
| |
| _bt_buildadd(wstate, state, postingtuple, truncextra); |
| pfree(postingtuple); |
| } |
| |
| dstate->nmaxitems = 0; |
| dstate->nhtids = 0; |
| dstate->nitems = 0; |
| dstate->phystupsize = 0; |
| } |
| |
| /* |
| * Finish writing out the completed btree. |
| */ |
| static void |
| _bt_uppershutdown(BTWriteState *wstate, BTPageState *state) |
| { |
| BTPageState *s; |
| BlockNumber rootblkno = P_NONE; |
| uint32 rootlevel = 0; |
| Page metapage; |
| |
| /* |
| * Each iteration of this loop completes one more level of the tree. |
| */ |
| for (s = state; s != NULL; s = s->btps_next) |
| { |
| BlockNumber blkno; |
| BTPageOpaque opaque; |
| |
| blkno = s->btps_blkno; |
| opaque = BTPageGetOpaque(s->btps_page); |
| |
| /* |
| * We have to link the last page on this level to somewhere. |
| * |
| * If we're at the top, it's the root, so attach it to the metapage. |
| * Otherwise, add an entry for it to its parent using its low key. |
| * This may cause the last page of the parent level to split, but |
| * that's not a problem -- we haven't gotten to it yet. |
| */ |
| if (s->btps_next == NULL) |
| { |
| opaque->btpo_flags |= BTP_ROOT; |
| rootblkno = blkno; |
| rootlevel = s->btps_level; |
| } |
| else |
| { |
| Assert((BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) <= |
| IndexRelationGetNumberOfKeyAttributes(wstate->index) && |
| BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) > 0) || |
| P_LEFTMOST(opaque)); |
| Assert(BTreeTupleGetNAtts(s->btps_lowkey, wstate->index) == 0 || |
| !P_LEFTMOST(opaque)); |
| BTreeTupleSetDownLink(s->btps_lowkey, blkno); |
| _bt_buildadd(wstate, s->btps_next, s->btps_lowkey, 0); |
| pfree(s->btps_lowkey); |
| s->btps_lowkey = NULL; |
| } |
| |
| /* |
| * This is the rightmost page, so the ItemId array needs to be slid |
| * back one slot. Then we can dump out the page. |
| */ |
| _bt_slideleft(s->btps_page); |
| _bt_blwritepage(wstate, s->btps_page, s->btps_blkno); |
| s->btps_page = NULL; /* writepage freed the workspace */ |
| } |
| |
| /* |
| * As the last step in the process, construct the metapage and make it |
| * point to the new root (unless we had no data at all, in which case it's |
| * set to point to "P_NONE"). This changes the index to the "valid" state |
| * by filling in a valid magic number in the metapage. |
| */ |
| metapage = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); |
| _bt_initmetapage(metapage, rootblkno, rootlevel, |
| wstate->inskey->allequalimage); |
| _bt_blwritepage(wstate, metapage, BTREE_METAPAGE); |
| } |
| |
| /* |
| * Read tuples in correct sort order from tuplesort, and load them into |
| * btree leaves. |
| */ |
| static void |
| _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) |
| { |
| BTPageState *state = NULL; |
| bool merge = (btspool2 != NULL); |
| IndexTuple itup, |
| itup2 = NULL; |
| bool load1; |
| TupleDesc tupdes = RelationGetDescr(wstate->index); |
| int i, |
| keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index); |
| SortSupport sortKeys; |
| int64 tuples_done = 0; |
| bool deduplicate; |
| |
| deduplicate = wstate->inskey->allequalimage && !btspool->isunique && |
| BTGetDeduplicateItems(wstate->index); |
| |
| if (merge) |
| { |
| /* |
| * Another BTSpool for dead tuples exists. Now we have to merge |
| * btspool and btspool2. |
| */ |
| |
| /* prepare for the merge by fetching the first tuple from each spool */ |
| itup = tuplesort_getindextuple(btspool->sortstate, true); |
| itup2 = tuplesort_getindextuple(btspool2->sortstate, true); |
| |
| /* Prepare SortSupport data for each column */ |
| sortKeys = (SortSupport) palloc0(keysz * sizeof(SortSupportData)); |
| |
| for (i = 0; i < keysz; i++) |
| { |
| SortSupport sortKey = sortKeys + i; |
| ScanKey scanKey = wstate->inskey->scankeys + i; |
| int16 strategy; |
| |
| sortKey->ssup_cxt = CurrentMemoryContext; |
| sortKey->ssup_collation = scanKey->sk_collation; |
| sortKey->ssup_nulls_first = |
| (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0; |
| sortKey->ssup_attno = scanKey->sk_attno; |
| /* Abbreviation is not supported here */ |
| sortKey->abbreviate = false; |
| |
| Assert(sortKey->ssup_attno != 0); |
| |
| strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ? |
| BTGreaterStrategyNumber : BTLessStrategyNumber; |
| |
| PrepareSortSupportFromIndexRel(wstate->index, strategy, sortKey); |
| } |
| |
| for (;;) |
| { |
| load1 = true; /* load BTSpool next ? */ |
| if (itup2 == NULL) |
| { |
| if (itup == NULL) |
| break; |
| } |
| else if (itup != NULL) |
| { |
| int32 compare = 0; |
| |
| for (i = 1; i <= keysz; i++) |
| { |
| SortSupport entry; |
| Datum attrDatum1, |
| attrDatum2; |
| bool isNull1, |
| isNull2; |
| |
| entry = sortKeys + i - 1; |
| attrDatum1 = index_getattr(itup, i, tupdes, &isNull1); |
| attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2); |
| |
| compare = ApplySortComparator(attrDatum1, isNull1, |
| attrDatum2, isNull2, |
| entry); |
| if (compare > 0) |
| { |
| load1 = false; |
| break; |
| } |
| else if (compare < 0) |
| break; |
| } |
| |
| /* |
| * If key values are equal, we sort on ItemPointer. This is |
| * required for btree indexes, since heap TID is treated as an |
| * implicit last key attribute in order to ensure that all |
| * keys in the index are physically unique. |
| */ |
| if (compare == 0) |
| { |
| compare = ItemPointerCompare(&itup->t_tid, &itup2->t_tid); |
| Assert(compare != 0); |
| if (compare > 0) |
| load1 = false; |
| } |
| } |
| else |
| load1 = false; |
| |
| /* When we see first tuple, create first index page */ |
| if (state == NULL) |
| state = _bt_pagestate(wstate, 0); |
| |
| if (load1) |
| { |
| _bt_buildadd(wstate, state, itup, 0); |
| itup = tuplesort_getindextuple(btspool->sortstate, true); |
| } |
| else |
| { |
| _bt_buildadd(wstate, state, itup2, 0); |
| itup2 = tuplesort_getindextuple(btspool2->sortstate, true); |
| } |
| |
| /* Report progress */ |
| pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, |
| ++tuples_done); |
| } |
| pfree(sortKeys); |
| } |
| else if (deduplicate) |
| { |
| /* merge is unnecessary, deduplicate into posting lists */ |
| BTDedupState dstate; |
| |
| dstate = (BTDedupState) palloc(sizeof(BTDedupStateData)); |
| dstate->deduplicate = true; /* unused */ |
| dstate->nmaxitems = 0; /* unused */ |
| dstate->maxpostingsize = 0; /* set later */ |
| /* Metadata about base tuple of current pending posting list */ |
| dstate->base = NULL; |
| dstate->baseoff = InvalidOffsetNumber; /* unused */ |
| dstate->basetupsize = 0; |
| /* Metadata about current pending posting list TIDs */ |
| dstate->htids = NULL; |
| dstate->nhtids = 0; |
| dstate->nitems = 0; |
| dstate->phystupsize = 0; /* unused */ |
| dstate->nintervals = 0; /* unused */ |
| |
| while ((itup = tuplesort_getindextuple(btspool->sortstate, |
| true)) != NULL) |
| { |
| /* When we see first tuple, create first index page */ |
| if (state == NULL) |
| { |
| state = _bt_pagestate(wstate, 0); |
| |
| /* |
| * Limit size of posting list tuples to 1/10 space we want to |
| * leave behind on the page, plus space for final item's line |
| * pointer. This is equal to the space that we'd like to |
| * leave behind on each leaf page when fillfactor is 90, |
| * allowing us to get close to fillfactor% space utilization |
| * when there happen to be a great many duplicates. (This |
| * makes higher leaf fillfactor settings ineffective when |
| * building indexes that have many duplicates, but packing |
| * leaf pages full with few very large tuples doesn't seem |
| * like a useful goal.) |
| */ |
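| /* |
| * Worked example, assuming BLCKSZ = 8192, 8-byte MAXALIGN, and 4-byte |
| * ItemIdData: BLCKSZ * 10 / 100 = 819, MAXALIGN_DOWN(819) = 816, so |
| * maxpostingsize ends up as 816 - 4 = 812 bytes. |
| */ |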
| dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) - |
| sizeof(ItemIdData); |
| Assert(dstate->maxpostingsize <= BTMaxItemSize(state->btps_page) && |
| dstate->maxpostingsize <= INDEX_SIZE_MASK); |
| dstate->htids = palloc(dstate->maxpostingsize); |
| |
| /* start new pending posting list with itup copy */ |
| _bt_dedup_start_pending(dstate, CopyIndexTuple(itup), |
| InvalidOffsetNumber); |
| } |
| else if (_bt_keep_natts_fast(wstate->index, dstate->base, |
| itup) > keysz && |
| _bt_dedup_save_htid(dstate, itup)) |
| { |
| /* |
| * Tuple is equal to base tuple of pending posting list. Heap |
| * TID from itup has been saved in state. |
| */ |
| } |
| else |
| { |
| /* |
| * Tuple is not equal to pending posting list tuple, or |
| * _bt_dedup_save_htid() opted to not merge current item into |
| * pending posting list. |
| */ |
| _bt_sort_dedup_finish_pending(wstate, state, dstate); |
| pfree(dstate->base); |
| |
| /* start new pending posting list with itup copy */ |
| _bt_dedup_start_pending(dstate, CopyIndexTuple(itup), |
| InvalidOffsetNumber); |
| } |
| |
| /* Report progress */ |
| pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, |
| ++tuples_done); |
| } |
| |
| if (state) |
| { |
| /* |
| * Handle the last item (there must be a last item when the |
| * tuplesort returned one or more tuples) |
| */ |
| _bt_sort_dedup_finish_pending(wstate, state, dstate); |
| pfree(dstate->base); |
| pfree(dstate->htids); |
| } |
| |
| pfree(dstate); |
| } |
| else |
| { |
| /* merging and deduplication are both unnecessary */ |
| while ((itup = tuplesort_getindextuple(btspool->sortstate, |
| true)) != NULL) |
| { |
| /* When we see first tuple, create first index page */ |
| if (state == NULL) |
| state = _bt_pagestate(wstate, 0); |
| |
| _bt_buildadd(wstate, state, itup, 0); |
| |
| /* Report progress */ |
| pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, |
| ++tuples_done); |
| } |
| } |
| |
| /* Close down final pages and write the metapage */ |
| _bt_uppershutdown(wstate, state); |
| |
| /* |
| * When we WAL-logged index pages, we must nonetheless fsync index files. |
| * Since we're building outside shared buffers, a CHECKPOINT occurring |
| * during the build has no way to flush the previously written data to |
| * disk (indeed it won't know the index even exists). A crash later on |
| * would replay WAL from the checkpoint, therefore it wouldn't replay our |
| * earlier WAL entries. If we do not fsync those pages here, they might |
| * still not be on disk when the crash occurs. |
| */ |
| if (wstate->btws_use_wal) |
| smgrimmedsync(RelationGetSmgr(wstate->index), MAIN_FORKNUM); |
| } |
| |
| /* |
| * Create parallel context, and launch workers for leader. |
| * |
| * buildstate argument should be initialized (with the exception of the |
| * tuplesort state in spools, which may later be created based on shared |
| * state initially set up here). |
| * |
| * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY. |
| * |
| * request is the target number of parallel worker processes to launch. |
| * |
| * Sets buildstate's BTLeader, which caller must use to shut down parallel |
| * mode by passing it to _bt_end_parallel() at the very end of its index |
| * build. If not even a single worker process can be launched, this is |
| * never set, and caller should proceed with a serial index build. |
| */ |
| static void |
| _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) |
| { |
| ParallelContext *pcxt; |
| int scantuplesortstates; |
| Snapshot snapshot; |
| Size estbtshared; |
| Size estsort; |
| BTShared *btshared; |
| Sharedsort *sharedsort; |
| Sharedsort *sharedsort2; |
| BTSpool *btspool = buildstate->spool; |
| BTLeader *btleader = (BTLeader *) palloc0(sizeof(BTLeader)); |
| WalUsage *walusage; |
| BufferUsage *bufferusage; |
| bool leaderparticipates = true; |
| int querylen; |
| |
| #ifdef DISABLE_LEADER_PARTICIPATION |
| leaderparticipates = false; |
| #endif |
| |
| /* |
| * Enter parallel mode, and create context for parallel build of btree |
| * index |
| */ |
| EnterParallelMode(); |
| Assert(request > 0); |
| pcxt = CreateParallelContext("postgres", "_bt_parallel_build_main", |
| request); |
| |
| scantuplesortstates = leaderparticipates ? request + 1 : request; |
| |
| /* |
| * Prepare for scan of the base relation. In a normal index build, we use |
| * SnapshotAny because we must retrieve all tuples and do our own time |
| * qual checks (because we have to index RECENTLY_DEAD tuples). In a |
| * concurrent build, we take a regular MVCC snapshot and index whatever's |
| * live according to that. |
| */ |
| if (!isconcurrent) |
| snapshot = SnapshotAny; |
| else |
| snapshot = RegisterSnapshot(GetTransactionSnapshot()); |
| |
| /* |
| * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and |
| * PARALLEL_KEY_TUPLESORT tuplesort workspace |
| */ |
| estbtshared = _bt_parallel_estimate_shared(btspool->heap, snapshot); |
| shm_toc_estimate_chunk(&pcxt->estimator, estbtshared); |
| estsort = tuplesort_estimate_shared(scantuplesortstates); |
| shm_toc_estimate_chunk(&pcxt->estimator, estsort); |
| |
| /* |
| * Unique case requires a second spool, and so we may have to account for |
| * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2 |
| */ |
| if (!btspool->isunique) |
| shm_toc_estimate_keys(&pcxt->estimator, 2); |
| else |
| { |
| shm_toc_estimate_chunk(&pcxt->estimator, estsort); |
| shm_toc_estimate_keys(&pcxt->estimator, 3); |
| } |
| |
| /* |
| * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE |
| * and PARALLEL_KEY_BUFFER_USAGE. |
| * |
| * If there are no extensions loaded that care, we could skip this. We |
| * have no way of knowing whether anyone's looking at pgWalUsage or |
| * pgBufferUsage, so do it unconditionally. |
| */ |
| shm_toc_estimate_chunk(&pcxt->estimator, |
| mul_size(sizeof(WalUsage), pcxt->nworkers)); |
| shm_toc_estimate_keys(&pcxt->estimator, 1); |
| shm_toc_estimate_chunk(&pcxt->estimator, |
| mul_size(sizeof(BufferUsage), pcxt->nworkers)); |
| shm_toc_estimate_keys(&pcxt->estimator, 1); |
| |
| /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */ |
| if (debug_query_string) |
| { |
| querylen = strlen(debug_query_string); |
| shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1); |
| shm_toc_estimate_keys(&pcxt->estimator, 1); |
| } |
| else |
| querylen = 0; /* keep compiler quiet */ |
| |
| /* Everyone's had a chance to ask for space, so now create the DSM */ |
| InitializeParallelDSM(pcxt); |
| |
| /* If no DSM segment was available, back out (do serial build) */ |
| if (pcxt->seg == NULL) |
| { |
| if (IsMVCCSnapshot(snapshot)) |
| UnregisterSnapshot(snapshot); |
| DestroyParallelContext(pcxt); |
| ExitParallelMode(); |
| return; |
| } |
| |
| /* Store shared build state, for which we reserved space */ |
| btshared = (BTShared *) shm_toc_allocate(pcxt->toc, estbtshared); |
| /* Initialize immutable state */ |
| btshared->heaprelid = RelationGetRelid(btspool->heap); |
| btshared->indexrelid = RelationGetRelid(btspool->index); |
| btshared->isunique = btspool->isunique; |
| btshared->nulls_not_distinct = btspool->nulls_not_distinct; |
| btshared->isconcurrent = isconcurrent; |
| btshared->scantuplesortstates = scantuplesortstates; |
| ConditionVariableInit(&btshared->workersdonecv); |
| SpinLockInit(&btshared->mutex); |
| /* Initialize mutable state */ |
| btshared->nparticipantsdone = 0; |
| btshared->reltuples = 0.0; |
| btshared->havedead = false; |
| btshared->indtuples = 0.0; |
| btshared->brokenhotchain = false; |
| table_parallelscan_initialize(btspool->heap, |
| ParallelTableScanFromBTShared(btshared), |
| snapshot); |
| |
| /* |
| * Store shared tuplesort-private state, for which we reserved space. |
| * Then, initialize opaque state using tuplesort routine. |
| */ |
| sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort); |
| tuplesort_initialize_shared(sharedsort, scantuplesortstates, |
| pcxt->seg); |
| |
| shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared); |
| shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort); |
| |
| /* Unique case requires a second spool, and associated shared state */ |
| if (!btspool->isunique) |
| sharedsort2 = NULL; |
| else |
| { |
| /* |
| * Store additional shared tuplesort-private state, for which we |
| * reserved space. Then, initialize opaque state using tuplesort |
| * routine. |
| */ |
| sharedsort2 = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort); |
| tuplesort_initialize_shared(sharedsort2, scantuplesortstates, |
| pcxt->seg); |
| |
| shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT_SPOOL2, sharedsort2); |
| } |
| |
| /* Store query string for workers */ |
| if (debug_query_string) |
| { |
| char *sharedquery; |
| |
| sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1); |
| memcpy(sharedquery, debug_query_string, querylen + 1); |
| shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery); |
| } |
| |
| /* |
| * Allocate space for each worker's WalUsage and BufferUsage; no need to |
| * initialize. |
| */ |
| walusage = shm_toc_allocate(pcxt->toc, |
| mul_size(sizeof(WalUsage), pcxt->nworkers)); |
| shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage); |
| bufferusage = shm_toc_allocate(pcxt->toc, |
| mul_size(sizeof(BufferUsage), pcxt->nworkers)); |
| shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage); |
| |
| /* Launch workers, saving status for leader/caller */ |
| LaunchParallelWorkers(pcxt); |
| btleader->pcxt = pcxt; |
| btleader->nparticipanttuplesorts = pcxt->nworkers_launched; |
| if (leaderparticipates) |
| btleader->nparticipanttuplesorts++; |
| btleader->btshared = btshared; |
| btleader->sharedsort = sharedsort; |
| btleader->sharedsort2 = sharedsort2; |
| btleader->snapshot = snapshot; |
| btleader->walusage = walusage; |
| btleader->bufferusage = bufferusage; |
| |
| /* If no workers were successfully launched, back out (do serial build) */ |
| if (pcxt->nworkers_launched == 0) |
| { |
| _bt_end_parallel(btleader); |
| return; |
| } |
| |
| /* Save leader state now that it's clear build will be parallel */ |
| buildstate->btleader = btleader; |
| |
| /* Join heap scan ourselves */ |
| if (leaderparticipates) |
| _bt_leader_participate_as_worker(buildstate); |
| |
| /* |
| * Caller needs to wait for all launched workers when we return. Make |
| * sure that the failure-to-start case will not hang forever. |
| */ |
| WaitForParallelWorkersToAttach(pcxt); |
| } |
| |
| /* |
| * Shut down workers, destroy parallel context, and end parallel mode. |
| */ |
| static void |
| _bt_end_parallel(BTLeader *btleader) |
| { |
| int i; |
| |
| /* Shutdown worker processes */ |
| WaitForParallelWorkersToFinish(btleader->pcxt); |
| |
| /* |
| * Next, accumulate WAL and buffer usage. (This must wait for the workers |
| * to finish, or we might get incomplete data.) |
| */ |
| for (i = 0; i < btleader->pcxt->nworkers_launched; i++) |
| InstrAccumParallelQuery(&btleader->bufferusage[i], &btleader->walusage[i]); |
| |
| /* Free last reference to MVCC snapshot, if one was used */ |
| if (IsMVCCSnapshot(btleader->snapshot)) |
| UnregisterSnapshot(btleader->snapshot); |
| DestroyParallelContext(btleader->pcxt); |
| ExitParallelMode(); |
| } |
| |
| /* |
| * Returns size of shared memory required to store state for a parallel |
| * btree index build based on the snapshot its parallel scan will use. |
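| * |
| * The caller pairs this estimate with a matching TOC reservation, roughly |
| * (a sketch of what _bt_begin_parallel() does above): |
| * |
| * estbtshared = _bt_parallel_estimate_shared(btspool->heap, snapshot); |
| * shm_toc_estimate_chunk(&pcxt->estimator, estbtshared); |
| * shm_toc_estimate_keys(&pcxt->estimator, 1); |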
| */ |
| static Size |
| _bt_parallel_estimate_shared(Relation heap, Snapshot snapshot) |
| { |
| /* cf. shm_toc_allocate as to why BUFFERALIGN is used */ |
| return add_size(BUFFERALIGN(sizeof(BTShared)), |
| table_parallelscan_estimate(heap, snapshot)); |
| } |
| |
| /* |
| * Within leader, wait for end of heap scan. |
| * |
| * When called, the parallel heap scan started by _bt_begin_parallel() will |
| * already be underway within worker processes (when the leader participates |
| * as a worker, we should end up here just as workers are finishing). |
| * |
| * Fills in fields needed for ambuild statistics, and lets caller set |
| * field indicating that some worker encountered a broken HOT chain. |
| * |
| * Returns the total number of heap tuples scanned. |
| */ |
| static double |
| _bt_parallel_heapscan(BTBuildState *buildstate, bool *brokenhotchain) |
| { |
| BTShared *btshared = buildstate->btleader->btshared; |
| int nparticipanttuplesorts; |
| double reltuples; |
| |
| nparticipanttuplesorts = buildstate->btleader->nparticipanttuplesorts; |
| for (;;) |
| { |
| SpinLockAcquire(&btshared->mutex); |
| if (btshared->nparticipantsdone == nparticipanttuplesorts) |
| { |
| buildstate->havedead = btshared->havedead; |
| buildstate->indtuples = btshared->indtuples; |
| *brokenhotchain = btshared->brokenhotchain; |
| reltuples = btshared->reltuples; |
| SpinLockRelease(&btshared->mutex); |
| break; |
| } |
| SpinLockRelease(&btshared->mutex); |
| |
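| /* |
| * Wait to be signalled by a participant; recheck the done-counter under |
| * the spinlock after waking, since condition-variable wakeups can be |
| * spurious. |
| */ |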
| ConditionVariableSleep(&btshared->workersdonecv, |
| WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN); |
| } |
| |
| ConditionVariableCancelSleep(); |
| |
| return reltuples; |
| } |
| |
| /* |
| * Within leader, participate as a parallel worker. |
| */ |
| static void |
| _bt_leader_participate_as_worker(BTBuildState *buildstate) |
| { |
| BTLeader *btleader = buildstate->btleader; |
| BTSpool *leaderworker; |
| BTSpool *leaderworker2; |
| int sortmem; |
| |
| /* Allocate memory and initialize private spool */ |
| leaderworker = (BTSpool *) palloc0(sizeof(BTSpool)); |
| leaderworker->heap = buildstate->spool->heap; |
| leaderworker->index = buildstate->spool->index; |
| leaderworker->isunique = buildstate->spool->isunique; |
| leaderworker->nulls_not_distinct = buildstate->spool->nulls_not_distinct; |
| |
| /* Initialize second spool, if required */ |
| if (!btleader->btshared->isunique) |
| leaderworker2 = NULL; |
| else |
| { |
| /* Allocate memory for worker's own private secondary spool */ |
| leaderworker2 = (BTSpool *) palloc0(sizeof(BTSpool)); |
| |
| /* Initialize worker's own secondary spool */ |
| leaderworker2->heap = leaderworker->heap; |
| leaderworker2->index = leaderworker->index; |
| leaderworker2->isunique = false; |
| } |
| |
| /* |
| * Might as well use a reliable figure when doling out maintenance_work_mem |
| * (when the requested number of workers were not launched, this will be |
| * somewhat higher than it is for other workers). |
| */ |
| sortmem = maintenance_work_mem / btleader->nparticipanttuplesorts; |
| |
| /* Perform work common to all participants */ |
| _bt_parallel_scan_and_sort(leaderworker, leaderworker2, btleader->btshared, |
| btleader->sharedsort, btleader->sharedsort2, |
| sortmem, true); |
| |
| #ifdef BTREE_BUILD_STATS |
| if (log_btree_build_stats) |
| { |
| ShowUsage("BTREE BUILD (Leader Partial Spool) STATISTICS"); |
| ResetUsage(); |
| } |
| #endif /* BTREE_BUILD_STATS */ |
| } |
| |
| /* |
| * Perform work within a launched parallel process. |
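| * |
| * This is the entry point that _bt_begin_parallel() registers via |
| * CreateParallelContext(); the parallel machinery launches each worker |
| * on this function, passing the DSM segment and TOC the leader set up. |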
| */ |
| void |
| _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) |
| { |
| char *sharedquery; |
| BTSpool *btspool; |
| BTSpool *btspool2; |
| BTShared *btshared; |
| Sharedsort *sharedsort; |
| Sharedsort *sharedsort2; |
| Relation heapRel; |
| Relation indexRel; |
| LOCKMODE heapLockmode; |
| LOCKMODE indexLockmode; |
| WalUsage *walusage; |
| BufferUsage *bufferusage; |
| int sortmem; |
| |
| #ifdef BTREE_BUILD_STATS |
| if (log_btree_build_stats) |
| ResetUsage(); |
| #endif /* BTREE_BUILD_STATS */ |
| |
| /* |
| * The only possible status flag that can be set for the parallel worker is |
| * PROC_IN_SAFE_IC. |
| */ |
| Assert((MyProc->statusFlags == 0) || |
| (MyProc->statusFlags == PROC_IN_SAFE_IC)); |
| |
| /* Set debug_query_string for individual workers first */ |
| sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true); |
| debug_query_string = sharedquery; |
| |
| /* Report the query string from leader */ |
| pgstat_report_activity(STATE_RUNNING, debug_query_string); |
| |
| /* Look up nbtree shared state */ |
| btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false); |
| |
| /* Open relations using lock modes known to be obtained by index.c */ |
| if (!btshared->isconcurrent) |
| { |
| heapLockmode = ShareLock; |
| indexLockmode = AccessExclusiveLock; |
| } |
| else |
| { |
| heapLockmode = ShareUpdateExclusiveLock; |
| indexLockmode = RowExclusiveLock; |
| } |
| |
| /* Open relations within worker */ |
| heapRel = table_open(btshared->heaprelid, heapLockmode); |
| indexRel = index_open(btshared->indexrelid, indexLockmode); |
| |
| /* Initialize worker's own spool */ |
| btspool = (BTSpool *) palloc0(sizeof(BTSpool)); |
| btspool->heap = heapRel; |
| btspool->index = indexRel; |
| btspool->isunique = btshared->isunique; |
| btspool->nulls_not_distinct = btshared->nulls_not_distinct; |
| |
| /* Look up shared state private to tuplesort.c */ |
| sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false); |
| tuplesort_attach_shared(sharedsort, seg); |
| if (!btshared->isunique) |
| { |
| btspool2 = NULL; |
| sharedsort2 = NULL; |
| } |
| else |
| { |
| /* Allocate memory for worker's own private secondary spool */ |
| btspool2 = (BTSpool *) palloc0(sizeof(BTSpool)); |
| |
| /* Initialize worker's own secondary spool */ |
| btspool2->heap = btspool->heap; |
| btspool2->index = btspool->index; |
| btspool2->isunique = false; |
| /* Look up shared state private to tuplesort.c */ |
| sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false); |
| tuplesort_attach_shared(sharedsort2, seg); |
| } |
| |
| /* Prepare to track buffer/WAL usage during parallel execution */ |
| InstrStartParallelQuery(); |
| |
| /* Perform sorting of spool, and possibly a spool2 */ |
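| /* |
| * Workers divide maintenance_work_mem by the participant count that was |
| * requested up front (scantuplesortstates), not by the number actually |
| * launched, which is not available here; cf. the leader-side computation |
| * in _bt_leader_participate_as_worker(). |
| */ |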
| sortmem = maintenance_work_mem / btshared->scantuplesortstates; |
| _bt_parallel_scan_and_sort(btspool, btspool2, btshared, sharedsort, |
| sharedsort2, sortmem, false); |
| |
| /* Report WAL/buffer usage during parallel execution */ |
| bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); |
| walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false); |
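| /* |
| * ParallelWorkerNumber selects this worker's slot in the arrays the |
| * leader allocated (one element per requested worker). |
| */ |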
| InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber], |
| &walusage[ParallelWorkerNumber]); |
| |
| #ifdef BTREE_BUILD_STATS |
| if (log_btree_build_stats) |
| { |
| ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS"); |
| ResetUsage(); |
| } |
| #endif /* BTREE_BUILD_STATS */ |
| |
| index_close(indexRel, indexLockmode); |
| table_close(heapRel, heapLockmode); |
| } |
| |
| /* |
| * Perform a worker's portion of a parallel sort. |
| * |
| * This generates a tuplesort for the passed btspool, and a second tuplesort |
| * state if a second btspool is needed (i.e. for unique index builds). All |
| * other spool fields should already be set when this is called. |
| * |
| * sortmem is the amount of working memory to use within each worker, |
| * expressed in KBs. |
| * |
| * When this returns, workers are done, and need only release resources. |
| */ |
| static void |
| _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, |
| BTShared *btshared, Sharedsort *sharedsort, |
| Sharedsort *sharedsort2, int sortmem, bool progress) |
| { |
| SortCoordinate coordinate; |
| BTBuildState buildstate; |
| TableScanDesc scan; |
| double reltuples; |
| IndexInfo *indexInfo; |
| |
| /* Initialize local tuplesort coordination state */ |
| coordinate = palloc0(sizeof(SortCoordinateData)); |
| coordinate->isWorker = true; |
| coordinate->nParticipants = -1; |
| coordinate->sharedsort = sharedsort; |
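| /* |
| * isWorker = true and nParticipants = -1 mark this as worker-side state; |
| * only the leader's own coordination state (set up when it begins its |
| * leader tuplesort) carries the actual participant count. |
| */ |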
| |
| /* Begin "partial" tuplesort */ |
| btspool->sortstate = tuplesort_begin_index_btree(btspool->heap, |
| btspool->index, |
| btspool->isunique, |
| btspool->nulls_not_distinct, |
| sortmem, coordinate, |
| TUPLESORT_NONE); |
| |
| /* |
| * Just as with the serial case, there may be a second spool. If so, a |
| * second, dedicated spool2 partial tuplesort is required. |
| */ |
| if (btspool2) |
| { |
| SortCoordinate coordinate2; |
| |
| /* |
| * We expect that the second one (for dead tuples) won't get very |
| * full, so we give it only work_mem (unless sortmem is less for this |
| * worker). Worker processes are generally permitted to allocate |
| * work_mem independently. |
| */ |
| coordinate2 = palloc0(sizeof(SortCoordinateData)); |
| coordinate2->isWorker = true; |
| coordinate2->nParticipants = -1; |
| coordinate2->sharedsort = sharedsort2; |
| btspool2->sortstate = |
| tuplesort_begin_index_btree(btspool->heap, btspool->index, false, false, |
| Min(sortmem, work_mem), coordinate2, |
| TUPLESORT_NONE); |
| } |
| |
| /* Fill in buildstate for _bt_build_callback() */ |
| buildstate.isunique = btshared->isunique; |
| buildstate.nulls_not_distinct = btshared->nulls_not_distinct; |
| buildstate.havedead = false; |
| buildstate.heap = btspool->heap; |
| buildstate.spool = btspool; |
| buildstate.spool2 = btspool2; |
| buildstate.indtuples = 0; |
| buildstate.btleader = NULL; |
| |
| /* Join parallel scan */ |
| indexInfo = BuildIndexInfo(btspool->index); |
| indexInfo->ii_Concurrent = btshared->isconcurrent; |
| scan = table_beginscan_parallel(btspool->heap, |
| ParallelTableScanFromBTShared(btshared)); |
| reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo, |
| true, progress, _bt_build_callback, |
| (void *) &buildstate, scan); |
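| /* |
| * reltuples is just this participant's portion of the heap; every |
| * participant's contribution is summed under btshared->mutex below. |
| */ |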
| |
| /* Execute this worker's part of the sort */ |
| if (progress) |
| pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, |
| PROGRESS_BTREE_PHASE_PERFORMSORT_1); |
| tuplesort_performsort(btspool->sortstate); |
| if (btspool2) |
| { |
| if (progress) |
| pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, |
| PROGRESS_BTREE_PHASE_PERFORMSORT_2); |
| tuplesort_performsort(btspool2->sortstate); |
| } |
| |
| /* |
| * Done. Record ambuild statistics, and whether we encountered a broken |
| * HOT chain. |
| */ |
| SpinLockAcquire(&btshared->mutex); |
| btshared->nparticipantsdone++; |
| btshared->reltuples += reltuples; |
| if (buildstate.havedead) |
| btshared->havedead = true; |
| btshared->indtuples += buildstate.indtuples; |
| if (indexInfo->ii_BrokenHotChain) |
| btshared->brokenhotchain = true; |
| SpinLockRelease(&btshared->mutex); |
| |
| /* Notify leader */ |
| ConditionVariableSignal(&btshared->workersdonecv); |
| |
| /* We can end tuplesorts immediately */ |
| tuplesort_end(btspool->sortstate); |
| if (btspool2) |
| tuplesort_end(btspool2->sortstate); |
| } |