| /*------------------------------------------------------------------------- |
| * |
| * ginfast.c |
| * Fast insert routines for the Postgres inverted index access method. |
| * Pending entries are stored in a linear list of pages. Later on |
| * (typically during VACUUM), ginInsertCleanup() will be invoked to |
| * transfer pending entries into the regular index structure. This |
| * wins because bulk insertion is much more efficient than retail |
| * (one-at-a-time) insertion. |
| * |
| * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * IDENTIFICATION |
| * src/backend/access/gin/ginfast.c |
| * |
| *------------------------------------------------------------------------- |
| */ |
| |
| #include "postgres.h" |
| |
| #include "access/gin_private.h" |
| #include "access/ginxlog.h" |
| #include "access/xlog.h" |
| #include "access/xloginsert.h" |
| #include "catalog/pg_am.h" |
| #include "commands/vacuum.h" |
| #include "miscadmin.h" |
| #include "port/pg_bitutils.h" |
| #include "postmaster/autovacuum.h" |
| #include "storage/indexfsm.h" |
| #include "storage/lmgr.h" |
| #include "storage/predicate.h" |
| #include "utils/acl.h" |
| #include "utils/builtins.h" |
| #include "utils/memutils.h" |
| #include "utils/rel.h" |
| |
| /* GUC parameter: maximum size of a GIN index's pending list, in kilobytes */ |
| int gin_pending_list_limit = 0; |
| |
| #define GIN_PAGE_FREESIZE \ |
| ( BLCKSZ - MAXALIGN(SizeOfPageHeaderData) - MAXALIGN(sizeof(GinPageOpaqueData)) ) |
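| /* |
| * For illustration: with the default 8192-byte BLCKSZ and 8-byte MAXALIGN, |
| * SizeOfPageHeaderData maxaligns to 24 bytes and GinPageOpaqueData to 8, |
| * giving 8192 - 24 - 8 = 8160 usable bytes per pending-list page. |
| */ |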
| |
| typedef struct KeyArray |
| { |
| Datum *keys; /* expansible array */ |
| GinNullCategory *categories; /* another expansible array */ |
| int32 nvalues; /* current number of valid entries */ |
| int32 maxvalues; /* allocated size of arrays */ |
| } KeyArray; |
| |
| |
| /* |
| * Build a pending-list page from the given array of tuples, and write it out. |
| * |
| * Returns the amount of free space left on the page. |
| */ |
| static int32 |
| writeListPage(Relation index, Buffer buffer, |
| IndexTuple *tuples, int32 ntuples, BlockNumber rightlink) |
| { |
| Page page = BufferGetPage(buffer); |
| int32 i, |
| freesize, |
| size = 0; |
| OffsetNumber l, |
| off; |
| PGAlignedBlock workspace; |
| char *ptr; |
| |
| START_CRIT_SECTION(); |
| |
| GinInitBuffer(buffer, GIN_LIST); |
| |
| off = FirstOffsetNumber; |
| ptr = workspace.data; |
| |
| for (i = 0; i < ntuples; i++) |
| { |
| int this_size = IndexTupleSize(tuples[i]); |
| |
| memcpy(ptr, tuples[i], this_size); |
| ptr += this_size; |
| size += this_size; |
| |
| l = PageAddItem(page, (Item) tuples[i], this_size, off, false, false); |
| |
| if (l == InvalidOffsetNumber) |
| elog(ERROR, "failed to add item to index page in \"%s\"", |
| RelationGetRelationName(index)); |
| |
| off++; |
| } |
| |
| Assert(size <= BLCKSZ); /* else we overran workspace */ |
| |
| GinPageGetOpaque(page)->rightlink = rightlink; |
| |
| /* |
| * The tail page may contain only whole row(s), or the final part of a row |
| * begun on previous pages (a "row" here meaning all the index tuples |
| * generated for one heap tuple). Note that on pending-list pages the |
| * opaque maxoff field counts heap rows rather than line pointers, so a |
| * row is counted (maxoff = 1) only on the page where it ends. |
| */ |
| if (rightlink == InvalidBlockNumber) |
| { |
| GinPageSetFullRow(page); |
| GinPageGetOpaque(page)->maxoff = 1; |
| } |
| else |
| { |
| GinPageGetOpaque(page)->maxoff = 0; |
| } |
| |
| MarkBufferDirty(buffer); |
| |
| if (RelationNeedsWAL(index)) |
| { |
| ginxlogInsertListPage data; |
| XLogRecPtr recptr; |
| |
| data.rightlink = rightlink; |
| data.ntuples = ntuples; |
| |
| XLogBeginInsert(); |
| XLogRegisterData((char *) &data, sizeof(ginxlogInsertListPage)); |
| |
| XLogRegisterBuffer(0, buffer, REGBUF_WILL_INIT); |
| XLogRegisterBufData(0, workspace.data, size); |
| |
| recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_INSERT_LISTPAGE); |
| PageSetLSN(page, recptr); |
| } |
| |
| /* get free space before releasing buffer */ |
| freesize = PageGetExactFreeSpace(page); |
| |
| UnlockReleaseBuffer(buffer); |
| |
| END_CRIT_SECTION(); |
| |
| return freesize; |
| } |
| |
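| /* |
| * Build a new sublist of pending-list pages from the given array of index |
| * tuples (all generated from a single heap tuple), writing as many pages |
| * as needed. On return, *res describes the sublist: head and tail block |
| * numbers, number of pages, number of heap tuples (always 1), and the |
| * free space remaining on the tail page. The caller must have zeroed |
| * *res beforehand, since nPendingPages is only incremented here. |
| */ |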
| static void |
| makeSublist(Relation index, IndexTuple *tuples, int32 ntuples, |
| GinMetaPageData *res) |
| { |
| Buffer curBuffer = InvalidBuffer; |
| Buffer prevBuffer = InvalidBuffer; |
| int i, |
| size = 0, |
| tupsize; |
| int startTuple = 0; |
| |
| Assert(ntuples > 0); |
| |
| /* |
| * Split tuples into pages |
| */ |
| for (i = 0; i < ntuples; i++) |
| { |
| if (curBuffer == InvalidBuffer) |
| { |
| curBuffer = GinNewBuffer(index); |
| |
| if (prevBuffer != InvalidBuffer) |
| { |
| res->nPendingPages++; |
| writeListPage(index, prevBuffer, |
| tuples + startTuple, |
| i - startTuple, |
| BufferGetBlockNumber(curBuffer)); |
| } |
| else |
| { |
| res->head = BufferGetBlockNumber(curBuffer); |
| } |
| |
| prevBuffer = curBuffer; |
| startTuple = i; |
| size = 0; |
| } |
| |
| tupsize = MAXALIGN(IndexTupleSize(tuples[i])) + sizeof(ItemIdData); |
| |
| if (size + tupsize > GinListPageSize) |
| { |
| /* won't fit, force a new page and reprocess */ |
| i--; |
| curBuffer = InvalidBuffer; |
| } |
| else |
| { |
| size += tupsize; |
| } |
| } |
| |
| /* |
| * Write last page |
| */ |
| res->tail = BufferGetBlockNumber(curBuffer); |
| res->tailFreeSize = writeListPage(index, curBuffer, |
| tuples + startTuple, |
| ntuples - startTuple, |
| InvalidBlockNumber); |
| res->nPendingPages++; |
| /* that was only one heap tuple */ |
| res->nPendingHeapTuples = 1; |
| } |
| |
| /* |
| * Write the index tuples contained in *collector into the index's |
| * pending list. |
| * |
| * This function guarantees that all these tuples will be inserted |
| * consecutively, preserving their order. |
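| * |
| * For illustration only -- a sketch of the expected calling pattern (cf. |
| * gininsert()), not verbatim source; values/isnull/ht_ctid come from the |
| * caller's aminsert arguments: |
| * |
| *     GinTupleCollector collector; |
| * |
| *     memset(&collector, 0, sizeof(GinTupleCollector)); |
| *     for (i = 0; i < ginstate->origTupdesc->natts; i++) |
| *         ginHeapTupleFastCollect(ginstate, &collector, |
| *                                 (OffsetNumber) (i + 1), |
| *                                 values[i], isnull[i], ht_ctid); |
| *     ginHeapTupleFastInsert(ginstate, &collector); |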
| */ |
| void |
| ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector) |
| { |
| Relation index = ginstate->index; |
| Buffer metabuffer; |
| Page metapage; |
| GinMetaPageData *metadata = NULL; |
| Buffer buffer = InvalidBuffer; |
| Page page = NULL; |
| ginxlogUpdateMeta data; |
| bool separateList = false; |
| bool needCleanup = false; |
| int cleanupSize; |
| bool needWal; |
| |
| if (collector->ntuples == 0) |
| return; |
| |
| needWal = RelationNeedsWAL(index); |
| |
| data.locator = index->rd_locator; |
| data.ntuples = 0; |
| data.newRightlink = data.prevTail = InvalidBlockNumber; |
| |
| metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO); |
| metapage = BufferGetPage(metabuffer); |
| |
| /* |
| * An insertion to the pending list could logically belong anywhere in the |
| * tree, so it conflicts with all serializable scans. All scans acquire a |
| * predicate lock on the metabuffer to represent that. Therefore we'll |
| * call CheckForSerializableConflictIn(), but not until we have the page |
| * locked and are ready to modify it. |
| */ |
| |
| if (collector->sumsize + collector->ntuples * sizeof(ItemIdData) > GinListPageSize) |
| { |
| /* |
| * Total size is greater than one page => make sublist |
| */ |
| separateList = true; |
| } |
| else |
| { |
| LockBuffer(metabuffer, GIN_EXCLUSIVE); |
| metadata = GinPageGetMeta(metapage); |
| |
| if (metadata->head == InvalidBlockNumber || |
| collector->sumsize + collector->ntuples * sizeof(ItemIdData) > metadata->tailFreeSize) |
| { |
| /* |
| * The pending list is empty, or the total size exceeds the free space |
| * on the tail page => make a sublist |
| * |
| * We unlock the metabuffer meanwhile, to preserve concurrency |
| */ |
| separateList = true; |
| LockBuffer(metabuffer, GIN_UNLOCK); |
| } |
| } |
| |
| if (separateList) |
| { |
| /* |
| * Build the sublist separately, then append it to the list's tail |
| */ |
| GinMetaPageData sublist; |
| |
| memset(&sublist, 0, sizeof(GinMetaPageData)); |
| makeSublist(index, collector->tuples, collector->ntuples, &sublist); |
| |
| /* |
| * metapage was unlocked, see above |
| */ |
| LockBuffer(metabuffer, GIN_EXCLUSIVE); |
| metadata = GinPageGetMeta(metapage); |
| |
| CheckForSerializableConflictIn(index, NULL, GIN_METAPAGE_BLKNO); |
| |
| if (metadata->head == InvalidBlockNumber) |
| { |
| /* |
| * The main list is empty, so just install the sublist as the main list |
| */ |
| START_CRIT_SECTION(); |
| |
| metadata->head = sublist.head; |
| metadata->tail = sublist.tail; |
| metadata->tailFreeSize = sublist.tailFreeSize; |
| |
| metadata->nPendingPages = sublist.nPendingPages; |
| metadata->nPendingHeapTuples = sublist.nPendingHeapTuples; |
| |
| if (needWal) |
| XLogBeginInsert(); |
| } |
| else |
| { |
| /* |
| * Merge lists |
| */ |
| data.prevTail = metadata->tail; |
| data.newRightlink = sublist.head; |
| |
| buffer = ReadBuffer(index, metadata->tail); |
| LockBuffer(buffer, GIN_EXCLUSIVE); |
| page = BufferGetPage(buffer); |
| |
| Assert(GinPageGetOpaque(page)->rightlink == InvalidBlockNumber); |
| |
| START_CRIT_SECTION(); |
| |
| GinPageGetOpaque(page)->rightlink = sublist.head; |
| |
| MarkBufferDirty(buffer); |
| |
| metadata->tail = sublist.tail; |
| metadata->tailFreeSize = sublist.tailFreeSize; |
| |
| metadata->nPendingPages += sublist.nPendingPages; |
| metadata->nPendingHeapTuples += sublist.nPendingHeapTuples; |
| |
| if (needWal) |
| { |
| XLogBeginInsert(); |
| XLogRegisterBuffer(1, buffer, REGBUF_STANDARD); |
| } |
| } |
| } |
| else |
| { |
| /* |
| * Insert into the tail page. The metapage is already locked |
| */ |
| OffsetNumber l, |
| off; |
| int i, |
| tupsize; |
| char *ptr; |
| char *collectordata; |
| |
| CheckForSerializableConflictIn(index, NULL, GIN_METAPAGE_BLKNO); |
| |
| buffer = ReadBuffer(index, metadata->tail); |
| LockBuffer(buffer, GIN_EXCLUSIVE); |
| page = BufferGetPage(buffer); |
| |
| off = (PageIsEmpty(page)) ? FirstOffsetNumber : |
| OffsetNumberNext(PageGetMaxOffsetNumber(page)); |
| |
| collectordata = ptr = (char *) palloc(collector->sumsize); |
| |
| data.ntuples = collector->ntuples; |
| |
| START_CRIT_SECTION(); |
| |
| if (needWal) |
| XLogBeginInsert(); |
| |
| /* |
| * Increment the heap-tuple counters (both page-local and index-wide) |
| */ |
| Assert(GinPageGetOpaque(page)->maxoff <= metadata->nPendingHeapTuples); |
| GinPageGetOpaque(page)->maxoff++; |
| metadata->nPendingHeapTuples++; |
| |
| for (i = 0; i < collector->ntuples; i++) |
| { |
| tupsize = IndexTupleSize(collector->tuples[i]); |
| l = PageAddItem(page, (Item) collector->tuples[i], tupsize, off, false, false); |
| |
| if (l == InvalidOffsetNumber) |
| elog(ERROR, "failed to add item to index page in \"%s\"", |
| RelationGetRelationName(index)); |
| |
| memcpy(ptr, collector->tuples[i], tupsize); |
| ptr += tupsize; |
| |
| off++; |
| } |
| |
| Assert((ptr - collectordata) <= collector->sumsize); |
| if (needWal) |
| { |
| XLogRegisterBuffer(1, buffer, REGBUF_STANDARD); |
| XLogRegisterBufData(1, collectordata, collector->sumsize); |
| } |
| |
| metadata->tailFreeSize = PageGetExactFreeSpace(page); |
| |
| MarkBufferDirty(buffer); |
| } |
| |
| /* |
| * Set pd_lower just past the end of the metadata. This is essential, |
| * because without doing so, metadata will be lost if xlog.c compresses |
| * the page. (We must do this here because pre-v11 versions of PG did not |
| * set the metapage's pd_lower correctly, so a pg_upgraded index might |
| * contain the wrong value.) |
| */ |
| ((PageHeader) metapage)->pd_lower = |
| ((char *) metadata + sizeof(GinMetaPageData)) - (char *) metapage; |
| |
| /* |
| * Write metabuffer, make xlog entry |
| */ |
| MarkBufferDirty(metabuffer); |
| |
| if (needWal) |
| { |
| XLogRecPtr recptr; |
| |
| memcpy(&data.metadata, metadata, sizeof(GinMetaPageData)); |
| |
| XLogRegisterBuffer(0, metabuffer, REGBUF_WILL_INIT | REGBUF_STANDARD); |
| XLogRegisterData((char *) &data, sizeof(ginxlogUpdateMeta)); |
| |
| recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_UPDATE_META_PAGE); |
| PageSetLSN(metapage, recptr); |
| |
| if (buffer != InvalidBuffer) |
| { |
| PageSetLSN(page, recptr); |
| } |
| } |
| |
| if (buffer != InvalidBuffer) |
| UnlockReleaseBuffer(buffer); |
| |
| /* |
| * Force pending-list cleanup when the list becomes too long. Since |
| * ginInsertCleanup() can take a significant amount of time, we prefer to |
| * call it when it can do all the work in a single collection cycle. In |
| * non-vacuum mode it shouldn't require maintenance_work_mem, so fire it |
| * while the pending list is still small enough to fit into |
| * gin_pending_list_limit. |
| * |
| * ginInsertCleanup() should not be called inside our CRIT_SECTION. |
| */ |
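| /* |
| * For example, assuming 8 kB blocks (GIN_PAGE_FREESIZE = 8160) and the |
| * default gin_pending_list_limit of 4 MB, cleanup is requested once the |
| * list grows past 4096 * 1024 / 8160, i.e. roughly 514 pending pages. |
| */ |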
| cleanupSize = GinGetPendingListCleanupSize(index); |
| if (metadata->nPendingPages * GIN_PAGE_FREESIZE > cleanupSize * 1024L) |
| needCleanup = true; |
| |
| UnlockReleaseBuffer(metabuffer); |
| |
| END_CRIT_SECTION(); |
| |
| /* |
| * Since this could contend with a concurrent cleanup process, we perform |
| * a non-forced cleanup here: if another backend already holds the cleanup |
| * lock, we simply skip it. |
| */ |
| if (needCleanup) |
| ginInsertCleanup(ginstate, false, true, false, NULL); |
| } |
| |
| /* |
| * Create temporary index tuples for a single indexable item (one index column |
| * for the heap tuple specified by ht_ctid), and append them to the array |
| * in *collector. They will subsequently be written out using |
| * ginHeapTupleFastInsert. Note that to guarantee consistent state, all |
| * temp tuples for a given heap tuple must be written in one call to |
| * ginHeapTupleFastInsert. |
| */ |
| void |
| ginHeapTupleFastCollect(GinState *ginstate, |
| GinTupleCollector *collector, |
| OffsetNumber attnum, Datum value, bool isNull, |
| ItemPointer ht_ctid) |
| { |
| Datum *entries; |
| GinNullCategory *categories; |
| int32 i, |
| nentries; |
| |
| /* |
| * Extract the key values that need to be inserted in the index |
| */ |
| entries = ginExtractEntries(ginstate, attnum, value, isNull, |
| &nentries, &categories); |
| |
| /* |
| * Protect against integer overflow in allocation calculations |
| */ |
| if (nentries < 0 || |
| collector->ntuples + nentries > MaxAllocSize / sizeof(IndexTuple)) |
| elog(ERROR, "too many entries for GIN index"); |
| |
| /* |
| * Allocate/reallocate memory for storing collected tuples |
| */ |
| if (collector->tuples == NULL) |
| { |
| /* |
| * Determine the number of elements to allocate in the tuples array |
| * initially. Make it a power of 2 to avoid wasting memory when |
| * resizing (since palloc likes powers of 2). |
| */ |
| collector->lentuples = pg_nextpower2_32(Max(16, nentries)); |
| collector->tuples = palloc_array(IndexTuple, collector->lentuples); |
| } |
| else if (collector->lentuples < collector->ntuples + nentries) |
| { |
| /* |
| * Advance lentuples to the next suitable power of 2. This won't |
| * overflow, though we could get to a value that exceeds |
| * MaxAllocSize/sizeof(IndexTuple), causing an error in repalloc. |
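| * |
| * For illustration: pg_nextpower2_32(17) == 32, so crossing a power-of-2 |
| * boundary doubles the array instead of growing it one slot per tuple. |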
| */ |
| collector->lentuples = pg_nextpower2_32(collector->ntuples + nentries); |
| collector->tuples = repalloc_array(collector->tuples, |
| IndexTuple, collector->lentuples); |
| } |
| |
| /* |
| * Build an index tuple for each key value, and add to array. In pending |
| * tuples we just stick the heap TID into t_tid. |
| */ |
| for (i = 0; i < nentries; i++) |
| { |
| IndexTuple itup; |
| |
| itup = GinFormTuple(ginstate, attnum, entries[i], categories[i], |
| NULL, 0, 0, true); |
| itup->t_tid = *ht_ctid; |
| collector->tuples[collector->ntuples++] = itup; |
| collector->sumsize += IndexTupleSize(itup); |
| } |
| } |
| |
| /* |
| * Deletes pending-list pages up to (but not including) the newHead page. |
| * If newHead == InvalidBlockNumber, the function deletes the whole list. |
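| * |
| * Pages are deleted in batches of at most GIN_NDELETE_AT_ONCE, which bounds |
| * both the number of buffers exclusive-locked at once and the number of |
| * pages registered in a single WAL record. |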
| * |
| * metapage is pinned and exclusive-locked throughout this function. |
| */ |
| static void |
| shiftList(Relation index, Buffer metabuffer, BlockNumber newHead, |
| bool fill_fsm, IndexBulkDeleteResult *stats) |
| { |
| Page metapage; |
| GinMetaPageData *metadata; |
| BlockNumber blknoToDelete; |
| |
| metapage = BufferGetPage(metabuffer); |
| metadata = GinPageGetMeta(metapage); |
| blknoToDelete = metadata->head; |
| |
| do |
| { |
| Page page; |
| int i; |
| int64 nDeletedHeapTuples = 0; |
| ginxlogDeleteListPages data; |
| Buffer buffers[GIN_NDELETE_AT_ONCE]; |
| BlockNumber freespace[GIN_NDELETE_AT_ONCE]; |
| |
| data.ndeleted = 0; |
| while (data.ndeleted < GIN_NDELETE_AT_ONCE && blknoToDelete != newHead) |
| { |
| freespace[data.ndeleted] = blknoToDelete; |
| buffers[data.ndeleted] = ReadBuffer(index, blknoToDelete); |
| LockBuffer(buffers[data.ndeleted], GIN_EXCLUSIVE); |
| page = BufferGetPage(buffers[data.ndeleted]); |
| |
| data.ndeleted++; |
| |
| Assert(!GinPageIsDeleted(page)); |
| |
| nDeletedHeapTuples += GinPageGetOpaque(page)->maxoff; |
| blknoToDelete = GinPageGetOpaque(page)->rightlink; |
| } |
| |
| if (stats) |
| stats->pages_deleted += data.ndeleted; |
| |
| /* |
| * This operation touches an unusually large number of pages, so |
| * prepare the XLogInsert machinery for that before entering the |
| * critical section. |
| */ |
| if (RelationNeedsWAL(index)) |
| XLogEnsureRecordSpace(data.ndeleted, 0); |
| |
| START_CRIT_SECTION(); |
| |
| metadata->head = blknoToDelete; |
| |
| Assert(metadata->nPendingPages >= data.ndeleted); |
| metadata->nPendingPages -= data.ndeleted; |
| Assert(metadata->nPendingHeapTuples >= nDeletedHeapTuples); |
| metadata->nPendingHeapTuples -= nDeletedHeapTuples; |
| |
| if (blknoToDelete == InvalidBlockNumber) |
| { |
| metadata->tail = InvalidBlockNumber; |
| metadata->tailFreeSize = 0; |
| metadata->nPendingPages = 0; |
| metadata->nPendingHeapTuples = 0; |
| } |
| |
| /* |
| * Set pd_lower just past the end of the metadata. This is essential, |
| * because without doing so, metadata will be lost if xlog.c |
| * compresses the page. (We must do this here because pre-v11 |
| * versions of PG did not set the metapage's pd_lower correctly, so a |
| * pg_upgraded index might contain the wrong value.) |
| */ |
| ((PageHeader) metapage)->pd_lower = |
| ((char *) metadata + sizeof(GinMetaPageData)) - (char *) metapage; |
| |
| MarkBufferDirty(metabuffer); |
| |
| for (i = 0; i < data.ndeleted; i++) |
| { |
| page = BufferGetPage(buffers[i]); |
| GinPageGetOpaque(page)->flags = GIN_DELETED; |
| MarkBufferDirty(buffers[i]); |
| } |
| |
| if (RelationNeedsWAL(index)) |
| { |
| XLogRecPtr recptr; |
| |
| XLogBeginInsert(); |
| XLogRegisterBuffer(0, metabuffer, |
| REGBUF_WILL_INIT | REGBUF_STANDARD); |
| for (i = 0; i < data.ndeleted; i++) |
| XLogRegisterBuffer(i + 1, buffers[i], REGBUF_WILL_INIT); |
| |
| memcpy(&data.metadata, metadata, sizeof(GinMetaPageData)); |
| |
| XLogRegisterData((char *) &data, |
| sizeof(ginxlogDeleteListPages)); |
| |
| recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_DELETE_LISTPAGE); |
| PageSetLSN(metapage, recptr); |
| |
| for (i = 0; i < data.ndeleted; i++) |
| { |
| page = BufferGetPage(buffers[i]); |
| PageSetLSN(page, recptr); |
| } |
| } |
| |
| for (i = 0; i < data.ndeleted; i++) |
| UnlockReleaseBuffer(buffers[i]); |
| |
| END_CRIT_SECTION(); |
| |
| for (i = 0; fill_fsm && i < data.ndeleted; i++) |
| RecordFreeIndexPage(index, freespace[i]); |
| |
| } while (blknoToDelete != newHead); |
| } |
| |
| /* Initialize empty KeyArray */ |
| static void |
| initKeyArray(KeyArray *keys, int32 maxvalues) |
| { |
| keys->keys = palloc_array(Datum, maxvalues); |
| keys->categories = palloc_array(GinNullCategory, maxvalues); |
| keys->nvalues = 0; |
| keys->maxvalues = maxvalues; |
| } |
| |
| /* Add datum to KeyArray, resizing if needed */ |
| static void |
| addDatum(KeyArray *keys, Datum datum, GinNullCategory category) |
| { |
| if (keys->nvalues >= keys->maxvalues) |
| { |
| keys->maxvalues *= 2; |
| keys->keys = repalloc_array(keys->keys, Datum, keys->maxvalues); |
| keys->categories = repalloc_array(keys->categories, GinNullCategory, keys->maxvalues); |
| } |
| |
| keys->keys[keys->nvalues] = datum; |
| keys->categories[keys->nvalues] = category; |
| keys->nvalues++; |
| } |
| |
| /* |
| * Collect data from a pending-list page in preparation for insertion into |
| * the main index. |
| * |
| * Go through all tuples >= startoff on page and collect values in accum |
| * |
| * Note that ka is just workspace --- it does not carry any state across |
| * calls. |
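| * |
| * Tuples on a pending page are grouped by heap TID and attribute number; |
| * the boundary checks below rely on that grouping to batch all keys for a |
| * given (heap tuple, column) pair into one ginInsertBAEntries() call. |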
| */ |
| static void |
| processPendingPage(BuildAccumulator *accum, KeyArray *ka, |
| Page page, OffsetNumber startoff) |
| { |
| ItemPointerData heapptr; |
| OffsetNumber i, |
| maxoff; |
| OffsetNumber attrnum; |
| |
| /* reset *ka to empty */ |
| ka->nvalues = 0; |
| |
| maxoff = PageGetMaxOffsetNumber(page); |
| Assert(maxoff >= FirstOffsetNumber); |
| ItemPointerSetInvalid(&heapptr); |
| attrnum = 0; |
| |
| for (i = startoff; i <= maxoff; i = OffsetNumberNext(i)) |
| { |
| IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, i)); |
| OffsetNumber curattnum; |
| Datum curkey; |
| GinNullCategory curcategory; |
| |
| /* Check for change of heap TID or attnum */ |
| curattnum = gintuple_get_attrnum(accum->ginstate, itup); |
| |
| if (!ItemPointerIsValid(&heapptr)) |
| { |
| heapptr = itup->t_tid; |
| attrnum = curattnum; |
| } |
| else if (!(ItemPointerEquals(&heapptr, &itup->t_tid) && |
| curattnum == attrnum)) |
| { |
| /* |
| * ginInsertBAEntries can insert several datums per call, but only |
| * for one heap tuple and one column. So call it at a boundary, |
| * and reset ka. |
| */ |
| ginInsertBAEntries(accum, &heapptr, attrnum, |
| ka->keys, ka->categories, ka->nvalues); |
| ka->nvalues = 0; |
| heapptr = itup->t_tid; |
| attrnum = curattnum; |
| } |
| |
| /* Add key to KeyArray */ |
| curkey = gintuple_get_key(accum->ginstate, itup, &curcategory); |
| addDatum(ka, curkey, curcategory); |
| } |
| |
| /* Dump out all remaining keys */ |
| ginInsertBAEntries(accum, &heapptr, attrnum, |
| ka->keys, ka->categories, ka->nvalues); |
| } |
| |
| /* |
| * Move tuples from pending pages into regular GIN structure. |
| * |
| * At first glance this looks completely non-crash-safe. But if we crash |
| * after posting entries to the main index and before removing them from the |
| * pending list, it's okay: when we redo the posting later on, nothing bad |
| * will happen. |
| * |
| * fill_fsm indicates whether ginInsertCleanup should add deleted pages to |
| * the FSM; otherwise the caller is responsible for putting deleted pages |
| * into the FSM. |
| * |
| * If stats isn't null, we count deleted pending pages into the counts. |
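| * |
| * full_clean requests draining the entire pending list rather than stopping |
| * at the page that was the tail when cleanup started. forceCleanup makes us |
| * wait for the cleanup lock instead of giving up when another backend holds |
| * it; within this file, insert-triggered cleanup passes false and |
| * gin_clean_pending_list() passes true. |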
| */ |
| void |
| ginInsertCleanup(GinState *ginstate, bool full_clean, |
| bool fill_fsm, bool forceCleanup, |
| IndexBulkDeleteResult *stats) |
| { |
| Relation index = ginstate->index; |
| Buffer metabuffer, |
| buffer; |
| Page metapage, |
| page; |
| GinMetaPageData *metadata; |
| MemoryContext opCtx, |
| oldCtx; |
| BuildAccumulator accum; |
| KeyArray datums; |
| BlockNumber blkno, |
| blknoFinish; |
| bool cleanupFinish = false; |
| bool fsm_vac = false; |
| Size workMemory; |
| |
| /* |
| * We would like to prevent concurrent cleanup processes. To do so, we |
| * lock the metapage in exclusive mode using a LockPage() call. Nothing |
| * else uses that lock on the metapage, so concurrent insertion into the |
| * pending list remains possible. |
| */ |
| |
| if (forceCleanup) |
| { |
| /* |
| * We are called from [auto]vacuum/analyze or gin_clean_pending_list() |
| * and would like to wait for any concurrent cleanup to finish. |
| */ |
| LockPage(index, GIN_METAPAGE_BLKNO, ExclusiveLock); |
| workMemory = |
| (IsAutoVacuumWorkerProcess() && autovacuum_work_mem != -1) ? |
| autovacuum_work_mem : maintenance_work_mem; |
| } |
| else |
| { |
| /* |
| * We are called from a regular insert; if we see a concurrent cleanup |
| * in progress, just exit in the hope that the concurrent process will |
| * clean up the pending list. |
| */ |
| if (!ConditionalLockPage(index, GIN_METAPAGE_BLKNO, ExclusiveLock)) |
| return; |
| workMemory = work_mem; |
| } |
| |
| metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO); |
| LockBuffer(metabuffer, GIN_SHARE); |
| metapage = BufferGetPage(metabuffer); |
| metadata = GinPageGetMeta(metapage); |
| |
| if (metadata->head == InvalidBlockNumber) |
| { |
| /* Nothing to do */ |
| UnlockReleaseBuffer(metabuffer); |
| UnlockPage(index, GIN_METAPAGE_BLKNO, ExclusiveLock); |
| return; |
| } |
| |
| /* |
| * Remember the current tail page to prevent infinite cleanup if other |
| * backends add new tuples faster than we can clean them up. |
| */ |
| blknoFinish = metadata->tail; |
| |
| /* |
| * Read and lock head of pending list |
| */ |
| blkno = metadata->head; |
| buffer = ReadBuffer(index, blkno); |
| LockBuffer(buffer, GIN_SHARE); |
| page = BufferGetPage(buffer); |
| |
| LockBuffer(metabuffer, GIN_UNLOCK); |
| |
| /* |
| * Initialize. All temporary space will be in opCtx |
| */ |
| opCtx = AllocSetContextCreate(CurrentMemoryContext, |
| "GIN insert cleanup temporary context", |
| ALLOCSET_DEFAULT_SIZES); |
| |
| oldCtx = MemoryContextSwitchTo(opCtx); |
| |
| initKeyArray(&datums, 128); |
| ginInitBA(&accum); |
| accum.ginstate = ginstate; |
| |
| /* |
| * At the top of this loop, we have pin and lock on the current page of |
| * the pending list. However, we'll release that before exiting the loop. |
| * Note we also have pin but not lock on the metapage. |
| */ |
| for (;;) |
| { |
| Assert(!GinPageIsDeleted(page)); |
| |
| /* |
| * Have we reached the page that was the list's tail when we started |
| * our cleanup? If so, this pass is the last. But if the caller asked |
| * us to clean up the whole pending list, ignore the old tail; we will |
| * work until the list becomes empty. |
| */ |
| if (blkno == blknoFinish && full_clean == false) |
| cleanupFinish = true; |
| |
| /* |
| * read page's datums into accum |
| */ |
| processPendingPage(&accum, &datums, page, FirstOffsetNumber); |
| |
| vacuum_delay_point(); |
| |
| /* |
| * Is it time to flush memory to disk? Flush if we are at the end of |
| * the pending list, or if we have a full row and memory is getting |
| * full. |
| */ |
| if (GinPageGetOpaque(page)->rightlink == InvalidBlockNumber || |
| (GinPageHasFullRow(page) && |
| (accum.allocatedMemory >= workMemory * 1024L))) |
| { |
| ItemPointerData *list; |
| uint32 nlist; |
| Datum key; |
| GinNullCategory category; |
| OffsetNumber maxoff, |
| attnum; |
| |
| /* |
| * Unlock the current page for better concurrency. Any concurrent |
| * additions to the page will be detected later, by comparing maxoff |
| * after the memory flush completes. |
| */ |
| maxoff = PageGetMaxOffsetNumber(page); |
| LockBuffer(buffer, GIN_UNLOCK); |
| |
| /* |
| * Moving the collected data into the regular structure can take a |
| * significant amount of time, so do it without holding a lock on the |
| * pending list. |
| */ |
| ginBeginBAScan(&accum); |
| while ((list = ginGetBAEntry(&accum, |
| &attnum, &key, &category, &nlist)) != NULL) |
| { |
| ginEntryInsert(ginstate, attnum, key, category, |
| list, nlist, NULL); |
| vacuum_delay_point(); |
| } |
| |
| /* |
| * Lock the whole list to remove pages |
| */ |
| LockBuffer(metabuffer, GIN_EXCLUSIVE); |
| LockBuffer(buffer, GIN_SHARE); |
| |
| Assert(!GinPageIsDeleted(page)); |
| |
| /* |
| * While we left the page unlocked, more stuff might have gotten |
| * added to it. If so, process those entries immediately. There |
| * shouldn't be very many, so we don't worry about the fact that |
| * we're doing this with exclusive lock held. The insertion |
| * algorithm guarantees that inserted row(s) will not continue onto |
| * the next page. NOTE: intentionally no vacuum_delay_point in this |
| * loop. |
| */ |
| if (PageGetMaxOffsetNumber(page) != maxoff) |
| { |
| ginInitBA(&accum); |
| processPendingPage(&accum, &datums, page, maxoff + 1); |
| |
| ginBeginBAScan(&accum); |
| while ((list = ginGetBAEntry(&accum, |
| &attnum, &key, &category, &nlist)) != NULL) |
| ginEntryInsert(ginstate, attnum, key, category, |
| list, nlist, NULL); |
| } |
| |
| /* |
| * Remember next page - it will become the new list head |
| */ |
| blkno = GinPageGetOpaque(page)->rightlink; |
| UnlockReleaseBuffer(buffer); /* shiftList will do exclusive |
| * locking */ |
| |
| /* |
| * Remove the processed pages from the pending list; at this point |
| * all their contents are in the regular structure |
| */ |
| shiftList(index, metabuffer, blkno, fill_fsm, stats); |
| |
| /* At this point, some pending pages have been freed up */ |
| fsm_vac = true; |
| |
| Assert(blkno == metadata->head); |
| LockBuffer(metabuffer, GIN_UNLOCK); |
| |
| /* |
| * If we removed the whole pending list, or have cleaned up through |
| * the tail page we remembered at the start, we're done |
| */ |
| if (blkno == InvalidBlockNumber || cleanupFinish) |
| break; |
| |
| /* |
| * release memory used so far and reinit state |
| */ |
| MemoryContextReset(opCtx); |
| initKeyArray(&datums, datums.maxvalues); |
| ginInitBA(&accum); |
| } |
| else |
| { |
| blkno = GinPageGetOpaque(page)->rightlink; |
| UnlockReleaseBuffer(buffer); |
| } |
| |
| /* |
| * Read next page in pending list |
| */ |
| vacuum_delay_point(); |
| buffer = ReadBuffer(index, blkno); |
| LockBuffer(buffer, GIN_SHARE); |
| page = BufferGetPage(buffer); |
| } |
| |
| UnlockPage(index, GIN_METAPAGE_BLKNO, ExclusiveLock); |
| ReleaseBuffer(metabuffer); |
| |
| /* |
| * As pending list pages can have a high churn rate, it is desirable to |
| * recycle them immediately to the FreeSpaceMap when ordinary backends |
| * clean the list. |
| */ |
| if (fsm_vac && fill_fsm) |
| IndexFreeSpaceMapVacuum(index); |
| |
| /* Clean up temporary space */ |
| MemoryContextSwitchTo(oldCtx); |
| MemoryContextDelete(opCtx); |
| } |
| |
| /* |
| * SQL-callable function to clean the insert pending list |
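| * |
| * For example (index name hypothetical): |
| * |
| *     SELECT gin_clean_pending_list('my_gin_index'::regclass); |
| * |
| * The result is the number of pending-list pages deleted, as bigint. |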
| */ |
| Datum |
| gin_clean_pending_list(PG_FUNCTION_ARGS) |
| { |
| Oid indexoid = PG_GETARG_OID(0); |
| Relation indexRel = index_open(indexoid, RowExclusiveLock); |
| IndexBulkDeleteResult stats; |
| |
| if (RecoveryInProgress()) |
| ereport(ERROR, |
| (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), |
| errmsg("recovery is in progress"), |
| errhint("GIN pending list cannot be cleaned up during recovery."))); |
| |
| /* Must be a GIN index */ |
| if (indexRel->rd_rel->relkind != RELKIND_INDEX || |
| !IsIndexAccessMethod(indexRel->rd_rel->relam, GIN_AM_OID)) |
| ereport(ERROR, |
| (errcode(ERRCODE_WRONG_OBJECT_TYPE), |
| errmsg("\"%s\" is not a GIN index", |
| RelationGetRelationName(indexRel)))); |
| |
| /* |
| * Reject attempts to read non-local temporary relations; we would be |
| * likely to get wrong data since we have no visibility into the owning |
| * session's local buffers. |
| */ |
| if (RELATION_IS_OTHER_TEMP(indexRel)) |
| ereport(ERROR, |
| (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), |
| errmsg("cannot access temporary indexes of other sessions"))); |
| |
| /* User must own the index (comparable to privileges needed for VACUUM) */ |
| if (!object_ownercheck(RelationRelationId, indexoid, GetUserId())) |
| aclcheck_error(ACLCHECK_NOT_OWNER, OBJECT_INDEX, |
| RelationGetRelationName(indexRel)); |
| |
| memset(&stats, 0, sizeof(stats)); |
| |
| /* |
| * Can't assume anything about the content of an !indisready index. Make |
| * those a no-op, not an error, so users can just run this function on all |
| * indexes of the access method. Since an indisready&&!indisvalid index |
| * is merely awaiting missed aminsert calls, we're capable of processing |
| * it. Decline to do so, out of an abundance of caution. |
| */ |
| if (indexRel->rd_index->indisvalid) |
| { |
| GinState ginstate; |
| |
| initGinState(&ginstate, indexRel); |
| ginInsertCleanup(&ginstate, true, true, true, &stats); |
| } |
| else |
| ereport(DEBUG1, |
| (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), |
| errmsg("index \"%s\" is not valid", |
| RelationGetRelationName(indexRel)))); |
| |
| index_close(indexRel, RowExclusiveLock); |
| |
| PG_RETURN_INT64((int64) stats.pages_deleted); |
| } |