| /*------------------------------------------------------------------------- |
| * |
| * spgutils.c |
| * various support functions for SP-GiST |
| * |
| * |
| * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group |
| * Portions Copyright (c) 1994, Regents of the University of California |
| * |
| * IDENTIFICATION |
| * src/backend/access/spgist/spgutils.c |
| * |
| *------------------------------------------------------------------------- |
| */ |
| |
| #include "postgres.h" |
| |
| #include "access/amvalidate.h" |
| #include "access/htup_details.h" |
| #include "access/reloptions.h" |
| #include "access/spgist_private.h" |
| #include "access/toast_compression.h" |
| #include "access/transam.h" |
| #include "access/xact.h" |
| #include "catalog/pg_amop.h" |
| #include "commands/vacuum.h" |
| #include "nodes/nodeFuncs.h" |
| #include "parser/parse_coerce.h" |
| #include "storage/bufmgr.h" |
| #include "storage/indexfsm.h" |
| #include "storage/lmgr.h" |
| #include "utils/builtins.h" |
| #include "utils/catcache.h" |
| #include "utils/index_selfuncs.h" |
| #include "utils/lsyscache.h" |
| #include "utils/syscache.h" |
| |
| /* |
| * SP-GiST handler function: return IndexAmRoutine with access method parameters |
| * and callbacks. |
| */ |
| Datum |
| spghandler(PG_FUNCTION_ARGS) |
| { |
| IndexAmRoutine *amroutine = makeNode(IndexAmRoutine); |
| |
| amroutine->amstrategies = 0; |
| amroutine->amsupport = SPGISTNProc; |
| amroutine->amoptsprocnum = SPGIST_OPTIONS_PROC; |
| amroutine->amcanorder = false; |
| amroutine->amcanorderbyop = true; |
| amroutine->amcanbackward = false; |
| amroutine->amcanunique = false; |
| amroutine->amcanmulticol = false; |
| amroutine->amoptionalkey = true; |
| amroutine->amsearcharray = false; |
| amroutine->amsearchnulls = true; |
| amroutine->amstorage = true; |
| amroutine->amclusterable = false; |
| amroutine->ampredlocks = false; |
| amroutine->amcanparallel = false; |
| amroutine->amcaninclude = true; |
| amroutine->amusemaintenanceworkmem = false; |
| amroutine->amsummarizing = false; |
| amroutine->amparallelvacuumoptions = |
| VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_COND_CLEANUP; |
| amroutine->amkeytype = InvalidOid; |
| |
| amroutine->ambuild = spgbuild; |
| amroutine->ambuildempty = spgbuildempty; |
| amroutine->aminsert = spginsert; |
| amroutine->ambulkdelete = spgbulkdelete; |
| amroutine->amvacuumcleanup = spgvacuumcleanup; |
| amroutine->amcanreturn = spgcanreturn; |
| amroutine->amcostestimate = spgcostestimate; |
| amroutine->amoptions = spgoptions; |
| amroutine->amproperty = spgproperty; |
| amroutine->ambuildphasename = NULL; |
| amroutine->amvalidate = spgvalidate; |
| amroutine->amadjustmembers = spgadjustmembers; |
| amroutine->ambeginscan = spgbeginscan; |
| amroutine->amrescan = spgrescan; |
| amroutine->amgettuple = spggettuple; |
| amroutine->amgetbitmap = spggetbitmap; |
| amroutine->amendscan = spgendscan; |
| amroutine->ammarkpos = NULL; |
| amroutine->amrestrpos = NULL; |
| amroutine->amestimateparallelscan = NULL; |
| amroutine->aminitparallelscan = NULL; |
| amroutine->amparallelrescan = NULL; |
| |
| PG_RETURN_POINTER(amroutine); |
| } |
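| |
| /* |
| * (Illustration only: the core code reaches spghandler through the "spgist" |
| * row of pg_am, roughly as in |
| * |
| *    amroutine = GetIndexAmRoutine(amform->amhandler); |
| * |
| * where amform is an assumed Form_pg_am pointer; see amapi.c.) |
| */ |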
| |
| /* |
| * GetIndexInputType |
| * Determine the nominal input data type for an index column |
| * |
| * We define the "nominal" input type as the associated opclass's opcintype, |
| * or if that is a polymorphic type, the base type of the heap column or |
| * expression that is the index's input. The reason for preferring the |
| * opcintype is that non-polymorphic opclasses probably don't want to hear |
| * about binary-compatible input types. For instance, if a text opclass |
| * is being used with a varchar heap column, we want to report "text" not |
| * "varchar". Likewise, opclasses don't want to hear about domain types, |
| * so if we do consult the actual input type, we make sure to flatten domains. |
| * |
| * At some point maybe this should go somewhere else, but it's not clear |
| * if any other index AMs have a use for it. |
| */ |
| static Oid |
| GetIndexInputType(Relation index, AttrNumber indexcol) |
| { |
| Oid opcintype; |
| AttrNumber heapcol; |
| List *indexprs; |
| ListCell *indexpr_item; |
| |
| Assert(index->rd_index != NULL); |
| Assert(indexcol > 0 && indexcol <= index->rd_index->indnkeyatts); |
| opcintype = index->rd_opcintype[indexcol - 1]; |
| if (!IsPolymorphicType(opcintype)) |
| return opcintype; |
| heapcol = index->rd_index->indkey.values[indexcol - 1]; |
| if (heapcol != 0) /* Simple index column? */ |
| return getBaseType(get_atttype(index->rd_index->indrelid, heapcol)); |
| |
| /* |
| * If the index expressions are already cached, skip calling |
| * RelationGetIndexExpressions, as it will make a copy which is overkill. |
| * We're not going to modify the trees, and we're not going to do anything |
| * that would invalidate the relcache entry before we're done. |
| */ |
| if (index->rd_indexprs) |
| indexprs = index->rd_indexprs; |
| else |
| indexprs = RelationGetIndexExpressions(index); |
| indexpr_item = list_head(indexprs); |
| for (int i = 1; i <= index->rd_index->indnkeyatts; i++) |
| { |
| if (index->rd_index->indkey.values[i - 1] == 0) |
| { |
| /* expression column */ |
| if (indexpr_item == NULL) |
| elog(ERROR, "wrong number of index expressions"); |
| if (i == indexcol) |
| return getBaseType(exprType((Node *) lfirst(indexpr_item))); |
| indexpr_item = lnext(indexprs, indexpr_item); |
| } |
| } |
| elog(ERROR, "wrong number of index expressions"); |
| return InvalidOid; /* keep compiler quiet */ |
| } |
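| |
| /* |
| * A worked example (hypothetical; names are illustrative): for a |
| * non-polymorphic text opclass used on a varchar column, opcintype is the |
| * OID of text, so we return that immediately and never report "varchar". |
| * For a polymorphic opclass with opcintype anyarray on an int4[] column, |
| * we fall through and return the OID of int4[] from the heap column. |
| */ |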
| |
| /* Fill in a SpGistTypeDesc struct with info about the specified data type */ |
| static void |
| fillTypeDesc(SpGistTypeDesc *desc, Oid type) |
| { |
| HeapTuple tp; |
| Form_pg_type typtup; |
| |
| desc->type = type; |
| tp = SearchSysCache1(TYPEOID, ObjectIdGetDatum(type)); |
| if (!HeapTupleIsValid(tp)) |
| elog(ERROR, "cache lookup failed for type %u", type); |
| typtup = (Form_pg_type) GETSTRUCT(tp); |
| desc->attlen = typtup->typlen; |
| desc->attbyval = typtup->typbyval; |
| desc->attalign = typtup->typalign; |
| desc->attstorage = typtup->typstorage; |
| ReleaseSysCache(tp); |
| } |
| |
| /* |
| * Fetch local cache of AM-specific info about the index, initializing it |
| * if necessary |
| */ |
| SpGistCache * |
| spgGetCache(Relation index) |
| { |
| SpGistCache *cache; |
| |
| if (index->rd_amcache == NULL) |
| { |
| Oid atttype; |
| spgConfigIn in; |
| FmgrInfo *procinfo; |
| |
| cache = MemoryContextAllocZero(index->rd_indexcxt, |
| sizeof(SpGistCache)); |
| |
| /* SPGiST must have one key column and can also have INCLUDE columns */ |
| Assert(IndexRelationGetNumberOfKeyAttributes(index) == 1); |
| Assert(IndexRelationGetNumberOfAttributes(index) <= INDEX_MAX_KEYS); |
| |
| /* |
| * Get the actual (well, nominal) data type of the key column. We |
| * pass this to the opclass config function so that polymorphic |
| * opclasses are possible. |
| */ |
| atttype = GetIndexInputType(index, spgKeyColumn + 1); |
| |
| /* Call the config function to get config info for the opclass */ |
| in.attType = atttype; |
| |
| procinfo = index_getprocinfo(index, 1, SPGIST_CONFIG_PROC); |
| FunctionCall2Coll(procinfo, |
| index->rd_indcollation[spgKeyColumn], |
| PointerGetDatum(&in), |
| PointerGetDatum(&cache->config)); |
| |
| /* |
| * If leafType isn't specified, use the declared index column type, |
| * which index.c will have derived from the opclass's opcintype. |
| * (Although we now make spgvalidate.c warn if these aren't the same, |
| * old user-defined opclasses may not set the STORAGE parameter |
| * correctly, so believe leafType if it's given.) |
| */ |
| if (!OidIsValid(cache->config.leafType)) |
| { |
| cache->config.leafType = |
| TupleDescAttr(RelationGetDescr(index), spgKeyColumn)->atttypid; |
| |
| /* |
| * If index column type is binary-coercible to atttype (for |
| * example, it's a domain over atttype), treat it as plain atttype |
| * to avoid thinking we need to compress. |
| */ |
| if (cache->config.leafType != atttype && |
| IsBinaryCoercible(cache->config.leafType, atttype)) |
| cache->config.leafType = atttype; |
| } |
| |
| /* Get the information we need about each relevant datatype */ |
| fillTypeDesc(&cache->attType, atttype); |
| |
| if (cache->config.leafType != atttype) |
| { |
| if (!OidIsValid(index_getprocid(index, 1, SPGIST_COMPRESS_PROC))) |
| ereport(ERROR, |
| (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
| errmsg("compress method must be defined when leaf type is different from input type"))); |
| |
| fillTypeDesc(&cache->attLeafType, cache->config.leafType); |
| } |
| else |
| { |
| /* Save lookups in this common case */ |
| cache->attLeafType = cache->attType; |
| } |
| |
| fillTypeDesc(&cache->attPrefixType, cache->config.prefixType); |
| fillTypeDesc(&cache->attLabelType, cache->config.labelType); |
| |
| /* |
| * Finally, if it's a real index (not a partitioned one), get the |
| * lastUsedPages data from the metapage |
| */ |
| if (index->rd_rel->relkind != RELKIND_PARTITIONED_INDEX) |
| { |
| Buffer metabuffer; |
| SpGistMetaPageData *metadata; |
| |
| metabuffer = ReadBuffer(index, SPGIST_METAPAGE_BLKNO); |
| LockBuffer(metabuffer, BUFFER_LOCK_SHARE); |
| |
| metadata = SpGistPageGetMeta(BufferGetPage(metabuffer)); |
| |
| if (metadata->magicNumber != SPGIST_MAGIC_NUMBER) |
| elog(ERROR, "index \"%s\" is not an SP-GiST index", |
| RelationGetRelationName(index)); |
| |
| cache->lastUsedPages = metadata->lastUsedPages; |
| |
| UnlockReleaseBuffer(metabuffer); |
| } |
| |
| index->rd_amcache = (void *) cache; |
| } |
| else |
| { |
| /* assume it's up to date */ |
| cache = (SpGistCache *) index->rd_amcache; |
| } |
| |
| return cache; |
| } |
| |
| /* |
| * Compute a tuple descriptor for leaf tuples or index-only-scan result tuples. |
| * |
| * We can use the relcache's tupdesc as-is in many cases, and it's always |
| * OK so far as any INCLUDE columns are concerned. However, the entry for |
| * the key column has to match leafType in the first case or attType in the |
| * second case. While the relcache's tupdesc *should* show leafType, this |
| * might not hold for legacy user-defined opclasses, since before v14 they |
| * were not allowed to declare their true storage type in CREATE OPCLASS. |
| * Also, attType can be different from what is in the relcache. |
| * |
| * This function gives back either a pointer to the relcache's tupdesc |
| * if that is suitable, or a palloc'd copy that's been adjusted to match |
| * the specified key column type. We can avoid doing any catalog lookups |
| * here by insisting that the caller pass an SpGistTypeDesc not just an OID. |
| */ |
| TupleDesc |
| getSpGistTupleDesc(Relation index, SpGistTypeDesc *keyType) |
| { |
| TupleDesc outTupDesc; |
| Form_pg_attribute att; |
| |
| if (keyType->type == |
| TupleDescAttr(RelationGetDescr(index), spgKeyColumn)->atttypid) |
| outTupDesc = RelationGetDescr(index); |
| else |
| { |
| outTupDesc = CreateTupleDescCopy(RelationGetDescr(index)); |
| att = TupleDescAttr(outTupDesc, spgKeyColumn); |
| /* It's sufficient to update the type-dependent fields of the column */ |
| att->atttypid = keyType->type; |
| att->atttypmod = -1; |
| att->attlen = keyType->attlen; |
| att->attbyval = keyType->attbyval; |
| att->attalign = keyType->attalign; |
| att->attstorage = keyType->attstorage; |
| /* We shouldn't need to bother with making these valid: */ |
| att->attcompression = InvalidCompressionMethod; |
| att->attcollation = InvalidOid; |
| /* In case we changed typlen, we'd better reset following offsets */ |
| for (int i = spgFirstIncludeColumn; i < outTupDesc->natts; i++) |
| TupleDescAttr(outTupDesc, i)->attcacheoff = -1; |
| } |
| return outTupDesc; |
| } |
| |
| /* Initialize SpGistState for working with the given index */ |
| void |
| initSpGistState(SpGistState *state, Relation index) |
| { |
| SpGistCache *cache; |
| |
| state->index = index; |
| |
| /* Get cached static information about index */ |
| cache = spgGetCache(index); |
| |
| state->config = cache->config; |
| state->attType = cache->attType; |
| state->attLeafType = cache->attLeafType; |
| state->attPrefixType = cache->attPrefixType; |
| state->attLabelType = cache->attLabelType; |
| |
| /* Ensure we have a valid descriptor for leaf tuples */ |
| state->leafTupDesc = getSpGistTupleDesc(state->index, &state->attLeafType); |
| |
| /* Make workspace for constructing dead tuples */ |
| state->deadTupleStorage = palloc0(SGDTSIZE); |
| |
| /* |
| * Set horizon XID to use in redirection tuples. Use our own XID if we |
| * have one, else use InvalidTransactionId. The latter case can happen in |
| * VACUUM or REINDEX CONCURRENTLY, and in neither case would it be okay to |
| * force an XID to be assigned. VACUUM won't create any redirection |
| * tuples anyway, but REINDEX CONCURRENTLY can. Fortunately, REINDEX |
| * CONCURRENTLY doesn't mark the index valid until the end, so there could |
| * never be any concurrent scans "in flight" to a redirection tuple it has |
| * inserted. And it locks out VACUUM until the end, too. So it's okay |
| * for VACUUM to immediately expire a redirection tuple that contains an |
| * invalid xid. |
| */ |
| state->myXid = GetTopTransactionIdIfAny(); |
| |
| /* Assume we're not in an index build (spgbuild will override) */ |
| state->isBuild = false; |
| } |
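| |
| /* |
| * Typical usage, as a sketch (patterned on existing callers such as |
| * spginsert.c; variable names here are illustrative): |
| * |
| *    SpGistState spgstate; |
| * |
| *    initSpGistState(&spgstate, index); |
| *    leafTuple = spgFormLeafTuple(&spgstate, &heapTid, values, isnull); |
| *    ... insert the tuple; the workspace is freed with the calling context. |
| */ |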
| |
| /* |
| * Allocate a new page (either by recycling, or by extending the index file). |
| * |
| * The returned buffer is already pinned and exclusive-locked. |
| * Caller is responsible for initializing the page by calling SpGistInitBuffer. |
| */ |
| Buffer |
| SpGistNewBuffer(Relation index) |
| { |
| Buffer buffer; |
| |
| /* First, try to get a page from FSM */ |
| for (;;) |
| { |
| BlockNumber blkno = GetFreeIndexPage(index); |
| |
| if (blkno == InvalidBlockNumber) |
| break; /* nothing known to FSM */ |
| |
| /* |
| * The fixed pages shouldn't ever be listed in FSM, but just in case |
| * one is, ignore it. |
| */ |
| if (SpGistBlockIsFixed(blkno)) |
| continue; |
| |
| buffer = ReadBuffer(index, blkno); |
| |
| /* |
| * We have to guard against the possibility that someone else already |
| * recycled this page; the buffer may be locked if so. |
| */ |
| if (ConditionalLockBuffer(buffer)) |
| { |
| Page page = BufferGetPage(buffer); |
| |
| if (PageIsNew(page)) |
| return buffer; /* OK to use, if never initialized */ |
| |
| if (SpGistPageIsDeleted(page) || PageIsEmpty(page)) |
| return buffer; /* OK to use */ |
| |
| LockBuffer(buffer, BUFFER_LOCK_UNLOCK); |
| } |
| |
| /* Can't use it, so release buffer and try again */ |
| ReleaseBuffer(buffer); |
| } |
| |
| buffer = ExtendBufferedRel(BMR_REL(index), MAIN_FORKNUM, NULL, |
| EB_LOCK_FIRST); |
| |
| return buffer; |
| } |
| |
| /* |
| * Update index metapage's lastUsedPages info from local cache, if possible |
| * |
| * Updating the metapage isn't critical for correct index operation, so we |
| * 1) use ConditionalLockBuffer to improve concurrency, and |
| * 2) don't WAL-log metabuffer changes, to decrease WAL traffic. |
| */ |
| void |
| SpGistUpdateMetaPage(Relation index) |
| { |
| SpGistCache *cache = (SpGistCache *) index->rd_amcache; |
| |
| if (cache != NULL) |
| { |
| Buffer metabuffer; |
| |
| metabuffer = ReadBuffer(index, SPGIST_METAPAGE_BLKNO); |
| |
| if (ConditionalLockBuffer(metabuffer)) |
| { |
| Page metapage = BufferGetPage(metabuffer); |
| SpGistMetaPageData *metadata = SpGistPageGetMeta(metapage); |
| |
| metadata->lastUsedPages = cache->lastUsedPages; |
| |
| /* |
| * Set pd_lower just past the end of the metadata. This is |
| * essential, because without doing so, metadata will be lost if |
| * xlog.c compresses the page. (We must do this here because |
| * pre-v11 versions of PG did not set the metapage's pd_lower |
| * correctly, so a pg_upgraded index might contain the wrong |
| * value.) |
| */ |
| ((PageHeader) metapage)->pd_lower = |
| ((char *) metadata + sizeof(SpGistMetaPageData)) - (char *) metapage; |
| |
| MarkBufferDirty(metabuffer); |
| UnlockReleaseBuffer(metabuffer); |
| } |
| else |
| { |
| ReleaseBuffer(metabuffer); |
| } |
| } |
| } |
| |
| /* Macro to select proper element of lastUsedPages cache depending on flags */ |
| /* Masking flags with SPGIST_CACHED_PAGES is just for paranoia's sake */ |
| #define GET_LUP(c, f) (&(c)->lastUsedPages.cachedPage[((unsigned int) (f)) % SPGIST_CACHED_PAGES]) |
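| |
| /* |
| * For instance, given the GBUF_* flag definitions in spgist_private.h, a |
| * request for a leaf page of the nulls tree resolves as |
| * |
| *    lup = GET_LUP(cache, GBUF_LEAF | GBUF_NULLS); |
| * |
| * i.e. cachedPage[7]: slots 0-2 hold inner pages of the three parities, |
| * slot 3 leaf pages, and slots 4-7 the same set for the nulls tree. |
| */ |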
| |
| /* |
| * Allocate and initialize a new buffer of the type and parity specified by |
| * flags. The returned buffer is already pinned and exclusive-locked. |
| * |
| * When requesting an inner page, if we get one with the wrong parity, |
| * we just release the buffer and try again. We will get a different page |
| * because GetFreeIndexPage will have marked the page used in FSM. The page |
| * is entered in our local lastUsedPages cache, so there's some hope of |
| * making use of it later in this session, but otherwise we rely on VACUUM |
| * to eventually re-enter the page in FSM, making it available for recycling. |
| * Note that such a page does not get marked dirty here, so unless it's used |
| * fairly soon, the buffer will just get discarded and the page will remain |
| * as it was on disk. |
| * |
| * When we return a buffer to the caller, the page is *not* entered into |
| * the lastUsedPages cache; we expect the caller will do so after it's taken |
| * whatever space it will use. This is because after the caller has used up |
| * some space, the page might have less space than whatever was cached |
| * already, so we'd rather not trash the old cache entry. |
| */ |
| static Buffer |
| allocNewBuffer(Relation index, int flags) |
| { |
| SpGistCache *cache = spgGetCache(index); |
| uint16 pageflags = 0; |
| |
| if (GBUF_REQ_LEAF(flags)) |
| pageflags |= SPGIST_LEAF; |
| if (GBUF_REQ_NULLS(flags)) |
| pageflags |= SPGIST_NULLS; |
| |
| for (;;) |
| { |
| Buffer buffer; |
| |
| buffer = SpGistNewBuffer(index); |
| SpGistInitBuffer(buffer, pageflags); |
| |
| if (pageflags & SPGIST_LEAF) |
| { |
| /* Leaf pages have no parity concerns, so just use it */ |
| return buffer; |
| } |
| else |
| { |
| BlockNumber blkno = BufferGetBlockNumber(buffer); |
| int blkFlags = GBUF_INNER_PARITY(blkno); |
| |
| if ((flags & GBUF_PARITY_MASK) == blkFlags) |
| { |
| /* Page has right parity, use it */ |
| return buffer; |
| } |
| else |
| { |
| /* Page has wrong parity, record it in cache and try again */ |
| if (pageflags & SPGIST_NULLS) |
| blkFlags |= GBUF_NULLS; |
| cache->lastUsedPages.cachedPage[blkFlags].blkno = blkno; |
| cache->lastUsedPages.cachedPage[blkFlags].freeSpace = |
| PageGetExactFreeSpace(BufferGetPage(buffer)); |
| UnlockReleaseBuffer(buffer); |
| } |
| } |
| } |
| } |
| |
| /* |
| * Get a buffer of the type and parity specified by flags, having at least |
| * as much free space as indicated by needSpace. We use the lastUsedPages |
| * cache to assign the same buffer previously requested when possible. |
| * The returned buffer is already pinned and exclusive-locked. |
| * |
| * *isNew is set true if the page was initialized here, false if it was |
| * already valid. |
| */ |
| Buffer |
| SpGistGetBuffer(Relation index, int flags, int needSpace, bool *isNew) |
| { |
| SpGistCache *cache = spgGetCache(index); |
| SpGistLastUsedPage *lup; |
| |
| /* Bail out if even an empty page wouldn't meet the demand */ |
| if (needSpace > SPGIST_PAGE_CAPACITY) |
| elog(ERROR, "desired SPGiST tuple size is too big"); |
| |
| /* |
| * If possible, increase the space request to include relation's |
| * fillfactor. This ensures that when we add unrelated tuples to a page, |
| * we try to keep 100-fillfactor% available for adding tuples that are |
| * related to the ones already on it. But fillfactor mustn't cause an |
| * error for requests that would otherwise be legal. |
| */ |
| needSpace += SpGistGetTargetPageFreeSpace(index); |
| needSpace = Min(needSpace, SPGIST_PAGE_CAPACITY); |
| |
| /* Get the cache entry for this flags setting */ |
| lup = GET_LUP(cache, flags); |
| |
| /* If we have nothing cached, just turn it over to allocNewBuffer */ |
| if (lup->blkno == InvalidBlockNumber) |
| { |
| *isNew = true; |
| return allocNewBuffer(index, flags); |
| } |
| |
| /* fixed pages should never be in cache */ |
| Assert(!SpGistBlockIsFixed(lup->blkno)); |
| |
| /* If cached freeSpace isn't enough, don't bother looking at the page */ |
| if (lup->freeSpace >= needSpace) |
| { |
| Buffer buffer; |
| Page page; |
| |
| buffer = ReadBuffer(index, lup->blkno); |
| |
| if (!ConditionalLockBuffer(buffer)) |
| { |
| /* |
| * buffer is locked by another process, so return a new buffer |
| */ |
| ReleaseBuffer(buffer); |
| *isNew = true; |
| return allocNewBuffer(index, flags); |
| } |
| |
| page = BufferGetPage(buffer); |
| |
| if (PageIsNew(page) || SpGistPageIsDeleted(page) || PageIsEmpty(page)) |
| { |
| /* OK to initialize the page */ |
| uint16 pageflags = 0; |
| |
| if (GBUF_REQ_LEAF(flags)) |
| pageflags |= SPGIST_LEAF; |
| if (GBUF_REQ_NULLS(flags)) |
| pageflags |= SPGIST_NULLS; |
| SpGistInitBuffer(buffer, pageflags); |
| lup->freeSpace = PageGetExactFreeSpace(page) - needSpace; |
| *isNew = true; |
| return buffer; |
| } |
| |
| /* |
| * Check that page is of right type and has enough space. We must |
| * recheck this since our cache isn't necessarily up to date. |
| */ |
| if ((GBUF_REQ_LEAF(flags) ? SpGistPageIsLeaf(page) : !SpGistPageIsLeaf(page)) && |
| (GBUF_REQ_NULLS(flags) ? SpGistPageStoresNulls(page) : !SpGistPageStoresNulls(page))) |
| { |
| int freeSpace = PageGetExactFreeSpace(page); |
| |
| if (freeSpace >= needSpace) |
| { |
| /* Success, update freespace info and return the buffer */ |
| lup->freeSpace = freeSpace - needSpace; |
| *isNew = false; |
| return buffer; |
| } |
| } |
| |
| /* |
| * Fall back to allocating a new buffer. |
| */ |
| UnlockReleaseBuffer(buffer); |
| } |
| |
| /* No success with cache, so return a new buffer */ |
| *isNew = true; |
| return allocNewBuffer(index, flags); |
| } |
| |
| /* |
| * Update lastUsedPages cache when done modifying a page. |
| * |
| * We update the appropriate cache entry if it already contained this page |
| * (its freeSpace is likely obsolete), or if this page has more space than |
| * whatever we had cached. |
| */ |
| void |
| SpGistSetLastUsedPage(Relation index, Buffer buffer) |
| { |
| SpGistCache *cache = spgGetCache(index); |
| SpGistLastUsedPage *lup; |
| int freeSpace; |
| Page page = BufferGetPage(buffer); |
| BlockNumber blkno = BufferGetBlockNumber(buffer); |
| int flags; |
| |
| /* Never enter fixed pages (root pages) in cache, though */ |
| if (SpGistBlockIsFixed(blkno)) |
| return; |
| |
| if (SpGistPageIsLeaf(page)) |
| flags = GBUF_LEAF; |
| else |
| flags = GBUF_INNER_PARITY(blkno); |
| if (SpGistPageStoresNulls(page)) |
| flags |= GBUF_NULLS; |
| |
| lup = GET_LUP(cache, flags); |
| |
| freeSpace = PageGetExactFreeSpace(page); |
| if (lup->blkno == InvalidBlockNumber || lup->blkno == blkno || |
| lup->freeSpace < freeSpace) |
| { |
| lup->blkno = blkno; |
| lup->freeSpace = freeSpace; |
| } |
| } |
| |
| /* |
| * Initialize an SPGiST page to empty, with specified flags |
| */ |
| void |
| SpGistInitPage(Page page, uint16 f) |
| { |
| SpGistPageOpaque opaque; |
| |
| PageInit(page, BLCKSZ, sizeof(SpGistPageOpaqueData)); |
| opaque = SpGistPageGetOpaque(page); |
| opaque->flags = f; |
| opaque->spgist_page_id = SPGIST_PAGE_ID; |
| } |
| |
| /* |
| * Initialize a buffer's page to empty, with specified flags |
| */ |
| void |
| SpGistInitBuffer(Buffer b, uint16 f) |
| { |
| Assert(BufferGetPageSize(b) == BLCKSZ); |
| SpGistInitPage(BufferGetPage(b), f); |
| } |
| |
| /* |
| * Initialize metadata page |
| */ |
| void |
| SpGistInitMetapage(Page page) |
| { |
| SpGistMetaPageData *metadata; |
| int i; |
| |
| SpGistInitPage(page, SPGIST_META); |
| metadata = SpGistPageGetMeta(page); |
| memset(metadata, 0, sizeof(SpGistMetaPageData)); |
| metadata->magicNumber = SPGIST_MAGIC_NUMBER; |
| |
| /* initialize last-used-page cache to empty */ |
| for (i = 0; i < SPGIST_CACHED_PAGES; i++) |
| metadata->lastUsedPages.cachedPage[i].blkno = InvalidBlockNumber; |
| |
| /* |
| * Set pd_lower just past the end of the metadata. This is essential, |
| * because without doing so, metadata will be lost if xlog.c compresses |
| * the page. |
| */ |
| ((PageHeader) page)->pd_lower = |
| ((char *) metadata + sizeof(SpGistMetaPageData)) - (char *) page; |
| } |
| |
| /* |
| * reloptions processing for SPGiST |
| */ |
| bytea * |
| spgoptions(Datum reloptions, bool validate) |
| { |
| static const relopt_parse_elt tab[] = { |
| {"fillfactor", RELOPT_TYPE_INT, offsetof(SpGistOptions, fillfactor)}, |
| }; |
| |
| return (bytea *) build_reloptions(reloptions, validate, |
| RELOPT_KIND_SPGIST, |
| sizeof(SpGistOptions), |
| tab, lengthof(tab)); |
| } |
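| |
| /* |
| * (For illustration: fillfactor is the only reloption parsed here; it is |
| * what a hypothetical CREATE INDEX ... USING spgist ... WITH |
| * (fillfactor = 80) would set.) |
| */ |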
| |
| /* |
| * Get the space needed to store a non-null datum of the indicated type |
| * in an inner tuple (that is, as a prefix or node label). |
| * Note the result is already rounded up to a MAXALIGN boundary. |
| * Here we follow the convention that pass-by-val types are just stored |
| * in their Datum representation (compare memcpyInnerDatum). |
| */ |
| unsigned int |
| SpGistGetInnerTypeSize(SpGistTypeDesc *att, Datum datum) |
| { |
| unsigned int size; |
| |
| if (att->attbyval) |
| size = sizeof(Datum); |
| else if (att->attlen > 0) |
| size = att->attlen; |
| else |
| size = VARSIZE_ANY(datum); |
| |
| return MAXALIGN(size); |
| } |
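| |
| /* |
| * Worked example, assuming 8-byte Datums and 8-byte maximum alignment: an |
| * int4 label is stored as a full Datum, so the result is MAXALIGN(8) = 8; |
| * a text label with a 1-byte short varlena header and 3 payload bytes has |
| * VARSIZE_ANY() = 4, so the result is MAXALIGN(4) = 8. |
| */ |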
| |
| /* |
| * Copy the given non-null datum to *target, in the inner-tuple case |
| */ |
| static void |
| memcpyInnerDatum(void *target, SpGistTypeDesc *att, Datum datum) |
| { |
| unsigned int size; |
| |
| if (att->attbyval) |
| { |
| memcpy(target, &datum, sizeof(Datum)); |
| } |
| else |
| { |
| size = (att->attlen > 0) ? att->attlen : VARSIZE_ANY(datum); |
| memcpy(target, DatumGetPointer(datum), size); |
| } |
| } |
| |
| /* |
| * Compute space required for a leaf tuple holding the given data. |
| * |
| * This must match the size-calculation portion of spgFormLeafTuple. |
| */ |
| Size |
| SpGistGetLeafTupleSize(TupleDesc tupleDescriptor, |
| Datum *datums, bool *isnulls) |
| { |
| Size size; |
| Size data_size; |
| bool needs_null_mask = false; |
| int natts = tupleDescriptor->natts; |
| |
| /* |
| * Decide whether we need a nulls bitmask. |
| * |
| * If there is only a key attribute (natts == 1), never use a bitmask, for |
| * compatibility with the pre-v14 layout of leaf tuples. Otherwise, we |
| * need one if any attribute is null. |
| */ |
| if (natts > 1) |
| { |
| for (int i = 0; i < natts; i++) |
| { |
| if (isnulls[i]) |
| { |
| needs_null_mask = true; |
| break; |
| } |
| } |
| } |
| |
| /* |
| * Calculate size of the data part; same as for heap tuples. |
| */ |
| data_size = heap_compute_data_size(tupleDescriptor, datums, isnulls); |
| |
| /* |
| * Compute total size. |
| */ |
| size = SGLTHDRSZ(needs_null_mask); |
| size += data_size; |
| size = MAXALIGN(size); |
| |
| /* |
| * Ensure that we can replace the tuple with a dead tuple later. This test |
| * is unnecessary when there are any non-null attributes, but be safe. |
| */ |
| if (size < SGDTSIZE) |
| size = SGDTSIZE; |
| |
| return size; |
| } |
| |
| /* |
| * Construct a leaf tuple containing the given heap TID and datum values |
| */ |
| SpGistLeafTuple |
| spgFormLeafTuple(SpGistState *state, ItemPointer heapPtr, |
| Datum *datums, bool *isnulls) |
| { |
| SpGistLeafTuple tup; |
| TupleDesc tupleDescriptor = state->leafTupDesc; |
| Size size; |
| Size hoff; |
| Size data_size; |
| bool needs_null_mask = false; |
| int natts = tupleDescriptor->natts; |
| char *tp; /* ptr to tuple data */ |
| uint16 tupmask = 0; /* unused heap_fill_tuple output */ |
| |
| /* |
| * Decide whether we need a nulls bitmask. |
| * |
| * If there is only a key attribute (natts == 1), never use a bitmask, for |
| * compatibility with the pre-v14 layout of leaf tuples. Otherwise, we |
| * need one if any attribute is null. |
| */ |
| if (natts > 1) |
| { |
| for (int i = 0; i < natts; i++) |
| { |
| if (isnulls[i]) |
| { |
| needs_null_mask = true; |
| break; |
| } |
| } |
| } |
| |
| /* |
| * Calculate size of the data part; same as for heap tuples. |
| */ |
| data_size = heap_compute_data_size(tupleDescriptor, datums, isnulls); |
| |
| /* |
| * Compute total size. |
| */ |
| hoff = SGLTHDRSZ(needs_null_mask); |
| size = hoff + data_size; |
| size = MAXALIGN(size); |
| |
| /* |
| * Ensure that we can replace the tuple with a dead tuple later. This test |
| * is unnecessary when there are any non-null attributes, but be safe. |
| */ |
| if (size < SGDTSIZE) |
| size = SGDTSIZE; |
| |
| /* OK, form the tuple */ |
| tup = (SpGistLeafTuple) palloc0(size); |
| |
| tup->size = size; |
| SGLT_SET_NEXTOFFSET(tup, InvalidOffsetNumber); |
| tup->heapPtr = *heapPtr; |
| |
| tp = (char *) tup + hoff; |
| |
| if (needs_null_mask) |
| { |
| bits8 *bp; /* ptr to null bitmap in tuple */ |
| |
| /* Set nullmask presence bit in SpGistLeafTuple header */ |
| SGLT_SET_HASNULLMASK(tup, true); |
| /* Fill the data area and null mask */ |
| bp = (bits8 *) ((char *) tup + sizeof(SpGistLeafTupleData)); |
| heap_fill_tuple(tupleDescriptor, datums, isnulls, tp, data_size, |
| &tupmask, bp); |
| } |
| else if (natts > 1 || !isnulls[spgKeyColumn]) |
| { |
| /* Fill data area only */ |
| heap_fill_tuple(tupleDescriptor, datums, isnulls, tp, data_size, |
| &tupmask, (bits8 *) NULL); |
| } |
| /* otherwise we have no data, nor a bitmap, to fill */ |
| |
| return tup; |
| } |
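| |
| /* |
| * Schematically, the tuple built above is laid out as |
| * |
| *    [SpGistLeafTupleData header][nulls bitmap, optional][attribute data] |
| * |
| * where the bitmap appears only when needs_null_mask is set; SGLTHDRSZ |
| * accounts for its presence in hoff. |
| */ |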
| |
| /* |
| * Construct a node (to go into an inner tuple) containing the given label |
| * |
| * Note that the node's downlink is just set invalid here. Caller will fill |
| * it in later. |
| */ |
| SpGistNodeTuple |
| spgFormNodeTuple(SpGistState *state, Datum label, bool isnull) |
| { |
| SpGistNodeTuple tup; |
| unsigned int size; |
| unsigned short infomask = 0; |
| |
| /* compute space needed (note result is already maxaligned) */ |
| size = SGNTHDRSZ; |
| if (!isnull) |
| size += SpGistGetInnerTypeSize(&state->attLabelType, label); |
| |
| /* |
| * Here we make sure that the size will fit in the field reserved for it |
| * in t_info. |
| */ |
| if ((size & INDEX_SIZE_MASK) != size) |
| ereport(ERROR, |
| (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
| errmsg("index row requires %zu bytes, maximum size is %zu", |
| (Size) size, (Size) INDEX_SIZE_MASK))); |
| |
| tup = (SpGistNodeTuple) palloc0(size); |
| |
| if (isnull) |
| infomask |= INDEX_NULL_MASK; |
| /* we don't bother setting the INDEX_VAR_MASK bit */ |
| infomask |= size; |
| tup->t_info = infomask; |
| |
| /* The TID field will be filled in later */ |
| ItemPointerSetInvalid(&tup->t_tid); |
| |
| if (!isnull) |
| memcpyInnerDatum(SGNTDATAPTR(tup), &state->attLabelType, label); |
| |
| return tup; |
| } |
| |
| /* |
| * Construct an inner tuple containing the given prefix and node array |
| */ |
| SpGistInnerTuple |
| spgFormInnerTuple(SpGistState *state, bool hasPrefix, Datum prefix, |
| int nNodes, SpGistNodeTuple *nodes) |
| { |
| SpGistInnerTuple tup; |
| unsigned int size; |
| unsigned int prefixSize; |
| int i; |
| char *ptr; |
| |
| /* Compute size needed */ |
| if (hasPrefix) |
| prefixSize = SpGistGetInnerTypeSize(&state->attPrefixType, prefix); |
| else |
| prefixSize = 0; |
| |
| size = SGITHDRSZ + prefixSize; |
| |
| /* Note: we rely on node tuple sizes to be maxaligned already */ |
| for (i = 0; i < nNodes; i++) |
| size += IndexTupleSize(nodes[i]); |
| |
| /* |
| * Ensure that we can replace the tuple with a dead tuple later. This |
| * test is unnecessary given current tuple layouts, but let's be safe. |
| */ |
| if (size < SGDTSIZE) |
| size = SGDTSIZE; |
| |
| /* |
| * Inner tuple should be small enough to fit on a page |
| */ |
| if (size > SPGIST_PAGE_CAPACITY - sizeof(ItemIdData)) |
| ereport(ERROR, |
| (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
| errmsg("SP-GiST inner tuple size %zu exceeds maximum %zu", |
| (Size) size, |
| SPGIST_PAGE_CAPACITY - sizeof(ItemIdData)), |
| errhint("Values larger than a buffer page cannot be indexed."))); |
| |
| /* |
| * Check for overflow of header fields --- probably can't fail if the |
| * above succeeded, but let's be paranoid |
| */ |
| if (size > SGITMAXSIZE || |
| prefixSize > SGITMAXPREFIXSIZE || |
| nNodes > SGITMAXNNODES) |
| elog(ERROR, "SPGiST inner tuple header field is too small"); |
| |
| /* OK, form the tuple */ |
| tup = (SpGistInnerTuple) palloc0(size); |
| |
| tup->nNodes = nNodes; |
| tup->prefixSize = prefixSize; |
| tup->size = size; |
| |
| if (hasPrefix) |
| memcpyInnerDatum(SGITDATAPTR(tup), &state->attPrefixType, prefix); |
| |
| ptr = (char *) SGITNODEPTR(tup); |
| |
| for (i = 0; i < nNodes; i++) |
| { |
| SpGistNodeTuple node = nodes[i]; |
| |
| memcpy(ptr, node, IndexTupleSize(node)); |
| ptr += IndexTupleSize(node); |
| } |
| |
| return tup; |
| } |
| |
| /* |
| * Construct a "dead" tuple to replace a tuple being deleted. |
| * |
| * The state can be SPGIST_REDIRECT, SPGIST_DEAD, or SPGIST_PLACEHOLDER. |
| * For a REDIRECT tuple, a pointer (blkno+offset) must be supplied, and |
| * the xid field is filled in automatically. |
| * |
| * This is called in critical sections, so we don't use palloc; the tuple |
| * is built in preallocated storage. It should be copied before another |
| * call with different parameters can occur. |
| */ |
| SpGistDeadTuple |
| spgFormDeadTuple(SpGistState *state, int tupstate, |
| BlockNumber blkno, OffsetNumber offnum) |
| { |
| SpGistDeadTuple tuple = (SpGistDeadTuple) state->deadTupleStorage; |
| |
| tuple->tupstate = tupstate; |
| tuple->size = SGDTSIZE; |
| SGLT_SET_NEXTOFFSET(tuple, InvalidOffsetNumber); |
| |
| if (tupstate == SPGIST_REDIRECT) |
| { |
| ItemPointerSet(&tuple->pointer, blkno, offnum); |
| tuple->xid = state->myXid; |
| } |
| else |
| { |
| ItemPointerSetInvalid(&tuple->pointer); |
| tuple->xid = InvalidTransactionId; |
| } |
| |
| return tuple; |
| } |
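| |
| /* |
| * Sketch of a typical call (patterned on the vacuum code; not verbatim): |
| * |
| *    dt = spgFormDeadTuple(&state, SPGIST_PLACEHOLDER, |
| *                          InvalidBlockNumber, InvalidOffsetNumber); |
| * |
| * after which the caller installs *dt in place of the victim tuple while |
| * still inside its critical section. |
| */ |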
| |
| /* |
| * Convert an SPGiST leaf tuple into Datum/isnull arrays. |
| * |
| * The caller must allocate sufficient storage for the output arrays. |
| * (INDEX_MAX_KEYS entries should be enough.) |
| */ |
| void |
| spgDeformLeafTuple(SpGistLeafTuple tup, TupleDesc tupleDescriptor, |
| Datum *datums, bool *isnulls, bool keyColumnIsNull) |
| { |
| bool hasNullsMask = SGLT_GET_HASNULLMASK(tup); |
| char *tp; /* ptr to tuple data */ |
| bits8 *bp; /* ptr to null bitmap in tuple */ |
| |
| if (keyColumnIsNull && tupleDescriptor->natts == 1) |
| { |
| /* |
| * Trivial case: there is only the key attribute and we're in a nulls |
| * tree. The hasNullsMask bit in the tuple header should not be set |
| * (and thus we can't use index_deform_tuple_internal), but |
| * nonetheless the result is NULL. |
| * |
| * Note: currently this is dead code, because noplace calls this when |
| * there is only the key attribute. But we should cover the case. |
| */ |
| Assert(!hasNullsMask); |
| |
| datums[spgKeyColumn] = (Datum) 0; |
| isnulls[spgKeyColumn] = true; |
| return; |
| } |
| |
| tp = (char *) tup + SGLTHDRSZ(hasNullsMask); |
| bp = (bits8 *) ((char *) tup + sizeof(SpGistLeafTupleData)); |
| |
| index_deform_tuple_internal(tupleDescriptor, |
| datums, isnulls, |
| tp, bp, hasNullsMask); |
| |
| /* |
| * Key column isnull value from the tuple should be consistent with |
| * keyColumnIsNull flag from the caller. |
| */ |
| Assert(keyColumnIsNull == isnulls[spgKeyColumn]); |
| } |
| |
| /* |
| * Extract the label datums of the nodes within innerTuple |
| * |
| * Returns NULL if the label datums are all NULLs. |
| */ |
| Datum * |
| spgExtractNodeLabels(SpGistState *state, SpGistInnerTuple innerTuple) |
| { |
| Datum *nodeLabels; |
| int i; |
| SpGistNodeTuple node; |
| |
| /* Either all the labels must be NULL, or none. */ |
| node = SGITNODEPTR(innerTuple); |
| if (IndexTupleHasNulls(node)) |
| { |
| SGITITERATE(innerTuple, i, node) |
| { |
| if (!IndexTupleHasNulls(node)) |
| elog(ERROR, "some but not all node labels are null in SPGiST inner tuple"); |
| } |
| /* They're all null, so just return NULL */ |
| return NULL; |
| } |
| else |
| { |
| nodeLabels = (Datum *) palloc(sizeof(Datum) * innerTuple->nNodes); |
| SGITITERATE(innerTuple, i, node) |
| { |
| if (IndexTupleHasNulls(node)) |
| elog(ERROR, "some but not all node labels are null in SPGiST inner tuple"); |
| nodeLabels[i] = SGNTDATUM(node, state); |
| } |
| return nodeLabels; |
| } |
| } |
| |
| /* |
| * Add a new item to the page, replacing a PLACEHOLDER item if possible. |
| * Return the location it's inserted at, or InvalidOffsetNumber on failure. |
| * |
| * If startOffset isn't NULL, we start searching for placeholders at |
| * *startOffset, and update that to the next place to search. This is just |
| * an optimization for repeated insertions. |
| * |
| * If errorOK is false, we throw error when there's not enough room, |
| * rather than returning InvalidOffsetNumber. |
| */ |
| OffsetNumber |
| SpGistPageAddNewItem(SpGistState *state, Page page, Item item, Size size, |
| OffsetNumber *startOffset, bool errorOK) |
| { |
| SpGistPageOpaque opaque = SpGistPageGetOpaque(page); |
| OffsetNumber i, |
| maxoff, |
| offnum; |
| |
| if (opaque->nPlaceholder > 0 && |
| PageGetExactFreeSpace(page) + SGDTSIZE >= MAXALIGN(size)) |
| { |
| /* Try to replace a placeholder */ |
| maxoff = PageGetMaxOffsetNumber(page); |
| offnum = InvalidOffsetNumber; |
| |
| for (;;) |
| { |
| if (startOffset && *startOffset != InvalidOffsetNumber) |
| i = *startOffset; |
| else |
| i = FirstOffsetNumber; |
| for (; i <= maxoff; i++) |
| { |
| SpGistDeadTuple it = (SpGistDeadTuple) PageGetItem(page, |
| PageGetItemId(page, i)); |
| |
| if (it->tupstate == SPGIST_PLACEHOLDER) |
| { |
| offnum = i; |
| break; |
| } |
| } |
| |
| /* Done if we found a placeholder */ |
| if (offnum != InvalidOffsetNumber) |
| break; |
| |
| if (startOffset && *startOffset != InvalidOffsetNumber) |
| { |
| /* Hint was no good, re-search from beginning */ |
| *startOffset = InvalidOffsetNumber; |
| continue; |
| } |
| |
| /* Hmm, no placeholder found? */ |
| opaque->nPlaceholder = 0; |
| break; |
| } |
| |
| if (offnum != InvalidOffsetNumber) |
| { |
| /* Replace the placeholder tuple */ |
| PageIndexTupleDelete(page, offnum); |
| |
| offnum = PageAddItem(page, item, size, offnum, false, false); |
| |
| /* |
| * We should not have failed given the size check at the top of |
| * the function, but test anyway. If we did fail, we must PANIC |
| * because we've already deleted the placeholder tuple, and |
| * there's no other way to keep the damage from getting to disk. |
| */ |
| if (offnum != InvalidOffsetNumber) |
| { |
| Assert(opaque->nPlaceholder > 0); |
| opaque->nPlaceholder--; |
| if (startOffset) |
| *startOffset = offnum + 1; |
| } |
| else |
| elog(PANIC, "failed to add item of size %zu to SPGiST index page", |
| size); |
| |
| return offnum; |
| } |
| } |
| |
| /* No luck in replacing a placeholder, so just add it to the page */ |
| offnum = PageAddItem(page, item, size, |
| InvalidOffsetNumber, false, false); |
| |
| if (offnum == InvalidOffsetNumber && !errorOK) |
| elog(ERROR, "failed to add item of size %zu to SPGiST index page", |
| size); |
| |
| return offnum; |
| } |
| |
| /* |
| * spgproperty() -- Check boolean properties of indexes. |
| * |
| * This is optional for most AMs, but is required for SP-GiST because the core |
| * property code doesn't support AMPROP_DISTANCE_ORDERABLE. |
| */ |
| bool |
| spgproperty(Oid index_oid, int attno, |
| IndexAMProperty prop, const char *propname, |
| bool *res, bool *isnull) |
| { |
| Oid opclass, |
| opfamily, |
| opcintype; |
| CatCList *catlist; |
| int i; |
| |
| /* Only answer column-level inquiries */ |
| if (attno == 0) |
| return false; |
| |
| switch (prop) |
| { |
| case AMPROP_DISTANCE_ORDERABLE: |
| break; |
| default: |
| return false; |
| } |
| |
| /* |
| * Currently, SP-GiST distance-ordered scans require that there be a |
| * distance operator in the opclass with the default types. So we assume |
| * that if such an operator exists, then there's a reason for it. |
| */ |
| |
| /* First we need to know the column's opclass. */ |
| opclass = get_index_column_opclass(index_oid, attno); |
| if (!OidIsValid(opclass)) |
| { |
| *isnull = true; |
| return true; |
| } |
| |
| /* Now look up the opclass family and input datatype. */ |
| if (!get_opclass_opfamily_and_input_type(opclass, &opfamily, &opcintype)) |
| { |
| *isnull = true; |
| return true; |
| } |
| |
| /* And now we can check whether the operator is provided. */ |
| catlist = SearchSysCacheList1(AMOPSTRATEGY, |
| ObjectIdGetDatum(opfamily)); |
| |
| *res = false; |
| |
| for (i = 0; i < catlist->n_members; i++) |
| { |
| HeapTuple amoptup = &catlist->members[i]->tuple; |
| Form_pg_amop amopform = (Form_pg_amop) GETSTRUCT(amoptup); |
| |
| if (amopform->amoppurpose == AMOP_ORDER && |
| (amopform->amoplefttype == opcintype || |
| amopform->amoprighttype == opcintype) && |
| opfamily_can_sort_type(amopform->amopsortfamily, |
| get_op_rettype(amopform->amopopr))) |
| { |
| *res = true; |
| break; |
| } |
| } |
| |
| ReleaseSysCacheList(catlist); |
| |
| *isnull = false; |
| |
| return true; |
| } |
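| |
| /* |
| * For example, this path is what ultimately answers |
| * |
| *    pg_index_column_has_property(index, 1, 'distance_orderable') |
| * |
| * for an SP-GiST index, since the generic code in amutils.c cannot. |
| */ |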