src/backend/access/common/syncscan.c - cloudberry - Git at Google

 /*-------------------------------------------------------------------------
  *
  * syncscan.c
  *	  scan synchronization support
  *
  * When multiple backends run a sequential scan on the same table, we try
  * to keep them synchronized to reduce the overall I/O needed.  The goal is
  * to read each page into shared buffer cache only once, and let all backends
  * that take part in the shared scan process the page before it falls out of
  * the cache.
  *
  * Since the "leader" in a pack of backends doing a seqscan will have to wait
  * for I/O, while the "followers" don't, there is a strong self-synchronizing
  * effect once we can get the backends examining approximately the same part
  * of the table at the same time.  Hence all that is really needed is to get
  * a new backend beginning a seqscan to begin it close to where other backends
  * are reading.  We can scan the table circularly, from block X up to the
  * end and then from block 0 to X-1, to ensure we visit all rows while still
  * participating in the common scan.
  *
  * To accomplish that, we keep track of the scan position of each table, and
  * start new scans close to where the previous scan(s) are.  We don't try to
  * do any extra synchronization to keep the scans together afterwards; some
  * scans might progress much more slowly than others, for example if the
  * results need to be transferred to the client over a slow network, and we
  * don't want such queries to slow down others.
  *
  * There can realistically only be a few large sequential scans on different
  * tables in progress at any time.  Therefore we just keep the scan positions
  * in a small LRU list which we scan every time we need to look up or update a
  * scan position.  The whole mechanism is only applied for tables exceeding
  * a threshold size (but that is not the concern of this module).
  *
  * INTERFACE ROUTINES
  *		ss_get_location		- return current scan location of a relation
  *		ss_report_location	- update current scan location
  *
  *
  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
  *	  src/backend/access/common/syncscan.c
  *
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"

 #include "access/syncscan.h"
 #include "miscadmin.h"
 #include "storage/lwlock.h"
 #include "storage/shmem.h"
 #include "utils/rel.h"


 /*
  * GUC variables
  *
  * trace_syncscan exists only in builds that define TRACE_SYNCSCAN.  When it
  * is true, the routines in this file emit LOG-level "SYNC_SCAN: ..."
  * messages describing scan starts, scan progress, and missed updates.
  */
 #ifdef TRACE_SYNCSCAN
 bool		trace_syncscan = false;
 #endif


 /*
  * Size of the LRU list.
  *
  * Note: the code assumes that SYNC_SCAN_NELEM > 1.
  *
  * XXX: What's a good value? It should be large enough to hold the
  * maximum number of large tables scanned simultaneously.  But a larger value
  * means more traversing of the LRU list when starting a new scan.
  */
 #define SYNC_SCAN_NELEM 20

 /*
  * Interval between reports of the location of the current scan, in pages.
  *
  * Note: This should be smaller than the ring size (see buffer/freelist.c)
  * we use for bulk reads.  Otherwise a scan joining other scans might start
  * from a page that's no longer in the buffer cache.  This is a bit fuzzy;
  * there's no guarantee that the new scan will read the page before it leaves
  * the buffer cache anyway, and on the other hand the page is most likely
  * still in the OS cache.
  */
 #define SYNC_SCAN_REPORT_INTERVAL (128 * 1024 / BLCKSZ)


 /*
  * The scan locations structure is essentially a doubly-linked LRU with head
  * and tail pointer, but designed to hold a fixed maximum number of elements in
  * fixed-size shared memory.
  */
 /* One remembered scan position: a relation and the block last reported. */
 typedef struct ss_scan_location_t
 {
 	RelFileNode relfilenode;	/* identity of a relation */
 	BlockNumber location;		/* last-reported location in the relation */
 } ss_scan_location_t;

 /*
  * One node of the LRU list: a scan location plus links to its neighbours.
  * prev is NULL for the node at the head of the list and next is NULL for
  * the node at the tail.
  */
 typedef struct ss_lru_item_t
 {
 	struct ss_lru_item_t *prev;	/* neighbour towards the head */
 	struct ss_lru_item_t *next;	/* neighbour towards the tail */
 	ss_scan_location_t location;	/* relation/block pair stored here */
 } ss_lru_item_t;

 /*
  * The complete shared structure: head and tail pointers of the list,
  * followed by the storage for the list nodes themselves.
  */
 typedef struct ss_scan_locations_t
 {
 	ss_lru_item_t *head;		/* most recently used entry */
 	ss_lru_item_t *tail;		/* least recently used entry */
 	ss_lru_item_t items[FLEXIBLE_ARRAY_MEMBER]; /* SYNC_SCAN_NELEM items */
 } ss_scan_locations_t;

 /* Number of bytes occupied by an ss_scan_locations_t holding N list items */
 #define SizeOfScanLocations(N) \
 	(offsetof(ss_scan_locations_t, items) + (N) * sizeof(ss_lru_item_t))

 /* Pointer to struct in shared memory */
 static ss_scan_locations_t *scan_locations;

 /* prototypes for internal functions */
 static BlockNumber ss_search(RelFileNode relfilenode,
 							 BlockNumber location, bool set);


 /*
  * SyncScanShmemSize --- how many bytes of shared memory this module needs
  *
  * The answer is the size of an ss_scan_locations_t with room for
  * SYNC_SCAN_NELEM list items.
  */
 Size
 SyncScanShmemSize(void)
 {
 	Size		needed;

 	needed = SizeOfScanLocations(SYNC_SCAN_NELEM);

 	return needed;
 }

 /*
  * SyncScanShmemInit --- set up (or attach to) the shared scan-location list
  *
  * The area is obtained with ShmemInitStruct().  When IsUnderPostmaster is
  * false the list is built from scratch; otherwise the struct is expected to
  * exist already and its contents are left untouched.
  */
 void
 SyncScanShmemInit(void)
 {
 	const int	last = SYNC_SCAN_NELEM - 1;
 	bool		found;
 	int			slot;

 	scan_locations = (ss_scan_locations_t *)
 		ShmemInitStruct("Sync Scan Locations List",
 						SizeOfScanLocations(SYNC_SCAN_NELEM),
 						&found);

 	if (IsUnderPostmaster)
 	{
 		/* Nothing to initialize; the struct must already be there. */
 		Assert(found);
 		return;
 	}

 	/* We are creating the area, so it must not have existed before. */
 	Assert(!found);

 	/*
 	 * Chain the array elements together in index order.  Every slot starts
 	 * out holding an invalid relfilenode and block number; ss_search()
 	 * takes over the entry at the tail whenever it needs room for a new
 	 * relation, so these placeholders get replaced as scans are started.
 	 */
 	for (slot = 0; slot <= last; slot++)
 	{
 		ss_lru_item_t *cur = &scan_locations->items[slot];

 		cur->location.relfilenode.spcNode = InvalidOid;
 		cur->location.relfilenode.dbNode = InvalidOid;
 		cur->location.relfilenode.relNode = InvalidOid;
 		cur->location.location = InvalidBlockNumber;

 		if (slot == 0)
 			cur->prev = NULL;
 		else
 			cur->prev = &scan_locations->items[slot - 1];

 		if (slot == last)
 			cur->next = NULL;
 		else
 			cur->next = &scan_locations->items[slot + 1];
 	}

 	/* First array element is the list head, last one is the tail. */
 	scan_locations->head = &scan_locations->items[0];
 	scan_locations->tail = &scan_locations->items[last];
 }

 /*
  * ss_search --- look up (and possibly update) the entry for a relfilenode
  *
  * The list is walked from the head.  If an entry for relfilenode exists
  * and "set" is true, its location is overwritten with the given one; if
  * "set" is false the stored location is left alone.  If no entry exists,
  * the last entry of the list is reused for this relfilenode and given the
  * supplied location, regardless of "set".
  *
  * Whichever entry ends up being used is moved to the head of the list, and
  * its (possibly just updated) location is returned.
  *
  * The caller must already hold a suitable lock on the shared structure.
  */
 static BlockNumber
 ss_search(RelFileNode relfilenode, BlockNumber location, bool set)
 {
 	ss_lru_item_t *entry = scan_locations->head;
 	bool		hit;

 	/* Advance until we find the relation or run out of list. */
 	for (;;)
 	{
 		hit = RelFileNodeEquals(entry->location.relfilenode, relfilenode);
 		if (hit || entry->next == NULL)
 			break;
 		entry = entry->next;
 	}

 	if (hit)
 	{
 		if (set)
 			entry->location.location = location;
 	}
 	else
 	{
 		/* No match: "entry" is the last element, so recycle it. */
 		entry->location.relfilenode = relfilenode;
 		entry->location.location = location;
 	}

 	/* Promote the entry to the front unless it is there already. */
 	if (entry != scan_locations->head)
 	{
 		ss_lru_item_t *before = entry->prev;
 		ss_lru_item_t *after = entry->next;
 		ss_lru_item_t *old_head = scan_locations->head;

 		/* detach from the current position */
 		if (entry == scan_locations->tail)
 			scan_locations->tail = before;
 		before->next = after;
 		if (after != NULL)
 			after->prev = before;

 		/* splice in ahead of the previous head */
 		entry->prev = NULL;
 		entry->next = old_head;
 		old_head->prev = entry;
 		scan_locations->head = entry;
 	}

 	return entry->location.location;
 }

 /*
  * ss_get_location --- pick the block at which a new scan should begin
  *
  * Looks up the relation's entry in the shared list (an entry at block 0 is
  * created if none exists) and returns the recorded block number, or 0 when
  * that number is not below relnblocks.
  *
  * relnblocks is supplied by the caller, who is expected to have just
  * obtained it via RelationGetNumberOfBlocks(), so it is not recomputed
  * here.  Provided relnblocks > 0, the result is always less than relnblocks.
  */
 BlockNumber
 ss_get_location(Relation rel, BlockNumber relnblocks)
 {
 	BlockNumber remembered;
 	BlockNumber startloc;

 	LWLockAcquire(SyncScanLock, LW_EXCLUSIVE);
 	remembered = ss_search(rel->rd_node, 0, false);
 	LWLockRelease(SyncScanLock);

 	/*
 	 * A remembered position at or past the current end of the relation is
 	 * unusable (the table may have been truncated since it was saved, for
 	 * instance by VACUUM), so fall back to block 0 in that case.
 	 */
 	startloc = (remembered < relnblocks) ? remembered : 0;

 #ifdef TRACE_SYNCSCAN
 	if (trace_syncscan)
 		elog(LOG,
 			 "SYNC_SCAN: start \"%s\" (size %u) at %u",
 			 RelationGetRelationName(rel), relnblocks, startloc);
 #endif

 	return startloc;
 }

 /*
  * ss_report_location --- record how far the current scan has progressed
  *
  * Stores (relfilenode, location) in the shared Sync Scan state, replacing
  * whatever was recorded for that relfilenode before.  Only locations that
  * are a multiple of SYNC_SCAN_REPORT_INTERVAL are recorded, and only if
  * the lock can be taken without waiting.
  */
 void
 ss_report_location(Relation rel, BlockNumber location)
 {
 #ifdef TRACE_SYNCSCAN
 	if (trace_syncscan && (location % 1024) == 0)
 		elog(LOG,
 			 "SYNC_SCAN: scanning \"%s\" at %u",
 			 RelationGetRelationName(rel), location);
 #endif

 	/*
 	 * Reporting every page would cause needless lock traffic, so all but
 	 * every N'th page is skipped here.
 	 */
 	if ((location % SYNC_SCAN_REPORT_INTERVAL) != 0)
 		return;

 	/*
 	 * Likewise, never wait for the lock.  A skipped update only means that
 	 * a scan joining the pack starts slightly behind the leading scan; the
 	 * pages it needs are hopefully still in the OS cache, so it should
 	 * catch up quickly.
 	 */
 	if (LWLockConditionalAcquire(SyncScanLock, LW_EXCLUSIVE))
 	{
 		(void) ss_search(rel->rd_node, location, true);
 		LWLockRelease(SyncScanLock);
 	}
 #ifdef TRACE_SYNCSCAN
 	else if (trace_syncscan)
 		elog(LOG,
 			 "SYNC_SCAN: missed update for \"%s\" at %u",
 			 RelationGetRelationName(rel), location);
 #endif
 }
	/*-------------------------------------------------------------------------
	*
	* syncscan.c
	* scan synchronization support
	*
	* When multiple backends run a sequential scan on the same table, we try
	* to keep them synchronized to reduce the overall I/O needed. The goal is
	* to read each page into shared buffer cache only once, and let all backends
	* that take part in the shared scan process the page before it falls out of
	* the cache.
	*
	* Since the "leader" in a pack of backends doing a seqscan will have to wait
	* for I/O, while the "followers" don't, there is a strong self-synchronizing
	* effect once we can get the backends examining approximately the same part
	* of the table at the same time. Hence all that is really needed is to get
	* a new backend beginning a seqscan to begin it close to where other backends
	* are reading. We can scan the table circularly, from block X up to the
	* end and then from block 0 to X-1, to ensure we visit all rows while still
	* participating in the common scan.
	*
	* To accomplish that, we keep track of the scan position of each table, and
	* start new scans close to where the previous scan(s) are. We don't try to
	* do any extra synchronization to keep the scans together afterwards; some
	* scans might progress much more slowly than others, for example if the
	* results need to be transferred to the client over a slow network, and we
	* don't want such queries to slow down others.
	*
	* There can realistically only be a few large sequential scans on different
	* tables in progress at any time. Therefore we just keep the scan positions
	* in a small LRU list which we scan every time we need to look up or update a
	* scan position. The whole mechanism is only applied for tables exceeding
	* a threshold size (but that is not the concern of this module).
	*
	* INTERFACE ROUTINES
	* ss_get_location - return current scan location of a relation
	* ss_report_location - update current scan location
	*
	*
	* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
	* Portions Copyright (c) 1994, Regents of the University of California
	*
	* IDENTIFICATION
	* src/backend/access/common/syncscan.c
	*
	*-------------------------------------------------------------------------
	*/
	#include "postgres.h"

	#include "access/syncscan.h"
	#include "miscadmin.h"
	#include "storage/lwlock.h"
	#include "storage/shmem.h"
	#include "utils/rel.h"


	/*
	 * GUC variables
	 *
	 * trace_syncscan exists only in builds that define TRACE_SYNCSCAN.  When
	 * it is true, the routines in this file emit LOG-level "SYNC_SCAN: ..."
	 * messages describing scan starts, scan progress, and missed updates.
	 */
	#ifdef TRACE_SYNCSCAN
	bool trace_syncscan = false;
	#endif


	/*
	* Size of the LRU list.
	*
	* Note: the code assumes that SYNC_SCAN_NELEM > 1.
	*
	* XXX: What's a good value? It should be large enough to hold the
	* maximum number of large tables scanned simultaneously. But a larger value
	* means more traversing of the LRU list when starting a new scan.
	*/
	#define SYNC_SCAN_NELEM 20

	/*
	* Interval between reports of the location of the current scan, in pages.
	*
	* Note: This should be smaller than the ring size (see buffer/freelist.c)
	* we use for bulk reads. Otherwise a scan joining other scans might start
	* from a page that's no longer in the buffer cache. This is a bit fuzzy;
	* there's no guarantee that the new scan will read the page before it leaves
	* the buffer cache anyway, and on the other hand the page is most likely
	* still in the OS cache.
	*/
	#define SYNC_SCAN_REPORT_INTERVAL (128 * 1024 / BLCKSZ)


	/*
	* The scan locations structure is essentially a doubly-linked LRU with head
	* and tail pointer, but designed to hold a fixed maximum number of elements in
	* fixed-size shared memory.
	*/
	/* One remembered scan position: a relation and the block last reported. */
	typedef struct ss_scan_location_t
	{
	RelFileNode relfilenode; /* identity of a relation */
	BlockNumber location; /* last-reported location in the relation */
	} ss_scan_location_t;

	/*
	 * One node of the LRU list: a scan location plus links to its neighbours.
	 * prev is NULL for the node at the head of the list and next is NULL for
	 * the node at the tail.
	 */
	typedef struct ss_lru_item_t
	{
	struct ss_lru_item_t *prev; /* neighbour towards the head */
	struct ss_lru_item_t *next; /* neighbour towards the tail */
	ss_scan_location_t location; /* relation/block pair stored here */
	} ss_lru_item_t;

	/*
	 * The complete shared structure: head and tail pointers of the list,
	 * followed by the storage for the list nodes themselves.
	 */
	typedef struct ss_scan_locations_t
	{
	ss_lru_item_t *head; /* most recently used entry */
	ss_lru_item_t *tail; /* least recently used entry */
	ss_lru_item_t items[FLEXIBLE_ARRAY_MEMBER]; /* SYNC_SCAN_NELEM items */
	} ss_scan_locations_t;

	/* Number of bytes occupied by an ss_scan_locations_t holding N list items */
	#define SizeOfScanLocations(N) \
	(offsetof(ss_scan_locations_t, items) + (N) * sizeof(ss_lru_item_t))

	/* Pointer to struct in shared memory */
	static ss_scan_locations_t *scan_locations;

	/* prototypes for internal functions */
	static BlockNumber ss_search(RelFileNode relfilenode,
	BlockNumber location, bool set);


	/*
	 * SyncScanShmemSize --- how many bytes of shared memory this module needs
	 *
	 * The answer is the size of an ss_scan_locations_t with room for
	 * SYNC_SCAN_NELEM list items.
	 */
	Size
	SyncScanShmemSize(void)
	{
		Size		needed;

		needed = SizeOfScanLocations(SYNC_SCAN_NELEM);

		return needed;
	}

	/*
	 * SyncScanShmemInit --- set up (or attach to) the shared scan-location list
	 *
	 * The area is obtained with ShmemInitStruct().  When IsUnderPostmaster is
	 * false the list is built from scratch; otherwise the struct is expected to
	 * exist already and its contents are left untouched.
	 */
	void
	SyncScanShmemInit(void)
	{
		const int	last = SYNC_SCAN_NELEM - 1;
		bool		found;
		int			slot;

		scan_locations = (ss_scan_locations_t *)
			ShmemInitStruct("Sync Scan Locations List",
							SizeOfScanLocations(SYNC_SCAN_NELEM),
							&found);

		if (IsUnderPostmaster)
		{
			/* Nothing to initialize; the struct must already be there. */
			Assert(found);
			return;
		}

		/* We are creating the area, so it must not have existed before. */
		Assert(!found);

		/*
		 * Chain the array elements together in index order.  Every slot
		 * starts out holding an invalid relfilenode and block number;
		 * ss_search() takes over the entry at the tail whenever it needs room
		 * for a new relation, so these placeholders get replaced as scans are
		 * started.
		 */
		for (slot = 0; slot <= last; slot++)
		{
			ss_lru_item_t *cur = &scan_locations->items[slot];

			cur->location.relfilenode.spcNode = InvalidOid;
			cur->location.relfilenode.dbNode = InvalidOid;
			cur->location.relfilenode.relNode = InvalidOid;
			cur->location.location = InvalidBlockNumber;

			if (slot == 0)
				cur->prev = NULL;
			else
				cur->prev = &scan_locations->items[slot - 1];

			if (slot == last)
				cur->next = NULL;
			else
				cur->next = &scan_locations->items[slot + 1];
		}

		/* First array element is the list head, last one is the tail. */
		scan_locations->head = &scan_locations->items[0];
		scan_locations->tail = &scan_locations->items[last];
	}

	/*
	 * ss_search --- search the scan_locations structure for an entry with the
	 *		given relfilenode.
	 *
	 * If "set" is true, the location is updated to the given location.  If no
	 * entry for the given relfilenode is found, the last entry of the list is
	 * taken over for it and given the supplied location, even if "set" is
	 * false.  The entry that was found or taken over is then moved to the
	 * head of the list.
	 *
	 * In any case, the location after possible update is returned.
	 *
	 * Caller is responsible for having acquired suitable lock on the shared
	 * data structure.
	 */
	static BlockNumber
	ss_search(RelFileNode relfilenode, BlockNumber location, bool set)
	{
		ss_lru_item_t *item;

		item = scan_locations->head;
		for (;;)
		{
			bool		match;

			match = RelFileNodeEquals(item->location.relfilenode, relfilenode);

			/* Stop on a matching entry, or on the last entry of the list. */
			if (match || item->next == NULL)
			{
				/*
				 * If we reached the end of list and no match was found, take
				 * over the last entry
				 */
				if (!match)
				{
					item->location.relfilenode = relfilenode;
					item->location.location = location;
				}
				else if (set)
					item->location.location = location;

				/* Move the entry to the front of the LRU list */
				if (item != scan_locations->head)
				{
					/* unlink; item is not the head, so item->prev is set */
					if (item == scan_locations->tail)
						scan_locations->tail = item->prev;
					item->prev->next = item->next;
					if (item->next)
						item->next->prev = item->prev;

					/* link */
					item->prev = NULL;
					item->next = scan_locations->head;
					scan_locations->head->prev = item;
					scan_locations->head = item;
				}

				return item->location.location;
			}

			item = item->next;
		}

		/* not reached */
	}

	/*
	 * ss_get_location --- pick the block at which a new scan should begin
	 *
	 * Looks up the relation's entry in the shared list (an entry at block 0 is
	 * created if none exists) and returns the recorded block number, or 0 when
	 * that number is not below relnblocks.
	 *
	 * relnblocks is supplied by the caller, who is expected to have just
	 * obtained it via RelationGetNumberOfBlocks(), so it is not recomputed
	 * here.  Provided relnblocks > 0, the result is always less than
	 * relnblocks.
	 */
	BlockNumber
	ss_get_location(Relation rel, BlockNumber relnblocks)
	{
		BlockNumber remembered;
		BlockNumber startloc;

		LWLockAcquire(SyncScanLock, LW_EXCLUSIVE);
		remembered = ss_search(rel->rd_node, 0, false);
		LWLockRelease(SyncScanLock);

		/*
		 * A remembered position at or past the current end of the relation is
		 * unusable (the table may have been truncated since it was saved, for
		 * instance by VACUUM), so fall back to block 0 in that case.
		 */
		startloc = (remembered < relnblocks) ? remembered : 0;

	#ifdef TRACE_SYNCSCAN
		if (trace_syncscan)
			elog(LOG,
				 "SYNC_SCAN: start \"%s\" (size %u) at %u",
				 RelationGetRelationName(rel), relnblocks, startloc);
	#endif

		return startloc;
	}

	/*
	 * ss_report_location --- record how far the current scan has progressed
	 *
	 * Stores (relfilenode, location) in the shared Sync Scan state, replacing
	 * whatever was recorded for that relfilenode before.  Only locations that
	 * are a multiple of SYNC_SCAN_REPORT_INTERVAL are recorded, and only if
	 * the lock can be taken without waiting.
	 */
	void
	ss_report_location(Relation rel, BlockNumber location)
	{
	#ifdef TRACE_SYNCSCAN
		if (trace_syncscan && (location % 1024) == 0)
			elog(LOG,
				 "SYNC_SCAN: scanning \"%s\" at %u",
				 RelationGetRelationName(rel), location);
	#endif

		/*
		 * Reporting every page would cause needless lock traffic, so all but
		 * every N'th page is skipped here.
		 */
		if ((location % SYNC_SCAN_REPORT_INTERVAL) != 0)
			return;

		/*
		 * Likewise, never wait for the lock.  A skipped update only means
		 * that a scan joining the pack starts slightly behind the leading
		 * scan; the pages it needs are hopefully still in the OS cache, so it
		 * should catch up quickly.
		 */
		if (LWLockConditionalAcquire(SyncScanLock, LW_EXCLUSIVE))
		{
			(void) ss_search(rel->rd_node, location, true);
			LWLockRelease(SyncScanLock);
		}
	#ifdef TRACE_SYNCSCAN
		else if (trace_syncscan)
			elog(LOG,
				 "SYNC_SCAN: missed update for \"%s\" at %u",
				 RelationGetRelationName(rel), location);
	#endif
	}