/* src/backend/utils/mmgr/runaway_cleaner.c - cloudberry - Git at Google */

 /*-------------------------------------------------------------------------
  *
  * runaway_cleaner.c
  *	 Implementation of the runaway cleaner that checks if a session is marked
  *	 as runaway (i.e., consuming too much vmem) by the red-zone handler
  *	 (redzone_handler.c). The runaway cleaner cleans up such session by triggering
  *	 an elog(ERROR, ...) which rolls back transaction and releases memory. Once
  *	 cleanup is finished, the runaway cleaner also informs the red zone handler
  *	 so that a new runaway session can be chosen if necessary.
  *
  * Copyright (c) 2014-Present VMware, Inc. or its affiliates.
  *
  *
  * IDENTIFICATION
  *	    src/backend/utils/mmgr/runaway_cleaner.c
  *
  *-------------------------------------------------------------------------
  */

 #include "postgres.h"

 #include "access/xact.h"
 #include "cdb/cdbvars.h"
 #include "miscadmin.h"
 #include "port/atomics.h"
 #include "utils/faultinjector.h"
 #include "utils/resgroup.h"
 #include "utils/resource_manager.h"
 #include "utils/session_state.h"
 #include "utils/vmem_tracker.h"

 /*
  * State owned by other parts of the runaway cleanup framework. These are
  * declarations only; nothing in this file defines them.
  */
 extern bool vmemTrackerInited;
 extern bool isProcessActive;
 extern EventVersion activationVersion;
 extern EventVersion deactivationVersion;
 extern volatile uint32 *isRunawayDetector;
 extern volatile EventVersion *latestRunawayVersion;

 /*
  * The cleanupCountdown in the SessionState determines how many
  * processes we need to cleanup to declare a session clean. If it
  * reaches 0, we mark the session clean. However, -1 indicates
  * that the session is either done cleaning previous runaway event
  * or it never started a cleaning.
  */
 #define CLEANUP_COUNTDOWN_BEFORE_RUNAWAY -1

 /* The runaway version for which this process started cleaning up */
 static EventVersion beginCleanupRunawayVersion = 0;

 /* The runaway version for which this process finished cleaning up */
 static EventVersion endCleanupRunawayVersion = 0;

 /*
  * Forward declarations for every non-static function defined in this file.
  * RunawayCleaner_RunawayCleanupDoneForProcess() is called from
  * RunawayCleaner_StartCleanup() before its definition appears, so it must be
  * declared up front; RunawayCleaner_RunawayCleanupDoneForSession() is listed
  * as well so the set is complete.
  */
 void RunawayCleaner_Init(void);
 void RunawayCleaner_StartCleanup(void);
 bool RunawayCleaner_IsCleanupInProgress(void);
 void RunawayCleaner_RunawayCleanupDoneForSession(void);
 void RunawayCleaner_RunawayCleanupDoneForProcess(bool ignoredCleanup);

 /*
  * Resets this process's cleanup bookkeeping: both the "cleanup started" and
  * the "cleanup finished" runaway versions are set back to 0.
  */
 void
 RunawayCleaner_Init()
 {
 	beginCleanupRunawayVersion = endCleanupRunawayVersion = 0;
 }

 /*
  * Tells whether this process still owes a cleanup for the latest runaway
  * event. That is the case when the session is flagged as runaway, this
  * process has not yet begun cleaning up for *latestRunawayVersion, and the
  * event falls inside the window in which the process was active:
  *
  *   - active process:      activationVersion < latest runaway version
  *   - deactivated process: activationVersion < latest runaway version
  *                          and latest runaway version < deactivationVersion
  */
 static bool
 RunawayCleaner_ShouldStartRunawayCleanup()
 {
 	/* Without a session state entry nothing can be flagged as runaway */
 	if (NULL == MySessionState)
 		return false;

 	/* The session is not marked as runaway */
 	if (RunawayStatus_NotRunaway == MySessionState->runawayStatus)
 		return false;

 	/* This process already started cleaning up for the latest event */
 	if (beginCleanupRunawayVersion == *latestRunawayVersion)
 		return false;

 	/* The most recent of activation/deactivation must match the active flag */
 	AssertImply(isProcessActive, activationVersion >= deactivationVersion);
 	AssertImply(!isProcessActive, deactivationVersion >= activationVersion);

 	/*
 	 * The session is marked as runaway, so an event that precedes our
 	 * deactivation implies the version counter moved between activation and
 	 * deactivation.
 	 */
 	AssertImply(*latestRunawayVersion < deactivationVersion && !isProcessActive, activationVersion < deactivationVersion);

 	if (isProcessActive)
 	{
 		/* Active: clean up only if the event came after the activation */
 		return *latestRunawayVersion > activationVersion;
 	}

 	/*
 	 * Deactivated: clean up only for a pending event that happened after the
 	 * activation but before the deactivation.
 	 */
 	return *latestRunawayVersion < deactivationVersion &&
 			*latestRunawayVersion > activationVersion;
 }

 /*
  * Decides whether the runaway cleanup should abort the current query (true)
  * or be skipped (false). Cleanup can be attempted from several places, so
  * this checks first that raising an ERROR is both safe and useful.
  */
 static bool
 RunawayCleaner_ShouldCancelQuery()
 {
 	bool		abortIsSafe;

 	/*
 	 * Erroring out is only considered when all of the following hold; they
 	 * are evaluated in this order and evaluation stops at the first failure:
 	 *   1. the VMEM tracker is in use;
 	 *   2. we are not in a critical section and not holding off interrupts;
 	 *   3. a valid command is running (cleaning up QEs that are not
 	 *      executing one may cause the QD to get stuck [MPP-24950]);
 	 *   4. a transaction is in progress, as aborting would not release any
 	 *      more resources otherwise.
 	 */
 	abortIsSafe = vmemTrackerInited &&
 		0 == CritSectionCount &&
 		0 == InterruptHoldoffCount &&
 		0 < gp_command_count &&
 		IsTransactionState();

 	if (!abortIsSafe)
 	{
 		return false;
 	}

 	/* From here on we know a query is actively executing */

 	if (RunawayStatus_PrimaryRunawaySession != MySessionState->runawayStatus)
 	{
 		Assert(RunawayStatus_SecondaryRunawaySession == MySessionState->runawayStatus);

 		/*
 		 * This session was flagged as runaway in spite of another session
 		 * using more memory. Abort only when the current user is not a
 		 * superuser, so that critical administrative commands run as
 		 * superuser (such as database restarts) are not interrupted by the
 		 * runaway cleaner.
 		 */
 		return !superuser();
 	}

 	/* Flagged as the session consuming the most memory: abort the query */
 	return true;
 }

 /*
  * Begins the cleanup for the latest runaway event, if this process owes one.
  * When cancelling is safe (see RunawayCleaner_ShouldCancelQuery) an ERROR is
  * raised; otherwise the process is simply declared clean.
  */
 void
 RunawayCleaner_StartCleanup()
 {
 	/*
 	 * Cleanup can be attempted from multiple places, such as before
 	 * deactivating a process (if there is a pending runaway event) or
 	 * periodically from CHECK_FOR_INTERRUPTS (indirectly via
 	 * RedZoneHandler_DetectRunaway). Only one cleanup is carried out per
 	 * runaway event: starting one records the event's version in
 	 * beginCleanupRunawayVersion, and a later reentry (e.g., another
 	 * CHECK_FOR_INTERRUPTS() during cleanup) sees that version and is
 	 * rejected by the check below.
 	 */
 	if (!RunawayCleaner_ShouldStartRunawayCleanup())
 	{
 		return;
 	}

 	Assert(beginCleanupRunawayVersion < *latestRunawayVersion);
 	Assert(endCleanupRunawayVersion < *latestRunawayVersion);

 	/* Remember the event so that it is not cleaned up a second time */
 	beginCleanupRunawayVersion = *latestRunawayVersion;

 	if (RunawayCleaner_ShouldCancelQuery())
 	{
 		SIMPLE_FAULT_INJECTOR("runaway_cleanup");

 		ereport(ERROR, (errmsg("Canceling query because of high VMEM usage. Used: %dMB, available %dMB, red zone: %dMB",
 			VmemTracker_ConvertVmemChunksToMB(MySessionState->sessionVmem), VmemTracker_GetAvailableVmemMB(),
 			RedZoneHandler_GetRedZoneLimitMB()), errprintstack(true)));
 	}

 	/*
 	 * We could not error out: a critical section, a superuser, or some other
 	 * reason (such as the QE not running any valid command, i.e.,
 	 * gp_command_count is not positive). Declare this process clean.
 	 */
 	RunawayCleaner_RunawayCleanupDoneForProcess(true /* ignoredCleanup */);
 }

 /*
  * Clears the session's runaway state and re-enables the runaway detector.
  * Does nothing when the session is not currently marked as runaway.
  *
  * Note: this method should not need any additional locks. Either the
  * MySessionState entry is being released while a lock on SessionState is
  * already held, so no new runaway detector can run until that lock is
  * released; or the reset happens in a live session, in which case the
  * runaway event versioning should make every process of the session do
  * another round of cleanup if the session is detected as runaway again.
  */
 void
 RunawayCleaner_RunawayCleanupDoneForSession()
 {
 	Assert(NULL != MySessionState);

 	if (RunawayStatus_NotRunaway == MySessionState->runawayStatus)
 	{
 		return;
 	}

 	/* The last runaway cleanup must be complete by now */
 	Assert(beginCleanupRunawayVersion == endCleanupRunawayVersion);
 	Assert(*latestRunawayVersion == endCleanupRunawayVersion);
 	Assert(MySessionState->cleanupCountdown == CLEANUP_COUNTDOWN_BEFORE_RUNAWAY);

 	/* Drop the runaway flag together with the figures recorded for it */
 	MySessionState->runawayStatus = RunawayStatus_NotRunaway;
 	MySessionState->sessionVmemRunaway = 0;
 	MySessionState->commandCountRunaway = 0;

 	/* Release the exclusive detector flag so another detector can be chosen */
 	*isRunawayDetector = 0;
 }

 /*
  * Marks the current process as clean for the latest runaway event.
  *
  * The call is a no-op unless this process began a cleanup for
  * *latestRunawayVersion and has not yet recorded finishing it. Otherwise it
  * records the finish, reactivates the process if needed, decrements the
  * session's cleanupCountdown and, if the countdown reached 0, resets the
  * session's runaway status and the runaway detector flag through
  * RunawayCleaner_RunawayCleanupDoneForSession(). It ends by writing a
  * message to stderr and dumping the statistics of TopMemoryContext.
  *
  * Parameters:
  * 		ignoredCleanup: whether the cleanup was ignored, i.e., no elog(ERROR, ...)
  * 		was thrown. In such case a deactivated process is not reactivated as the
  * 		deactivation didn't get interrupted.
  */
 void
 RunawayCleaner_RunawayCleanupDoneForProcess(bool ignoredCleanup)
 {
 	/*
 	 * We don't do anything if we don't have an ongoing cleanup, or we already finished
 	 * cleanup once for the current runaway event
 	 */
 	if (beginCleanupRunawayVersion != *latestRunawayVersion ||
 			endCleanupRunawayVersion == beginCleanupRunawayVersion)
 	{
 		/* Either we never started cleanup, or we already finished */
 		return;
 	}

 	/* Record the finish first; a repeated call now returns at the check above */
 	endCleanupRunawayVersion = beginCleanupRunawayVersion;

 	Assert(NULL != MySessionState);
 	/*
 	 * As the current cleanup holds leverage on the cleanupCountdown,
 	 * the session must stay as runaway at least until the current
 	 * process marks itself clean
 	 */
 	Assert(MySessionState->runawayStatus != RunawayStatus_NotRunaway);

 	/*
 	 * We only cleanup if we were active when the runaway event happened:
 	 * either we are deactivated and the event lies between our activation
 	 * and deactivation, or we are active and the event came after activation
 	 */
 	Assert((!isProcessActive && *latestRunawayVersion < deactivationVersion &&
 			*latestRunawayVersion > activationVersion) ||
 			(*latestRunawayVersion > activationVersion &&
 			(activationVersion >= deactivationVersion && isProcessActive)));

 	/*
 	 * We don't reactivate if the process is already active or a deactivated
 	 * process never errored out during deactivation (i.e., failed to complete
 	 * deactivation)
 	 */
 	if (!isProcessActive && !ignoredCleanup)
 	{
 		Assert(1 == *isRunawayDetector);
 		Assert(0 < MySessionState->cleanupCountdown);
 		/*
 		 * As the process threw ERROR instead of going into ReadCommand() blocking
 		 * state, we have to reactivate the process from its current Deactivated
 		 * state
 		 */
 		IdleTracker_ActivateProcess();
 	}

 	Assert(0 < MySessionState->cleanupCountdown);
 	/*
 	 * Atomically take this process off the countdown. The result is captured
 	 * only when USE_ASSERT_CHECKING is set; otherwise the "int cleanProgress ="
 	 * prefix is preprocessed away and the call stands alone as a statement.
 	 */
 #if USE_ASSERT_CHECKING
 	int cleanProgress =
 #endif
 			pg_atomic_add_fetch_u32((pg_atomic_uint32 *)&MySessionState->cleanupCountdown, -1);
 	Assert(0 <= cleanProgress);

 	/*
 	 * Try to swap a countdown of exactly 0 for the "before runaway" sentinel.
 	 * The process whose compare-and-exchange succeeds is the final cleaner.
 	 */
 	uint32 expected = 0;
 	bool finalCleaner = pg_atomic_compare_exchange_u32((pg_atomic_uint32 *) &MySessionState->cleanupCountdown,
 			&expected, CLEANUP_COUNTDOWN_BEFORE_RUNAWAY);

 	if (finalCleaner)
 	{
 		/*
 		 * The final cleaner is responsible to reset the runaway flag,
 		 * and enable the runaway detection process.
 		 */
 		RunawayCleaner_RunawayCleanupDoneForSession();
 	}

 	/*
 	 * Finally we are done with all critical cleanup, which includes releasing all our memory and
 	 * releasing our cleanup counter so that another session can be marked as runaway, if needed.
 	 * Now, we have some head room to actually record our usage.
 	 *
 	 * NOTE(review): the message has no trailing newline; confirm that the
 	 * memory context dump that follows does not end up on the same line.
 	 */
 	write_stderr("Logging memory usage because of runaway cleanup. Note, this is a post-cleanup logging and may be incomplete.");
 	MemoryContextStats(TopMemoryContext);
 }

 /*
  * Reports whether this process has begun a cleanup that it has not yet
  * finished, i.e., beginCleanupRunawayVersion is ahead of
  * endCleanupRunawayVersion.
  */
 bool
 RunawayCleaner_IsCleanupInProgress()
 {
 	/* The "finished" version can never run ahead of the "started" version */
 	Assert(beginCleanupRunawayVersion >= endCleanupRunawayVersion);

 	return beginCleanupRunawayVersion > endCleanupRunawayVersion;
 }
	/*-------------------------------------------------------------------------
	*
	* runaway_cleaner.c
	* Implementation of the runaway cleaner that checks if a session is marked
	* as runaway (i.e., consuming too much vmem) by the red-zone handler
	* (redzone_handler.c). The runaway cleaner cleans up such session by triggering
	* an elog(ERROR, ...) which rolls back transaction and releases memory. Once
	* cleanup is finished, the runaway cleaner also informs the red zone handler
	* so that a new runaway session can be chosen if necessary.
	*
	* Copyright (c) 2014-Present VMware, Inc. or its affiliates.
	*
	*
	* IDENTIFICATION
	* src/backend/utils/mmgr/runaway_cleaner.c
	*
	*-------------------------------------------------------------------------
	*/

	#include "postgres.h"

	#include "access/xact.h"
	#include "cdb/cdbvars.h"
	#include "miscadmin.h"
	#include "port/atomics.h"
	#include "utils/faultinjector.h"
	#include "utils/resgroup.h"
	#include "utils/resource_manager.h"
	#include "utils/session_state.h"
	#include "utils/vmem_tracker.h"

	/*
	 * State owned by other parts of the runaway cleanup framework. These are
	 * declarations only; nothing in this file defines them.
	 */
	extern bool vmemTrackerInited;
	extern bool isProcessActive;
	extern EventVersion activationVersion;
	extern EventVersion deactivationVersion;
	extern volatile uint32 *isRunawayDetector;
	extern volatile EventVersion *latestRunawayVersion;

	/*
	 * The cleanupCountdown in the SessionState determines how many
	 * processes we need to cleanup to declare a session clean. If it
	 * reaches 0, we mark the session clean. However, -1 indicates
	 * that the session is either done cleaning previous runaway event
	 * or it never started a cleaning.
	 */
	#define CLEANUP_COUNTDOWN_BEFORE_RUNAWAY -1

	/* The runaway version for which this process started cleaning up */
	static EventVersion beginCleanupRunawayVersion = 0;

	/* The runaway version for which this process finished cleaning up */
	static EventVersion endCleanupRunawayVersion = 0;

	/*
	 * Forward declarations for every non-static function defined in this file.
	 * RunawayCleaner_RunawayCleanupDoneForProcess() is called from
	 * RunawayCleaner_StartCleanup() before its definition appears, so it must be
	 * declared up front; RunawayCleaner_RunawayCleanupDoneForSession() is listed
	 * as well so the set is complete.
	 */
	void RunawayCleaner_Init(void);
	void RunawayCleaner_StartCleanup(void);
	bool RunawayCleaner_IsCleanupInProgress(void);
	void RunawayCleaner_RunawayCleanupDoneForSession(void);
	void RunawayCleaner_RunawayCleanupDoneForProcess(bool ignoredCleanup);

	/*
	 * Resets this process's cleanup bookkeeping: both the "cleanup started" and
	 * the "cleanup finished" runaway versions are set back to 0.
	 */
	void
	RunawayCleaner_Init()
	{
		beginCleanupRunawayVersion = endCleanupRunawayVersion = 0;
	}

	/*
	 * Tells whether this process still owes a cleanup for the latest runaway
	 * event. That is the case when the session is flagged as runaway, this
	 * process has not yet begun cleaning up for *latestRunawayVersion, and the
	 * event falls inside the window in which the process was active:
	 *
	 *   - active process:      activationVersion < latest runaway version
	 *   - deactivated process: activationVersion < latest runaway version
	 *                          and latest runaway version < deactivationVersion
	 */
	static bool
	RunawayCleaner_ShouldStartRunawayCleanup()
	{
		/* Without a session state entry nothing can be flagged as runaway */
		if (NULL == MySessionState)
			return false;

		/* The session is not marked as runaway */
		if (RunawayStatus_NotRunaway == MySessionState->runawayStatus)
			return false;

		/* This process already started cleaning up for the latest event */
		if (beginCleanupRunawayVersion == *latestRunawayVersion)
			return false;

		/* The most recent of activation/deactivation must match the active flag */
		AssertImply(isProcessActive, activationVersion >= deactivationVersion);
		AssertImply(!isProcessActive, deactivationVersion >= activationVersion);

		/*
		 * The session is marked as runaway, so an event that precedes our
		 * deactivation implies the version counter moved between activation and
		 * deactivation.
		 */
		AssertImply(*latestRunawayVersion < deactivationVersion && !isProcessActive, activationVersion < deactivationVersion);

		if (isProcessActive)
		{
			/* Active: clean up only if the event came after the activation */
			return *latestRunawayVersion > activationVersion;
		}

		/*
		 * Deactivated: clean up only for a pending event that happened after the
		 * activation but before the deactivation.
		 */
		return *latestRunawayVersion < deactivationVersion &&
				*latestRunawayVersion > activationVersion;
	}

	/*
	* Determine if the runaway cleanup should be handled by aborting the current
	* query or must be ignored. Since the cleanup can be attempted from multiple
	* places, it is important to first validate if calling elog(ERROR) is safe and
	* of value.
	*/
	static bool
	RunawayCleaner_ShouldCancelQuery()
	{
	/* VMEM tracker not being used */
	if (!vmemTrackerInited)
	return false;

	/* In critical section or when holding off on handling interrupts */
	if (CritSectionCount != 0 \|\| InterruptHoldoffCount != 0)
	return false;

	/*
	* Cleaning up QEs that are not executing a valid command may cause the QD to
	* get stuck [MPP-24950]
	*/
	if (gp_command_count <= 0)
	return false;

	/*
	* If not currently executing a transaction, aborting it won't release any
	* more resources.
	*/
	if (!IsTransactionState())
	return false;

	/* Ok, we are actively executing a query */

	if (MySessionState->runawayStatus == RunawayStatus_PrimaryRunawaySession)
	{
	/*
	* Abort the query if it is actively executing and has been flagged as
	* consuming the most memory
	*/
	return true;
	}
	else
	{
	Assert(MySessionState->runawayStatus == RunawayStatus_SecondaryRunawaySession);

	/*
	* If this process was flagged as a runaway session inspite another session
	* using more memory, only abort this query if the current user is not a
	* superuser. This is to ensure that critical administrative commands (such
	* as database restarts), which are done as superuser, are not interrupted
	* by the runaway cleaner.
	*/
	return !superuser();
	}
	}

	/*
	 * Begins the cleanup for the latest runaway event, if this process owes one.
	 * When cancelling is safe (see RunawayCleaner_ShouldCancelQuery) an ERROR is
	 * raised; otherwise the process is simply declared clean.
	 */
	void
	RunawayCleaner_StartCleanup()
	{
		/*
		 * Cleanup can be attempted from multiple places, such as before
		 * deactivating a process (if there is a pending runaway event) or
		 * periodically from CHECK_FOR_INTERRUPTS (indirectly via
		 * RedZoneHandler_DetectRunaway). Only one cleanup is carried out per
		 * runaway event: starting one records the event's version in
		 * beginCleanupRunawayVersion, and a later reentry (e.g., another
		 * CHECK_FOR_INTERRUPTS() during cleanup) sees that version and is
		 * rejected by the check below.
		 */
		if (!RunawayCleaner_ShouldStartRunawayCleanup())
		{
			return;
		}

		Assert(beginCleanupRunawayVersion < *latestRunawayVersion);
		Assert(endCleanupRunawayVersion < *latestRunawayVersion);

		/* Remember the event so that it is not cleaned up a second time */
		beginCleanupRunawayVersion = *latestRunawayVersion;

		if (RunawayCleaner_ShouldCancelQuery())
		{
			SIMPLE_FAULT_INJECTOR("runaway_cleanup");

			ereport(ERROR, (errmsg("Canceling query because of high VMEM usage. Used: %dMB, available %dMB, red zone: %dMB",
				VmemTracker_ConvertVmemChunksToMB(MySessionState->sessionVmem), VmemTracker_GetAvailableVmemMB(),
				RedZoneHandler_GetRedZoneLimitMB()), errprintstack(true)));
		}

		/*
		 * We could not error out: a critical section, a superuser, or some other
		 * reason (such as the QE not running any valid command, i.e.,
		 * gp_command_count is not positive). Declare this process clean.
		 */
		RunawayCleaner_RunawayCleanupDoneForProcess(true /* ignoredCleanup */);
	}

	/*
	 * Clears the session's runaway state and re-enables the runaway detector.
	 * Does nothing when the session is not currently marked as runaway.
	 *
	 * Note: this method should not need any additional locks. Either the
	 * MySessionState entry is being released while a lock on SessionState is
	 * already held, so no new runaway detector can run until that lock is
	 * released; or the reset happens in a live session, in which case the
	 * runaway event versioning should make every process of the session do
	 * another round of cleanup if the session is detected as runaway again.
	 */
	void
	RunawayCleaner_RunawayCleanupDoneForSession()
	{
		Assert(NULL != MySessionState);

		if (RunawayStatus_NotRunaway == MySessionState->runawayStatus)
		{
			return;
		}

		/* The last runaway cleanup must be complete by now */
		Assert(beginCleanupRunawayVersion == endCleanupRunawayVersion);
		Assert(*latestRunawayVersion == endCleanupRunawayVersion);
		Assert(MySessionState->cleanupCountdown == CLEANUP_COUNTDOWN_BEFORE_RUNAWAY);

		/* Drop the runaway flag together with the figures recorded for it */
		MySessionState->runawayStatus = RunawayStatus_NotRunaway;
		MySessionState->sessionVmemRunaway = 0;
		MySessionState->commandCountRunaway = 0;

		/* Release the exclusive detector flag so another detector can be chosen */
		*isRunawayDetector = 0;
	}

	/*
	 * Marks the current process as clean for the latest runaway event.
	 *
	 * The call is a no-op unless this process began a cleanup for
	 * *latestRunawayVersion and has not yet recorded finishing it. Otherwise it
	 * records the finish, reactivates the process if needed, decrements the
	 * session's cleanupCountdown and, if the countdown reached 0, resets the
	 * session's runaway status and the runaway detector flag through
	 * RunawayCleaner_RunawayCleanupDoneForSession(). It ends by writing a
	 * message to stderr and dumping the statistics of TopMemoryContext.
	 *
	 * Parameters:
	 * 		ignoredCleanup: whether the cleanup was ignored, i.e., no elog(ERROR, ...)
	 * 		was thrown. In such case a deactivated process is not reactivated as the
	 * 		deactivation didn't get interrupted.
	 */
	void
	RunawayCleaner_RunawayCleanupDoneForProcess(bool ignoredCleanup)
	{
		/*
		 * We don't do anything if we don't have an ongoing cleanup, or we already finished
		 * cleanup once for the current runaway event
		 */
		if (beginCleanupRunawayVersion != *latestRunawayVersion ||
				endCleanupRunawayVersion == beginCleanupRunawayVersion)
		{
			/* Either we never started cleanup, or we already finished */
			return;
		}

		/* Record the finish first; a repeated call now returns at the check above */
		endCleanupRunawayVersion = beginCleanupRunawayVersion;

		Assert(NULL != MySessionState);
		/*
		 * As the current cleanup holds leverage on the cleanupCountdown,
		 * the session must stay as runaway at least until the current
		 * process marks itself clean
		 */
		Assert(MySessionState->runawayStatus != RunawayStatus_NotRunaway);

		/*
		 * We only cleanup if we were active when the runaway event happened:
		 * either we are deactivated and the event lies between our activation
		 * and deactivation, or we are active and the event came after activation
		 */
		Assert((!isProcessActive && *latestRunawayVersion < deactivationVersion &&
				*latestRunawayVersion > activationVersion) ||
				(*latestRunawayVersion > activationVersion &&
				(activationVersion >= deactivationVersion && isProcessActive)));

		/*
		 * We don't reactivate if the process is already active or a deactivated
		 * process never errored out during deactivation (i.e., failed to complete
		 * deactivation)
		 */
		if (!isProcessActive && !ignoredCleanup)
		{
			Assert(1 == *isRunawayDetector);
			Assert(0 < MySessionState->cleanupCountdown);
			/*
			 * As the process threw ERROR instead of going into ReadCommand() blocking
			 * state, we have to reactivate the process from its current Deactivated
			 * state
			 */
			IdleTracker_ActivateProcess();
		}

		Assert(0 < MySessionState->cleanupCountdown);
		/*
		 * Atomically take this process off the countdown. The result is captured
		 * only when USE_ASSERT_CHECKING is set; otherwise the "int cleanProgress ="
		 * prefix is preprocessed away and the call stands alone as a statement.
		 */
	#if USE_ASSERT_CHECKING
		int cleanProgress =
	#endif
				pg_atomic_add_fetch_u32((pg_atomic_uint32 *)&MySessionState->cleanupCountdown, -1);
		Assert(0 <= cleanProgress);

		/*
		 * Try to swap a countdown of exactly 0 for the "before runaway" sentinel.
		 * The process whose compare-and-exchange succeeds is the final cleaner.
		 */
		uint32 expected = 0;
		bool finalCleaner = pg_atomic_compare_exchange_u32((pg_atomic_uint32 *) &MySessionState->cleanupCountdown,
				&expected, CLEANUP_COUNTDOWN_BEFORE_RUNAWAY);

		if (finalCleaner)
		{
			/*
			 * The final cleaner is responsible to reset the runaway flag,
			 * and enable the runaway detection process.
			 */
			RunawayCleaner_RunawayCleanupDoneForSession();
		}

		/*
		 * Finally we are done with all critical cleanup, which includes releasing all our memory and
		 * releasing our cleanup counter so that another session can be marked as runaway, if needed.
		 * Now, we have some head room to actually record our usage.
		 *
		 * NOTE(review): the message has no trailing newline; confirm that the
		 * memory context dump that follows does not end up on the same line.
		 */
		write_stderr("Logging memory usage because of runaway cleanup. Note, this is a post-cleanup logging and may be incomplete.");
		MemoryContextStats(TopMemoryContext);
	}

	/*
	 * Reports whether this process has begun a cleanup that it has not yet
	 * finished, i.e., beginCleanupRunawayVersion is ahead of
	 * endCleanupRunawayVersion.
	 */
	bool
	RunawayCleaner_IsCleanupInProgress()
	{
		/* The "finished" version can never run ahead of the "started" version */
		Assert(beginCleanupRunawayVersion >= endCleanupRunawayVersion);

		return beginCleanupRunawayVersion > endCleanupRunawayVersion;
	}