src/backend/nodes/queryjumblefuncs.c - cloudberry - Git at Google

 /*-------------------------------------------------------------------------
  *
  * queryjumblefuncs.c
  *	 Query normalization and fingerprinting.
  *
  * Normalization is a process whereby similar queries, typically differing only
  * in their constants (though the exact rules are somewhat more subtle than
  * that) are recognized as equivalent, and are tracked as a single entry.  This
  * is particularly useful for non-prepared queries.
  *
  * Normalization is implemented by fingerprinting queries, selectively
  * serializing those fields of each query tree's nodes that are judged to be
  * essential to the query.  This is referred to as a query jumble.  This is
  * distinct from a regular serialization in that various extraneous
  * information is ignored as irrelevant or not essential to the query, such
  * as the collations of Vars and, most notably, the values of constants.
  *
  * This jumble is acquired at the end of parse analysis of each query, and
  * a 64-bit hash of it is stored into the query's Query.queryId field.
  * The server then copies this value around, making it available in plan
  * tree(s) generated from the query.  The executor can then use this value
  * to blame query costs on the proper queryId.
  *
  * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  *
  * IDENTIFICATION
  *	  src/backend/nodes/queryjumblefuncs.c
  *
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"

 #include "common/hashfn.h"
 #include "miscadmin.h"
 #include "nodes/queryjumble.h"
 #include "parser/scansup.h"

 /*
  * Size in bytes of the jumble buffer allocated in JumbleQuery().  When the
  * buffer fills up, AppendJumble() replaces its contents with a hash of what
  * has been accumulated so far and continues appending after that hash.
  */
 #define JUMBLE_SIZE				1024	/* query serialization buffer size */

 /* GUC parameters */
 int			compute_query_id = COMPUTE_QUERY_ID_AUTO;

 /* True when compute_query_id is ON, or AUTO and a module requests them */
 bool		query_id_enabled = false;

 /*
  * Forward declarations of the file-local helpers defined further down.
  * NOTE(review): _jumbleA_Const and _jumbleRangeTblEntry are not called
  * directly in this file; presumably the included generated code refers to
  * them -- confirm against queryjumblefuncs.switch.c.
  */
 static void AppendJumble(JumbleState *jstate,
 						 const unsigned char *item, Size size);
 static void RecordConstLocation(JumbleState *jstate, int location);
 static void _jumbleNode(JumbleState *jstate, Node *node);
 static void _jumbleA_Const(JumbleState *jstate, Node *node);
 static void _jumbleList(JumbleState *jstate, Node *node);
 static void _jumbleRangeTblEntry(JumbleState *jstate, Node *node);

 /*
  * CleanQuerytext
  *		Narrow a possibly multi-statement source string down to the part
  *		that is relevant.
  *
  * On entry, *location and *len hold the offset and length of the statement
  * within "query".  A negative location means the position is unknown, and a
  * length of zero or less means "up to the end of the string".  On return,
  * both are updated to describe the statement with leading and trailing
  * whitespace removed, and the returned pointer addresses its first byte.
  */
 const char *
 CleanQuerytext(const char *query, int *location, int *len)
 {
 	int			start = *location;
 	int			nbytes = *len;

 	if (start < 0)
 	{
 		/* Unknown start position: do not trust the supplied length either. */
 		start = 0;
 		nbytes = strlen(query);
 	}
 	else
 	{
 		/* Skip ahead to the start of the statement. */
 		Assert(start <= strlen(query));
 		query += start;

 		/* A non-positive length stands for "the rest of the string". */
 		if (nbytes > 0)
 		{
 			Assert(nbytes <= strlen(query));
 		}
 		else
 		{
 			nbytes = strlen(query);
 		}
 	}

 	/*
 	 * Trim whitespace at both ends.  scanner_isspace() is used rather than
 	 * libc's isspace() so that the result matches the lexer's behavior.
 	 */
 	while (nbytes > 0 && scanner_isspace(query[0]))
 	{
 		query++;
 		start++;
 		nbytes--;
 	}
 	while (nbytes > 0 && scanner_isspace(query[nbytes - 1]))
 	{
 		nbytes--;
 	}

 	*location = start;
 	*len = nbytes;

 	return query;
 }

 /*
  * JumbleQuery
  *		Jumble the given Query tree, hash the result and store the hash in
  *		query->queryId.
  *
  * Returns the palloc'd JumbleState used for the computation, which holds the
  * constant locations recorded during the walk and the highest external
  * parameter id seen.  Query id computation must be enabled (asserted).
  */
 JumbleState *
 JumbleQuery(Query *query)
 {
 	JumbleState *jstate;

 	Assert(IsQueryIdEnabled());

 	/* Allocate the state and its two work areas. */
 	jstate = (JumbleState *) palloc(sizeof(JumbleState));

 	jstate->jumble = (unsigned char *) palloc(JUMBLE_SIZE);
 	jstate->jumble_len = 0;

 	jstate->clocations_buf_size = 32;
 	jstate->clocations_count = 0;
 	jstate->clocations = (LocationLen *)
 		palloc(jstate->clocations_buf_size * sizeof(LocationLen));

 	jstate->highest_extern_param_id = 0;

 	/* Walk the tree, then hash whatever ended up in the jumble buffer. */
 	_jumbleNode(jstate, (Node *) query);
 	query->queryId =
 		DatumGetUInt64(hash_any_extended(jstate->jumble,
 										 jstate->jumble_len,
 										 0));

 	/*
 	 * A hash of zero is replaced: utility statements get 2, everything else
 	 * gets 1.
 	 */
 	if (query->queryId == UINT64CONST(0))
 		query->queryId = query->utilityStmt ? UINT64CONST(2) : UINT64CONST(1);

 	return jstate;
 }

 /*
  * EnableQueryId
  *		Request that query identifiers be computed.
  *
  * Intended for third-party plugins that need a query identifier.  The
  * request is honored unless compute_query_id is set to OFF.
  */
 void
 EnableQueryId(void)
 {
 	/* An explicit OFF setting wins over any module's request. */
 	if (compute_query_id == COMPUTE_QUERY_ID_OFF)
 		return;

 	query_id_enabled = true;
 }

 /*
  * AppendJumble
  *		Add "size" bytes starting at "item" to the jumble being built in
  *		jstate.
  *
  * The jumble buffer holds JUMBLE_SIZE bytes.  Each time it becomes full, its
  * contents are hashed and the buffer is restarted with just that hash value,
  * so the hash stands in for everything appended up to that point.
  */
 static void
 AppendJumble(JumbleState *jstate, const unsigned char *item, Size size)
 {
 	unsigned char *buf = jstate->jumble;
 	Size		used = jstate->jumble_len;

 	while (size > 0)
 	{
 		Size		room;
 		Size		chunk;

 		/* Buffer full: collapse it into a hash before adding more. */
 		if (used >= JUMBLE_SIZE)
 		{
 			uint64		digest;

 			digest = DatumGetUInt64(hash_any_extended(buf, JUMBLE_SIZE, 0));
 			memcpy(buf, &digest, sizeof(digest));
 			used = sizeof(digest);
 		}

 		/* Copy as much of the remaining input as currently fits. */
 		room = JUMBLE_SIZE - used;
 		chunk = (size < room) ? size : room;
 		memcpy(buf + used, item, chunk);

 		used += chunk;
 		item += chunk;
 		size -= chunk;
 	}

 	jstate->jumble_len = used;
 }

 /*
  * RecordConstLocation
  *		Remember the query-string location of a constant found in the tree
  *		that is currently being walked.
  *
  * Negative locations (unknown or undefined) are ignored.  The length of each
  * recorded entry is set to -1 to simplify third-party module usage.
  */
 static void
 RecordConstLocation(JumbleState *jstate, int location)
 {
 	LocationLen *slot;

 	/* Nothing to record when the location is not known. */
 	if (location < 0)
 		return;

 	/* Double the array once every allocated entry is in use. */
 	if (jstate->clocations_count >= jstate->clocations_buf_size)
 	{
 		jstate->clocations_buf_size *= 2;
 		jstate->clocations = (LocationLen *)
 			repalloc(jstate->clocations,
 					 jstate->clocations_buf_size * sizeof(LocationLen));
 	}

 	slot = &jstate->clocations[jstate->clocations_count];
 	slot->location = location;
 	slot->length = -1;
 	jstate->clocations_count++;
 }

 /*
  * Helper macros for the jumbling functions.
  *
  * All of them expand to code that refers to a local variable named "jstate"
  * (the JumbleState pointer), and all except JUMBLE_FIELD_SINGLE also refer
  * to a local named "expr" (a pointer to the node being jumbled).  They can
  * therefore only be used inside functions that declare those locals.
  */

 /* Recurse into the child node stored in field "item" of expr. */
 #define JUMBLE_NODE(item) \
 	_jumbleNode(jstate, (Node *) expr->item)
 /* Record the constant location stored in the named field of expr. */
 #define JUMBLE_LOCATION(location) \
 	RecordConstLocation(jstate, expr->location)
 /* Append the raw bytes of field "item" of expr to the jumble. */
 #define JUMBLE_FIELD(item) \
 	AppendJumble(jstate, (const unsigned char *) &(expr->item), sizeof(expr->item))
 /* Append the raw bytes of an arbitrary lvalue (not a field of expr). */
 #define JUMBLE_FIELD_SINGLE(item) \
 	AppendJumble(jstate, (const unsigned char *) &(item), sizeof(item))
 /*
  * Append the string field "str" of expr, including its terminating NUL
  * byte.  A NULL string pointer appends nothing.
  */
 #define JUMBLE_STRING(str) \
 do { \
 	if (expr->str) \
 		AppendJumble(jstate, (const unsigned char *) (expr->str), strlen(expr->str) + 1); \
 } while(0)

 /*
  * NOTE(review): presumably the generated per-node-type jumbling functions,
  * which would rely on the macros above -- confirm against the build output.
  */
 #include "queryjumblefuncs.funcs.c"

 /*
  * _jumbleNode
  *		Jumble one node, dispatching on its node tag.
  *
  * A NULL node adds nothing.  For any other node the tag is appended first,
  * then the tag-specific code is run: the cases pulled in from
  * queryjumblefuncs.switch.c, or _jumbleList() for the list node types.
  * An unknown tag only raises a WARNING.
  */
 static void
 _jumbleNode(JumbleState *jstate, Node *node)
 {
 	Node	   *expr = node;

 	if (expr == NULL)
 		return;

 	/* Overly complex expressions could otherwise overflow the stack. */
 	check_stack_depth();

 	/* The node tag always goes into the jumble first. */
 	JUMBLE_FIELD(type);

 	switch (nodeTag(expr))
 	{
 #include "queryjumblefuncs.switch.c"

 		case T_List:
 		case T_IntList:
 		case T_OidList:
 		case T_XidList:
 			_jumbleList(jstate, expr);
 			break;

 		default:
 			/* Not fatal: jumbling can carry on without this node. */
 			elog(WARNING, "unrecognized node type: %d",
 				 (int) nodeTag(expr));
 			break;
 	}

 	/*
 	 * Extra bookkeeping that lives outside the automated code: keep track of
 	 * the highest external parameter id seen, in order to start
 	 * normalization correctly.
 	 */
 	if (nodeTag(expr) == T_Param)
 	{
 		Param	   *param = (Param *) node;

 		if (param->paramkind == PARAM_EXTERN &&
 			jstate->highest_extern_param_id < param->paramid)
 		{
 			jstate->highest_extern_param_id = param->paramid;
 		}
 	}
 }

 /*
  * _jumbleList
  *		Jumble every element of a list node.
  *
  * Elements of a T_List are jumbled recursively as nodes.  For integer, OID
  * and XID lists the raw element values are appended.  Any other list type
  * raises an ERROR.
  */
 static void
 _jumbleList(JumbleState *jstate, Node *node)
 {
 	List	   *expr = (List *) node;
 	ListCell   *cell;

 	if (expr->type == T_List)
 	{
 		foreach(cell, expr)
 		{
 			_jumbleNode(jstate, lfirst(cell));
 		}
 	}
 	else if (expr->type == T_IntList)
 	{
 		foreach(cell, expr)
 		{
 			JUMBLE_FIELD_SINGLE(lfirst_int(cell));
 		}
 	}
 	else if (expr->type == T_OidList)
 	{
 		foreach(cell, expr)
 		{
 			JUMBLE_FIELD_SINGLE(lfirst_oid(cell));
 		}
 	}
 	else if (expr->type == T_XidList)
 	{
 		foreach(cell, expr)
 		{
 			JUMBLE_FIELD_SINGLE(lfirst_xid(cell));
 		}
 	}
 	else
 	{
 		elog(ERROR, "unrecognized list node type: %d",
 			 (int) expr->type);
 	}
 }

 /*
  * _jumbleA_Const
  *		Jumble an A_Const node.
  *
  * The isnull flag is always appended.  For a non-null constant, the tag of
  * the embedded value node is appended next, followed by the value itself:
  * raw bytes for integers and booleans, the NUL-terminated string for float,
  * string and bit-string values.  Any other value tag raises an ERROR.
  */
 static void
 _jumbleA_Const(JumbleState *jstate, Node *node)
 {
 	A_Const    *expr = (A_Const *) node;

 	JUMBLE_FIELD(isnull);

 	/* A NULL constant carries no value to jumble. */
 	if (expr->isnull)
 		return;

 	JUMBLE_FIELD(val.node.type);

 	switch (nodeTag(&expr->val))
 	{
 		case T_Boolean:
 			JUMBLE_FIELD(val.boolval.boolval);
 			break;

 		case T_Integer:
 			JUMBLE_FIELD(val.ival.ival);
 			break;

 		case T_Float:
 			JUMBLE_STRING(val.fval.fval);
 			break;

 		case T_String:
 			JUMBLE_STRING(val.sval.sval);
 			break;

 		case T_BitString:
 			JUMBLE_STRING(val.bsval.bsval);
 			break;

 		default:
 			elog(ERROR, "unrecognized node type: %d",
 				 (int) nodeTag(&expr->val));
 			break;
 	}
 }

 /*
  * _jumbleRangeTblEntry
  *		Jumble a RangeTblEntry node.
  *
  * The rtekind is always appended; after that only the fields listed for the
  * entry's kind are jumbled.  An unknown kind raises an ERROR.
  */
 static void
 _jumbleRangeTblEntry(JumbleState *jstate, Node *node)
 {
 	RangeTblEntry *expr = (RangeTblEntry *) node;

 	JUMBLE_FIELD(rtekind);
 	switch (expr->rtekind)
 	{
 		case RTE_RELATION:
 			/* relation OID, optional TABLESAMPLE clause, inheritance flag */
 			JUMBLE_FIELD(relid);
 			JUMBLE_NODE(tablesample);
 			JUMBLE_FIELD(inh);
 			break;
 		case RTE_SUBQUERY:
 			/* recurse into the sub-Query */
 			JUMBLE_NODE(subquery);
 			break;
 		case RTE_JOIN:
 			JUMBLE_FIELD(jointype);
 			break;
 		case RTE_FUNCTION:
 			JUMBLE_NODE(functions);
 			break;
 		case RTE_TABLEFUNC:
 			JUMBLE_NODE(tablefunc);
 			break;
 		case RTE_VALUES:
 			JUMBLE_NODE(values_lists);
 			break;
 		case RTE_CTE:

 			/*
 			 * Depending on the CTE name here isn't ideal, but it's the only
 			 * info we have to identify the referenced WITH item.
 			 */
 			JUMBLE_STRING(ctename);
 			JUMBLE_FIELD(ctelevelsup);
 			break;
 		case RTE_NAMEDTUPLESTORE:
 			JUMBLE_STRING(enrname);
 			break;
 		case RTE_RESULT:
 			/* nothing beyond the rtekind itself */
 			break;
 		default:
 			elog(ERROR, "unrecognized RTE kind: %d", (int) expr->rtekind);
 			break;
 	}
 }
	/*-------------------------------------------------------------------------
	*
	* queryjumblefuncs.c
	* Query normalization and fingerprinting.
	*
	* Normalization is a process whereby similar queries, typically differing only
	* in their constants (though the exact rules are somewhat more subtle than
	* that) are recognized as equivalent, and are tracked as a single entry. This
	* is particularly useful for non-prepared queries.
	*
	* Normalization is implemented by fingerprinting queries, selectively
	* serializing those fields of each query tree's nodes that are judged to be
	* essential to the query. This is referred to as a query jumble. This is
	* distinct from a regular serialization in that various extraneous
	* information is ignored as irrelevant or not essential to the query, such
	* as the collations of Vars and, most notably, the values of constants.
	*
	* This jumble is acquired at the end of parse analysis of each query, and
	* a 64-bit hash of it is stored into the query's Query.queryId field.
	* The server then copies this value around, making it available in plan
	* tree(s) generated from the query. The executor can then use this value
	* to blame query costs on the proper queryId.
	*
	* Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
	* Portions Copyright (c) 1994, Regents of the University of California
	*
	*
	* IDENTIFICATION
	* src/backend/nodes/queryjumblefuncs.c
	*
	*-------------------------------------------------------------------------
	*/
	#include "postgres.h"

	#include "common/hashfn.h"
	#include "miscadmin.h"
	#include "nodes/queryjumble.h"
	#include "parser/scansup.h"

	/*
	 * Size in bytes of the jumble buffer allocated in JumbleQuery().  When the
	 * buffer fills up, AppendJumble() replaces its contents with a hash of what
	 * has been accumulated so far and continues appending after that hash.
	 */
	#define JUMBLE_SIZE 1024 /* query serialization buffer size */

	/* GUC parameters */
	int compute_query_id = COMPUTE_QUERY_ID_AUTO;

	/* True when compute_query_id is ON, or AUTO and a module requests them */
	bool query_id_enabled = false;

	/*
	 * Forward declarations of the file-local helpers.  Both the jumble state
	 * and the node are passed by pointer: the helpers dereference them with
	 * "->" and test the node against NULL.
	 */
	static void AppendJumble(JumbleState *jstate,
							 const unsigned char *item, Size size);
	static void RecordConstLocation(JumbleState *jstate, int location);
	static void _jumbleNode(JumbleState *jstate, Node *node);
	static void _jumbleA_Const(JumbleState *jstate, Node *node);
	static void _jumbleList(JumbleState *jstate, Node *node);
	static void _jumbleRangeTblEntry(JumbleState *jstate, Node *node);

	/*
	 * Given a possibly multi-statement source string, confine our attention to
	 * the relevant part of the string.
	 *
	 * query:    the full source text.
	 * location: in/out; offset of the statement within query, or a negative
	 *           value when unknown.
	 * len:      in/out; length of the statement, where zero or less means
	 *           "rest of string".
	 *
	 * Returns a pointer to the first byte of the statement after leading
	 * whitespace has been skipped; *location and *len are updated to match,
	 * with trailing whitespace excluded from *len.
	 */
	const char *
	CleanQuerytext(const char *query, int *location, int *len)
	{
		int			query_location = *location;
		int			query_len = *len;

		/* First apply starting offset, unless it's -1 (unknown). */
		if (query_location >= 0)
		{
			Assert(query_location <= strlen(query));
			query += query_location;
			/* Length of 0 (or -1) means "rest of string" */
			if (query_len <= 0)
				query_len = strlen(query);
			else
				Assert(query_len <= strlen(query));
		}
		else
		{
			/* If query location is unknown, distrust query_len as well */
			query_location = 0;
			query_len = strlen(query);
		}

		/*
		 * Discard leading and trailing whitespace, too.  Use scanner_isspace()
		 * not libc's isspace(), because we want to match the lexer's behavior.
		 */
		while (query_len > 0 && scanner_isspace(query[0]))
			query++, query_location++, query_len--;
		while (query_len > 0 && scanner_isspace(query[query_len - 1]))
			query_len--;

		*location = query_location;
		*len = query_len;

		return query;
	}

	/*
	 * JumbleQuery
	 *		Jumble the given Query tree, hash the result and store the hash in
	 *		query->queryId.
	 *
	 * Returns the palloc'd JumbleState used for the computation, which holds
	 * the constant locations recorded during the walk and the highest external
	 * parameter id seen.  Query id computation must be enabled (asserted).
	 */
	JumbleState *
	JumbleQuery(Query *query)
	{
		JumbleState *jstate;

		Assert(IsQueryIdEnabled());

		/* Allocate the state and its two work areas. */
		jstate = (JumbleState *) palloc(sizeof(JumbleState));

		jstate->jumble = (unsigned char *) palloc(JUMBLE_SIZE);
		jstate->jumble_len = 0;

		jstate->clocations_buf_size = 32;
		jstate->clocations_count = 0;
		jstate->clocations = (LocationLen *)
			palloc(jstate->clocations_buf_size * sizeof(LocationLen));

		jstate->highest_extern_param_id = 0;

		/* Walk the tree, then hash whatever ended up in the jumble buffer. */
		_jumbleNode(jstate, (Node *) query);
		query->queryId =
			DatumGetUInt64(hash_any_extended(jstate->jumble,
											 jstate->jumble_len,
											 0));

		/*
		 * A hash of zero is replaced: utility statements get 2, everything
		 * else gets 1.
		 */
		if (query->queryId == UINT64CONST(0))
			query->queryId = query->utilityStmt ? UINT64CONST(2) : UINT64CONST(1);

		return jstate;
	}

	/*
	 * EnableQueryId
	 *		Request that query identifiers be computed.
	 *
	 * Intended for third-party plugins that need a query identifier.  The
	 * request is honored unless compute_query_id is set to OFF.
	 */
	void
	EnableQueryId(void)
	{
		/* An explicit OFF setting wins over any module's request. */
		if (compute_query_id == COMPUTE_QUERY_ID_OFF)
			return;

		query_id_enabled = true;
	}

	/*
	 * AppendJumble: Append a value that is substantive in a given query to
	 * the current jumble.
	 *
	 * jstate: jumble state whose buffer receives the bytes.
	 * item:   pointer to the first byte to append.
	 * size:   number of bytes to append.
	 */
	static void
	AppendJumble(JumbleState *jstate, const unsigned char *item, Size size)
	{
		unsigned char *jumble = jstate->jumble;
		Size		jumble_len = jstate->jumble_len;

		/*
		 * Whenever the jumble buffer is full, we hash the current contents and
		 * reset the buffer to contain just that hash value, thus relying on the
		 * hash to summarize everything so far.
		 */
		while (size > 0)
		{
			Size		part_size;

			if (jumble_len >= JUMBLE_SIZE)
			{
				uint64		start_hash;

				start_hash = DatumGetUInt64(hash_any_extended(jumble,
															  JUMBLE_SIZE, 0));
				memcpy(jumble, &start_hash, sizeof(start_hash));
				jumble_len = sizeof(start_hash);
			}

			/* Copy only as much of the input as fits in the free space. */
			part_size = Min(size, JUMBLE_SIZE - jumble_len);
			memcpy(jumble + jumble_len, item, part_size);
			jumble_len += part_size;
			item += part_size;
			size -= part_size;
		}
		jstate->jumble_len = jumble_len;
	}

	/*
	 * RecordConstLocation
	 *		Remember the query-string location of a constant found in the tree
	 *		that is currently being walked.
	 *
	 * Negative locations (unknown or undefined) are ignored.  The length of
	 * each recorded entry is set to -1 to simplify third-party module usage.
	 */
	static void
	RecordConstLocation(JumbleState *jstate, int location)
	{
		LocationLen *slot;

		/* Nothing to record when the location is not known. */
		if (location < 0)
			return;

		/* Double the array once every allocated entry is in use. */
		if (jstate->clocations_count >= jstate->clocations_buf_size)
		{
			jstate->clocations_buf_size *= 2;
			jstate->clocations = (LocationLen *)
				repalloc(jstate->clocations,
						 jstate->clocations_buf_size * sizeof(LocationLen));
		}

		slot = &jstate->clocations[jstate->clocations_count];
		slot->location = location;
		slot->length = -1;
		jstate->clocations_count++;
	}

	/*
	 * Helper macros for the jumbling functions.
	 *
	 * All of them expand to code that refers to a local variable named "jstate"
	 * (the JumbleState pointer), and all except JUMBLE_FIELD_SINGLE also refer
	 * to a local named "expr" (a pointer to the node being jumbled).  They can
	 * therefore only be used inside functions that declare those locals.
	 */

	/* Recurse into the child node stored in field "item" of expr. */
	#define JUMBLE_NODE(item) \
	_jumbleNode(jstate, (Node *) expr->item)
	/* Record the constant location stored in the named field of expr. */
	#define JUMBLE_LOCATION(location) \
	RecordConstLocation(jstate, expr->location)
	/* Append the raw bytes of field "item" of expr to the jumble. */
	#define JUMBLE_FIELD(item) \
	AppendJumble(jstate, (const unsigned char *) &(expr->item), sizeof(expr->item))
	/* Append the raw bytes of an arbitrary lvalue (not a field of expr). */
	#define JUMBLE_FIELD_SINGLE(item) \
	AppendJumble(jstate, (const unsigned char *) &(item), sizeof(item))
	/*
	 * Append the string field "str" of expr, including its terminating NUL
	 * byte.  A NULL string pointer appends nothing.
	 */
	#define JUMBLE_STRING(str) \
	do { \
	if (expr->str) \
	AppendJumble(jstate, (const unsigned char *) (expr->str), strlen(expr->str) + 1); \
	} while(0)

	/*
	 * NOTE(review): presumably the generated per-node-type jumbling functions,
	 * which would rely on the macros above -- confirm against the build output.
	 */
	#include "queryjumblefuncs.funcs.c"

	/*
	 * _jumbleNode
	 *		Jumble one node, dispatching on its node tag.
	 *
	 * jstate: jumble state being built.
	 * node:   node to jumble; a NULL pointer adds nothing.
	 */
	static void
	_jumbleNode(JumbleState *jstate, Node *node)
	{
		Node	   *expr = node;

		if (expr == NULL)
			return;

		/* Guard against stack overflow due to overly complex expressions */
		check_stack_depth();

		/*
		 * We always emit the node's NodeTag, then any additional fields that
		 * are considered significant, and then we recurse to any child nodes.
		 */
		JUMBLE_FIELD(type);

		switch (nodeTag(expr))
		{
	#include "queryjumblefuncs.switch.c"

			case T_List:
			case T_IntList:
			case T_OidList:
			case T_XidList:
				_jumbleList(jstate, expr);
				break;

			default:
				/* Only a warning, since we can stumble along anyway */
				elog(WARNING, "unrecognized node type: %d",
					 (int) nodeTag(expr));
				break;
		}

		/* Special cases to handle outside the automated code */
		switch (nodeTag(expr))
		{
			case T_Param:
				{
					Param	   *p = (Param *) node;

					/*
					 * Update the highest Param id seen, in order to start
					 * normalization correctly.
					 */
					if (p->paramkind == PARAM_EXTERN &&
						p->paramid > jstate->highest_extern_param_id)
						jstate->highest_extern_param_id = p->paramid;
				}
				break;
			default:
				break;
		}
	}

	/*
	 * _jumbleList
	 *		Jumble every element of a list node.
	 *
	 * Elements of a T_List are jumbled recursively as nodes.  For integer, OID
	 * and XID lists the raw element values are appended.  Any other list type
	 * raises an ERROR.
	 */
	static void
	_jumbleList(JumbleState *jstate, Node *node)
	{
		List	   *expr = (List *) node;
		ListCell   *l;

		switch (expr->type)
		{
			case T_List:
				foreach(l, expr)
					_jumbleNode(jstate, lfirst(l));
				break;
			case T_IntList:
				foreach(l, expr)
					JUMBLE_FIELD_SINGLE(lfirst_int(l));
				break;
			case T_OidList:
				foreach(l, expr)
					JUMBLE_FIELD_SINGLE(lfirst_oid(l));
				break;
			case T_XidList:
				foreach(l, expr)
					JUMBLE_FIELD_SINGLE(lfirst_xid(l));
				break;
			default:
				elog(ERROR, "unrecognized list node type: %d",
					 (int) expr->type);
				return;
		}
	}

	/*
	 * _jumbleA_Const
	 *		Jumble an A_Const node.
	 *
	 * The isnull flag is always appended.  For a non-null constant, the tag of
	 * the embedded value node is appended next, followed by the value itself:
	 * raw bytes for integers and booleans, the NUL-terminated string for float,
	 * string and bit-string values.  Any other value tag raises an ERROR.
	 */
	static void
	_jumbleA_Const(JumbleState *jstate, Node *node)
	{
		A_Const    *expr = (A_Const *) node;

		JUMBLE_FIELD(isnull);
		if (!expr->isnull)
		{
			JUMBLE_FIELD(val.node.type);
			switch (nodeTag(&expr->val))
			{
				case T_Integer:
					JUMBLE_FIELD(val.ival.ival);
					break;
				case T_Float:
					JUMBLE_STRING(val.fval.fval);
					break;
				case T_Boolean:
					JUMBLE_FIELD(val.boolval.boolval);
					break;
				case T_String:
					JUMBLE_STRING(val.sval.sval);
					break;
				case T_BitString:
					JUMBLE_STRING(val.bsval.bsval);
					break;
				default:
					elog(ERROR, "unrecognized node type: %d",
						 (int) nodeTag(&expr->val));
					break;
			}
		}
	}

	/*
	 * _jumbleRangeTblEntry
	 *		Jumble a RangeTblEntry node.
	 *
	 * The rtekind is always appended; after that only the fields listed for
	 * the entry's kind are jumbled.  An unknown kind raises an ERROR.
	 */
	static void
	_jumbleRangeTblEntry(JumbleState *jstate, Node *node)
	{
		RangeTblEntry *expr = (RangeTblEntry *) node;

		JUMBLE_FIELD(rtekind);
		switch (expr->rtekind)
		{
			case RTE_RELATION:
				JUMBLE_FIELD(relid);
				JUMBLE_NODE(tablesample);
				JUMBLE_FIELD(inh);
				break;
			case RTE_SUBQUERY:
				JUMBLE_NODE(subquery);
				break;
			case RTE_JOIN:
				JUMBLE_FIELD(jointype);
				break;
			case RTE_FUNCTION:
				JUMBLE_NODE(functions);
				break;
			case RTE_TABLEFUNC:
				JUMBLE_NODE(tablefunc);
				break;
			case RTE_VALUES:
				JUMBLE_NODE(values_lists);
				break;
			case RTE_CTE:

				/*
				 * Depending on the CTE name here isn't ideal, but it's the
				 * only info we have to identify the referenced WITH item.
				 */
				JUMBLE_STRING(ctename);
				JUMBLE_FIELD(ctelevelsup);
				break;
			case RTE_NAMEDTUPLESTORE:
				JUMBLE_STRING(enrname);
				break;
			case RTE_RESULT:
				/* nothing beyond the rtekind itself */
				break;
			default:
				elog(ERROR, "unrecognized RTE kind: %d", (int) expr->rtekind);
				break;
		}
	}