src/backend/regex/regprefix.c - cloudberry - Git at Google

 /*-------------------------------------------------------------------------
  *
  * regprefix.c
  *	  Extract a common prefix, if any, from a compiled regex.
  *
  *
  * Portions Copyright (c) 2012-2023, PostgreSQL Global Development Group
  * Portions Copyright (c) 1998, 1999 Henry Spencer
  *
  * IDENTIFICATION
  *	  src/backend/regex/regprefix.c
  *
  *-------------------------------------------------------------------------
  */

 #include "regex/regguts.h"


 /*
  * forward declarations
  */
 static int	findprefix(struct cnfa *cnfa, struct colormap *cm,
 					   chr *string, size_t *slength);


 /*
  * pg_regprefix - get common prefix for regular expression
  *
  * Returns one of:
  *	REG_NOMATCH: there is no common prefix of strings matching the regex
  *	REG_PREFIX: there is a common prefix of strings matching the regex
  *	REG_EXACT: all strings satisfying the regex must match the same string
  *	or a REG_XXX error code
  *
  * In the non-failure cases, *string is set to a palloc'd string containing
  * the common prefix or exact value, of length *slength (measured in chrs
  * not bytes!).
  *
  * This function does not analyze all complex cases (such as lookaround
  * constraints) exactly.  Therefore it is possible that some strings matching
  * the reported prefix or exact-match string do not satisfy the regex.  But
  * it should never be the case that a string satisfying the regex does not
  * match the reported prefix or exact-match string.
  */
 int
 pg_regprefix(regex_t *re,
 			 chr **string,
 			 size_t *slength)
 {
 	struct guts *g;
 	struct cnfa *cnfa;
 	int			st;

 	/* sanity checks */
 	if (string == NULL || slength == NULL)
 		return REG_INVARG;
 	*string = NULL;				/* initialize for failure cases */
 	*slength = 0;
 	if (re == NULL || re->re_magic != REMAGIC)
 		return REG_INVARG;
 	if (re->re_csize != sizeof(chr))
 		return REG_MIXED;

 	/* Initialize locale-dependent support */
 	pg_set_regex_collation(re->re_collation);

 	/* setup */
 	g = (struct guts *) re->re_guts;
 	if (g->info & REG_UIMPOSSIBLE)
 		return REG_NOMATCH;

 	/*
 	 * This implementation considers only the search NFA for the topmost regex
 	 * tree node.  Therefore, constraints such as backrefs are not fully
 	 * applied, which is allowed per the function's API spec.
 	 */
 	assert(g->tree != NULL);
 	cnfa = &g->tree->cnfa;

 	/* matchall NFAs never have a fixed prefix */
 	if (cnfa->flags & MATCHALL)
 		return REG_NOMATCH;

 	/*
 	 * Since a correct NFA should never contain any exit-free loops, it should
 	 * not be possible for our traversal to return to a previously visited NFA
 	 * state.  Hence we need at most nstates chrs in the output string.
 	 */
 	*string = (chr *) MALLOC(cnfa->nstates * sizeof(chr));
 	if (*string == NULL)
 		return REG_ESPACE;

 	/* do it */
 	st = findprefix(cnfa, &g->cmap, *string, slength);

 	assert(*slength <= cnfa->nstates);

 	/* clean up */
 	if (st != REG_PREFIX && st != REG_EXACT)
 	{
 		FREE(*string);
 		*string = NULL;
 		*slength = 0;
 	}

 	return st;
 }

 /*
  * findprefix - extract common prefix from cNFA
  *
  * Results are returned into the preallocated chr array string[], with
  * *slength (which must be preset to zero) incremented for each chr.
  */
 static int						/* regprefix return code */
 findprefix(struct cnfa *cnfa,
 		   struct colormap *cm,
 		   chr *string,
 		   size_t *slength)
 {
 	int			st;
 	int			nextst;
 	color		thiscolor;
 	chr			c;
 	struct carc *ca;

 	/*
 	 * The "pre" state must have only BOS/BOL outarcs, else pattern isn't
 	 * anchored left.  If we have both BOS and BOL, they must go to the same
 	 * next state.
 	 */
 	st = cnfa->pre;
 	nextst = -1;
 	for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
 	{
 		if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
 		{
 			if (nextst == -1)
 				nextst = ca->to;
 			else if (nextst != ca->to)
 				return REG_NOMATCH;
 		}
 		else
 			return REG_NOMATCH;
 	}
 	if (nextst == -1)
 		return REG_NOMATCH;

 	/*
 	 * Scan through successive states, stopping as soon as we find one with
 	 * more than one acceptable transition character (either multiple colors
 	 * on out-arcs, or a color with more than one member chr).
 	 *
 	 * We could find a state with multiple out-arcs that are all labeled with
 	 * the same singleton color; this comes from patterns like "^ab(cde|cxy)".
 	 * In that case we add the chr "c" to the output string but then exit the
 	 * loop with nextst == -1.  This leaves a little bit on the table: if the
 	 * pattern is like "^ab(cde|cdy)", we won't notice that "d" could be added
 	 * to the prefix.  But chasing multiple parallel state chains doesn't seem
 	 * worth the trouble.
 	 */
 	do
 	{
 		st = nextst;
 		nextst = -1;
 		thiscolor = COLORLESS;
 		for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
 		{
 			/* We can ignore BOS/BOL arcs */
 			if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
 				continue;

 			/*
 			 * ... but EOS/EOL arcs terminate the search, as do RAINBOW arcs
 			 * and LACONs
 			 */
 			if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1] ||
 				ca->co == RAINBOW || ca->co >= cnfa->ncolors)
 			{
 				thiscolor = COLORLESS;
 				break;
 			}
 			if (thiscolor == COLORLESS)
 			{
 				/* First plain outarc */
 				thiscolor = ca->co;
 				nextst = ca->to;
 			}
 			else if (thiscolor == ca->co)
 			{
 				/* Another plain outarc for same color */
 				nextst = -1;
 			}
 			else
 			{
 				/* More than one plain outarc color terminates the search */
 				thiscolor = COLORLESS;
 				break;
 			}
 		}
 		/* Done if we didn't find exactly one color on plain outarcs */
 		if (thiscolor == COLORLESS)
 			break;
 		/* The color must be a singleton */
 		if (cm->cd[thiscolor].nschrs != 1)
 			break;
 		/* Must not have any high-color-map entries */
 		if (cm->cd[thiscolor].nuchrs != 0)
 			break;

 		/*
 		 * Identify the color's sole member chr and add it to the prefix
 		 * string.  In general the colormap data structure doesn't provide a
 		 * way to find color member chrs, except by trying GETCOLOR() on each
 		 * possible chr value, which won't do at all.  However, for the cases
 		 * we care about it should be sufficient to test the "firstchr" value,
 		 * that is the first chr ever added to the color.  There are cases
 		 * where this might no longer be a member of the color (so we do need
 		 * to test), but none of them are likely to arise for a character that
 		 * is a member of a common prefix.  If we do hit such a corner case,
 		 * we just fall out without adding anything to the prefix string.
 		 */
 		c = cm->cd[thiscolor].firstchr;
 		if (GETCOLOR(cm, c) != thiscolor)
 			break;

 		string[(*slength)++] = c;

 		/* Advance to next state, but only if we have a unique next state */
 	} while (nextst != -1);

 	/*
 	 * If we ended at a state that only has EOS/EOL outarcs leading to the
 	 * "post" state, then we have an exact-match string.  Note this is true
 	 * even if the string is of zero length.
 	 */
 	nextst = -1;
 	for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
 	{
 		if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1])
 		{
 			if (nextst == -1)
 				nextst = ca->to;
 			else if (nextst != ca->to)
 			{
 				nextst = -1;
 				break;
 			}
 		}
 		else
 		{
 			nextst = -1;
 			break;
 		}
 	}
 	if (nextst == cnfa->post)
 		return REG_EXACT;

 	/*
 	 * Otherwise, if we were unable to identify any prefix characters, say
 	 * NOMATCH --- the pattern is anchored left, but doesn't specify any
 	 * particular first character.
 	 */
 	if (*slength > 0)
 		return REG_PREFIX;

 	return REG_NOMATCH;
 }
	/*-------------------------------------------------------------------------
	*
	* regprefix.c
	* Extract a common prefix, if any, from a compiled regex.
	*
	*
	* Portions Copyright (c) 2012-2023, PostgreSQL Global Development Group
	* Portions Copyright (c) 1998, 1999 Henry Spencer
	*
	* IDENTIFICATION
	* src/backend/regex/regprefix.c
	*
	*-------------------------------------------------------------------------
	*/

	#include "regex/regguts.h"


	/*
	* forward declarations
	*/
	static int findprefix(struct cnfa cnfa, struct colormap cm,
	chr string, size_t slength);


	/*
	* pg_regprefix - get common prefix for regular expression
	*
	* Returns one of:
	* REG_NOMATCH: there is no common prefix of strings matching the regex
	* REG_PREFIX: there is a common prefix of strings matching the regex
	* REG_EXACT: all strings satisfying the regex must match the same string
	* or a REG_XXX error code
	*
	* In the non-failure cases, *string is set to a palloc'd string containing
	* the common prefix or exact value, of length *slength (measured in chrs
	* not bytes!).
	*
	* This function does not analyze all complex cases (such as lookaround
	* constraints) exactly. Therefore it is possible that some strings matching
	* the reported prefix or exact-match string do not satisfy the regex. But
	* it should never be the case that a string satisfying the regex does not
	* match the reported prefix or exact-match string.
	*/
	int
	pg_regprefix(regex_t *re,
	chr **string,
	size_t *slength)
	{
	struct guts *g;
	struct cnfa *cnfa;
	int st;

	/* sanity checks */
	if (string == NULL \|\| slength == NULL)
	return REG_INVARG;
	string = NULL; / initialize for failure cases */
	*slength = 0;
	if (re == NULL \|\| re->re_magic != REMAGIC)
	return REG_INVARG;
	if (re->re_csize != sizeof(chr))
	return REG_MIXED;

	/* Initialize locale-dependent support */
	pg_set_regex_collation(re->re_collation);

	/* setup */
	g = (struct guts *) re->re_guts;
	if (g->info & REG_UIMPOSSIBLE)
	return REG_NOMATCH;

	/*
	* This implementation considers only the search NFA for the topmost regex
	* tree node. Therefore, constraints such as backrefs are not fully
	* applied, which is allowed per the function's API spec.
	*/
	assert(g->tree != NULL);
	cnfa = &g->tree->cnfa;

	/* matchall NFAs never have a fixed prefix */
	if (cnfa->flags & MATCHALL)
	return REG_NOMATCH;

	/*
	* Since a correct NFA should never contain any exit-free loops, it should
	* not be possible for our traversal to return to a previously visited NFA
	* state. Hence we need at most nstates chrs in the output string.
	*/
	string = (chr ) MALLOC(cnfa->nstates * sizeof(chr));
	if (*string == NULL)
	return REG_ESPACE;

	/* do it */
	st = findprefix(cnfa, &g->cmap, *string, slength);

	assert(*slength <= cnfa->nstates);

	/* clean up */
	if (st != REG_PREFIX && st != REG_EXACT)
	{
	FREE(*string);
	*string = NULL;
	*slength = 0;
	}

	return st;
	}

	/*
	* findprefix - extract common prefix from cNFA
	*
	* Results are returned into the preallocated chr array string[], with
	* *slength (which must be preset to zero) incremented for each chr.
	*/
	static int /* regprefix return code */
	findprefix(struct cnfa *cnfa,
	struct colormap *cm,
	chr *string,
	size_t *slength)
	{
	int st;
	int nextst;
	color thiscolor;
	chr c;
	struct carc *ca;

	/*
	* The "pre" state must have only BOS/BOL outarcs, else pattern isn't
	* anchored left. If we have both BOS and BOL, they must go to the same
	* next state.
	*/
	st = cnfa->pre;
	nextst = -1;
	for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
	{
	if (ca->co == cnfa->bos[0] \|\| ca->co == cnfa->bos[1])
	{
	if (nextst == -1)
	nextst = ca->to;
	else if (nextst != ca->to)
	return REG_NOMATCH;
	}
	else
	return REG_NOMATCH;
	}
	if (nextst == -1)
	return REG_NOMATCH;

	/*
	* Scan through successive states, stopping as soon as we find one with
	* more than one acceptable transition character (either multiple colors
	* on out-arcs, or a color with more than one member chr).
	*
	* We could find a state with multiple out-arcs that are all labeled with
	* the same singleton color; this comes from patterns like "^ab(cde\|cxy)".
	* In that case we add the chr "c" to the output string but then exit the
	* loop with nextst == -1. This leaves a little bit on the table: if the
	* pattern is like "^ab(cde\|cdy)", we won't notice that "d" could be added
	* to the prefix. But chasing multiple parallel state chains doesn't seem
	* worth the trouble.
	*/
	do
	{
	st = nextst;
	nextst = -1;
	thiscolor = COLORLESS;
	for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
	{
	/* We can ignore BOS/BOL arcs */
	if (ca->co == cnfa->bos[0] \|\| ca->co == cnfa->bos[1])
	continue;

	/*
	* ... but EOS/EOL arcs terminate the search, as do RAINBOW arcs
	* and LACONs
	*/
	if (ca->co == cnfa->eos[0] \|\| ca->co == cnfa->eos[1] \|\|
	ca->co == RAINBOW \|\| ca->co >= cnfa->ncolors)
	{
	thiscolor = COLORLESS;
	break;
	}
	if (thiscolor == COLORLESS)
	{
	/* First plain outarc */
	thiscolor = ca->co;
	nextst = ca->to;
	}
	else if (thiscolor == ca->co)
	{
	/* Another plain outarc for same color */
	nextst = -1;
	}
	else
	{
	/* More than one plain outarc color terminates the search */
	thiscolor = COLORLESS;
	break;
	}
	}
	/* Done if we didn't find exactly one color on plain outarcs */
	if (thiscolor == COLORLESS)
	break;
	/* The color must be a singleton */
	if (cm->cd[thiscolor].nschrs != 1)
	break;
	/* Must not have any high-color-map entries */
	if (cm->cd[thiscolor].nuchrs != 0)
	break;

	/*
	* Identify the color's sole member chr and add it to the prefix
	* string. In general the colormap data structure doesn't provide a
	* way to find color member chrs, except by trying GETCOLOR() on each
	* possible chr value, which won't do at all. However, for the cases
	* we care about it should be sufficient to test the "firstchr" value,
	* that is the first chr ever added to the color. There are cases
	* where this might no longer be a member of the color (so we do need
	* to test), but none of them are likely to arise for a character that
	* is a member of a common prefix. If we do hit such a corner case,
	* we just fall out without adding anything to the prefix string.
	*/
	c = cm->cd[thiscolor].firstchr;
	if (GETCOLOR(cm, c) != thiscolor)
	break;

	string[(*slength)++] = c;

	/* Advance to next state, but only if we have a unique next state */
	} while (nextst != -1);

	/*
	* If we ended at a state that only has EOS/EOL outarcs leading to the
	* "post" state, then we have an exact-match string. Note this is true
	* even if the string is of zero length.
	*/
	nextst = -1;
	for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
	{
	if (ca->co == cnfa->eos[0] \|\| ca->co == cnfa->eos[1])
	{
	if (nextst == -1)
	nextst = ca->to;
	else if (nextst != ca->to)
	{
	nextst = -1;
	break;
	}
	}
	else
	{
	nextst = -1;
	break;
	}
	}
	if (nextst == cnfa->post)
	return REG_EXACT;

	/*
	* Otherwise, if we were unable to identify any prefix characters, say
	* NOMATCH --- the pattern is anchored left, but doesn't specify any
	* particular first character.
	*/
	if (*slength > 0)
	return REG_PREFIX;

	return REG_NOMATCH;
	}