/* gpcontrib/orafce/regexp.c - cloudberry */

 #include "postgres.h"

 #include "catalog/pg_type.h"
 #include "funcapi.h"
 #include "miscadmin.h"
 #include "regex/regex.h"
 #include "utils/array.h"
 #include "utils/builtins.h"
 #include "utils/memutils.h"

 #if PG_VERSION_NUM >= 150000

 #include "utils/varlena.h"

 #endif

 #include "orafce.h"
 #include "builtins.h"

 /*
  * All the options of interest for regex functions.  parse_re_flags() fills
  * one of these in from the user-supplied option letters.
  */
 typedef struct pg_re_flags
 {
 	int			cflags;			/* compile flags for Spencer's regex code */
 	bool		glob;			/* do it globally (for each occurrence) */
 } pg_re_flags;

 /*
  * Cross-call state for regexp_match and regexp_split functions.
  *
  * setup_regexp_matches() allocates this with palloc0() and fills in the
  * match data; any field it does not set explicitly therefore starts out
  * zeroed.
  */
 typedef struct regexp_matches_ctx
 {
 	text	   *orig_str;		/* data string in original TEXT form */
 	int			nmatches;		/* number of places where pattern matched */
 	int			npatterns;		/* number of capturing subpatterns (1 when
 								 * only the whole match is recorded) */
 	/* We store start char index and end+1 char index for each match */
 	/* so the number of entries in match_locs is nmatches * npatterns * 2 */
 	/* (followed by one extra entry holding the string length in characters) */
 	int		   *match_locs;		/* 0-based character indexes */
 	int			next_match;		/* 0-based index of next match to process */
 	/* workspace for build_regexp_match_result() */
 	Datum	   *elems;			/* has npatterns elements */
 	bool	   *nulls;			/* has npatterns elements */
 	pg_wchar   *wide_str;		/* wide-char version of original string; NULL
 								 * in single-byte encodings */
 	char	   *conv_buf;		/* conversion buffer, if needed */
 	int			conv_bufsiz;	/* size thereof */
 } regexp_matches_ctx;

 /*
  * Backport code from PostgreSQL 15
  */

 /*
  * Declare the SQL-callable entry points implemented in this file.  Several
  * of them (the *_no_* variants) simply forward to one shared implementation,
  * which inspects PG_NARGS() to see which optional arguments were supplied.
  */
 PG_FUNCTION_INFO_V1(orafce_regexp_instr);
 PG_FUNCTION_INFO_V1(orafce_regexp_instr_no_start);
 PG_FUNCTION_INFO_V1(orafce_regexp_instr_no_n);
 PG_FUNCTION_INFO_V1(orafce_regexp_instr_no_endoption);
 PG_FUNCTION_INFO_V1(orafce_regexp_instr_no_flags);
 PG_FUNCTION_INFO_V1(orafce_regexp_instr_no_subexpr);
 PG_FUNCTION_INFO_V1(orafce_textregexreplace_noopt);
 PG_FUNCTION_INFO_V1(orafce_textregexreplace);
 PG_FUNCTION_INFO_V1(orafce_textregexreplace_extended);
 PG_FUNCTION_INFO_V1(orafce_textregexreplace_extended_no_n);
 PG_FUNCTION_INFO_V1(orafce_textregexreplace_extended_no_flags);

 #if PG_VERSION_NUM <  120000


 /*
  * This is the maximum number of cached regular expressions.  It can be
  * overridden at build time by defining MAX_CACHED_RES beforehand.
  */
 #ifndef MAX_CACHED_RES
 #define MAX_CACHED_RES	32
 #endif

 /*
  * This structure describes one cached regular expression.  An entry is
  * identified by the pattern bytes together with the compile flags and the
  * collation (see the lookup loop in RE_compile_and_cache).
  */
 typedef struct cached_re_str
 {
 	char	   *cre_pat;		/* original RE (not null terminated!);
 								 * malloc'd, released with free() on eviction */
 	int			cre_pat_len;	/* length of original RE, in bytes */
 	int			cre_flags;		/* compile flags: extended,icase etc */
 	Oid			cre_collation;	/* collation to use */
 	regex_t		cre_re;			/* the compiled regular expression */
 } cached_re_str;

 /* The cache itself: entries [0, num_res) are valid, most recently used first */
 static int	num_res = 0;		/* # of cached re's */
 static cached_re_str re_array[MAX_CACHED_RES];	/* cached re's */


 /*
  * RE_compile_and_cache - fetch a compiled RE, compiling it on a cache miss
  *
  * Returns regex_t *
  *
  *	text_re --- the pattern, as a TEXT object in the database encoding
  *	cflags --- compile options for the pattern
  *	collation --- collation to use for LC_CTYPE-dependent behavior
  *
  * The cache is a small array kept in most-recently-used order: a hit is
  * rotated to slot 0, and a newly compiled pattern is inserted at slot 0
  * after evicting the last entry if the array is full.  The result always
  * points into slot 0, so a later call may move or free it.
  *
  * The pattern is converted to an array of pg_wchar before compiling, since
  * that is what Spencer's regex package wants.  An ERROR is raised if the
  * pattern does not compile or if the pattern copy cannot be allocated.
  */
 static regex_t *
 RE_compile_and_cache(text *text_re, int cflags, Oid collation)
 {
 	char	   *pat_bytes = VARDATA_ANY(text_re);
 	int			pat_nbytes = VARSIZE_ANY_EXHDR(text_re);
 	cached_re_str fresh;
 	pg_wchar   *wide_pat;
 	int			wide_len;
 	int			rc;
 	int			slot;

 	/*
 	 * Scan from the front; frequently used entries live there because every
 	 * hit is moved to slot 0.
 	 */
 	for (slot = 0; slot < num_res; slot++)
 	{
 		cached_re_str *cand = &re_array[slot];

 		if (cand->cre_pat_len != pat_nbytes ||
 			cand->cre_flags != cflags ||
 			cand->cre_collation != collation ||
 			memcmp(cand->cre_pat, pat_bytes, pat_nbytes) != 0)
 			continue;

 		/* Cache hit: rotate the entry to the front unless it is there. */
 		if (slot > 0)
 		{
 			fresh = *cand;
 			memmove(&re_array[1], &re_array[0], slot * sizeof(cached_re_str));
 			re_array[0] = fresh;
 		}

 		return &re_array[0].cre_re;
 	}

 	/*
 	 * Cache miss.  Build the new entry in the local "fresh" so that a
 	 * failure part-way through leaves the cache array untouched.
 	 */
 	wide_pat = (pg_wchar *) palloc((pat_nbytes + 1) * sizeof(pg_wchar));
 	wide_len = pg_mb2wchar_with_len(pat_bytes, wide_pat, pat_nbytes);

 	rc = pg_regcomp(&fresh.cre_re, wide_pat, wide_len, cflags, collation);

 	pfree(wide_pat);

 	if (rc != REG_OKAY)
 	{
 		char		errbuf[100];

 		/*
 		 * Compilation failed, so there is nothing to pg_regfree.  Check for
 		 * interrupts first: if the regex library bailed out with REG_CANCEL
 		 * we must not report the pattern as invalid.
 		 */
 		CHECK_FOR_INTERRUPTS();

 		pg_regerror(rc, &fresh.cre_re, errbuf, sizeof(errbuf));
 		ereport(ERROR,
 				(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
 				 errmsg("invalid regular expression: %s", errbuf)));
 	}

 	/*
 	 * The pattern copy is malloc'd rather than palloc'd: it must survive
 	 * across transactions, and we want control back on out-of-memory.
 	 * Max() guards against malloc(0) returning NULL on some platforms.
 	 */
 	fresh.cre_pat = malloc(Max(pat_nbytes, 1));
 	if (fresh.cre_pat == NULL)
 	{
 		pg_regfree(&fresh.cre_re);
 		ereport(ERROR,
 				(errcode(ERRCODE_OUT_OF_MEMORY),
 				 errmsg("out of memory")));
 	}
 	memcpy(fresh.cre_pat, pat_bytes, pat_nbytes);
 	fresh.cre_pat_len = pat_nbytes;
 	fresh.cre_flags = cflags;
 	fresh.cre_collation = collation;

 	/* Make room by dropping the last entry when the cache is full. */
 	if (num_res >= MAX_CACHED_RES)
 	{
 		cached_re_str *victim;

 		--num_res;
 		Assert(num_res < MAX_CACHED_RES);
 		victim = &re_array[num_res];
 		pg_regfree(&victim->cre_re);
 		free(victim->cre_pat);
 	}

 	/* Shift the survivors down one slot and install the new entry first. */
 	if (num_res > 0)
 		memmove(&re_array[1], &re_array[0], num_res * sizeof(cached_re_str));

 	re_array[0] = fresh;
 	num_res++;

 	return &re_array[0].cre_re;
 }

 #endif

 #if PG_VERSION_NUM <  150000

 /*
  * check_replace_text_has_escape
  *
  * Classify the backslash usage in replace_text:
  *	0 --- no backslash that needs processing
  *	1 --- backslash sequences present, but none is a submatch specifier
  *	2 --- at least one regexp submatch specifier (\1 .. \9) present
  */
 static int
 check_replace_text_has_escape(const text *replace_text)
 {
 	const char *cur = VARDATA_ANY(replace_text);
 	const char *end = cur + VARSIZE_ANY_EXHDR(replace_text);
 	int			status = 0;

 	while (cur < end)
 	{
 		const char *backslash = memchr(cur, '\\', end - cur);

 		if (backslash == NULL)
 			break;				/* no further escapes */

 		/* Step onto the character following the backslash. */
 		cur = backslash + 1;

 		/* A backslash at the very end needs no extra processing. */
 		if (cur >= end)
 			break;

 		if (*cur >= '1' && *cur <= '9')
 			return 2;			/* submatch specifier: nothing stronger exists */

 		/* Some other escape sequence; remember it and keep scanning. */
 		status = 1;
 		cur++;
 	}

 	return status;
 }

 /*
  * charlen_to_bytelen()
  *	Return how many bytes the first n characters at *p occupy.
  *
  * The caller must guarantee that n characters really are available; the
  * string does not have to be null-terminated.
  */
 static int
 charlen_to_bytelen(const char *p, int n)
 {
 	const char *cursor = p;

 	/* In a single-byte encoding characters and bytes coincide. */
 	if (pg_database_encoding_max_length() == 1)
 		return n;

 	/* Otherwise walk the string one multibyte character at a time. */
 	while (n > 0)
 	{
 		cursor += pg_mblen(cursor);
 		n--;
 	}

 	return cursor - p;
 }

 /*
  * appendStringInfoText
  *
  * Append the payload bytes of the text value t to str, without going
  * through an intermediate C string.
  */
 static void
 appendStringInfoText(StringInfo str, const text *t)
 {
 	const char *payload = VARDATA_ANY(t);
 	int			payload_len = VARSIZE_ANY_EXHDR(t);

 	appendBinaryStringInfo(str, payload, payload_len);
 }

 /*
  * appendStringInfoRegexpSubstr
  *
  * Append replace_text to str, expanding its backslash escapes:
  *	\1 .. \9 --- text of the corresponding parenthesized submatch
  *	\& --- text of the whole match
  *	\\ --- a single backslash
  * Any other backslash is copied as ordinary data.
  *
  * pmatch holds the match locations as character offsets into the source
  * string; start_ptr points at the source character with logical position
  * data_pos, which must not lie beyond any submatch that gets expanded.
  */
 static void
 appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
 							 regmatch_t *pmatch,
 							 char *start_ptr, int data_pos)
 {
 	const char *cur = VARDATA_ANY(replace_text);
 	const char *end = cur + VARSIZE_ANY_EXHDR(replace_text);

 	while (cur < end)
 	{
 		const char *literal = cur;
 		regmatch_t *ref;
 		char	   *sub_start;
 		int			sub_len;
 		int			so;
 		int			eo;

 		/* Locate the next backslash; without one, the rest is literal. */
 		cur = memchr(cur, '\\', end - cur);
 		if (cur == NULL)
 			cur = end;

 		/* Emit the literal run preceding the backslash, if any. */
 		if (cur > literal)
 			appendBinaryStringInfo(str, literal, cur - literal);

 		if (cur >= end)
 			break;				/* consumed everything */

 		/* Skip the backslash itself. */
 		cur++;

 		if (cur >= end)
 		{
 			/* Lone backslash at the very end: copy it like unknown input. */
 			appendStringInfoChar(str, '\\');
 			break;
 		}

 		if (*cur == '\\')
 		{
 			/* \\ yields exactly one backslash. */
 			appendStringInfoChar(str, '\\');
 			cur++;
 			continue;
 		}

 		if (*cur == '&')
 		{
 			/* The entire matched string. */
 			ref = &pmatch[0];
 		}
 		else if (*cur >= '1' && *cur <= '9')
 		{
 			/* A numbered back reference. */
 			ref = &pmatch[*cur - '0'];
 		}
 		else
 		{
 			/*
 			 * Unrecognized escape: output the backslash and let the next
 			 * iteration copy the following character as plain data.
 			 */
 			appendStringInfoChar(str, '\\');
 			continue;
 		}

 		so = ref->rm_so;
 		eo = ref->rm_eo;
 		cur++;

 		/* A submatch that did not participate has nothing to copy. */
 		if (so < 0 || eo < 0)
 			continue;

 		/*
 		 * so and eo count characters, not bytes, so translate them relative
 		 * to start_ptr before copying.
 		 */
 		Assert(so >= data_pos);
 		sub_start = start_ptr;
 		sub_start += charlen_to_bytelen(sub_start, so - data_pos);
 		sub_len = charlen_to_bytelen(sub_start, eo - so);
 		appendBinaryStringInfo(str, sub_start, sub_len);
 	}
 }

 /*
  * orafce_replace_text_regexp
  *
  * Replace substring(s) in src_text that match pattern_text with
  * replace_text.  The replace_text can contain backslash markers to
  * substitute (parts of) the matched text.
  *
  *	src_text --- the string to search
  *	pattern_text --- the regular expression
  *	replace_text --- the replacement, possibly with \1..\9, \& and \\ escapes
  *	cflags --- regexp compile flags
  *	collation --- collation to use
  *	search_start --- the character (not byte) offset in src_text at which to
  *		begin searching; text before it is copied through unchanged
  *	n --- if 0, replace all matches; if > 0, replace only the N'th match
  *
  * Returns a new text value.  Raises ERROR if the pattern is invalid or the
  * regex engine fails while matching.
  */
 static text *
 orafce_replace_text_regexp(text *src_text, text *pattern_text,
 					text *replace_text,
 					int cflags, Oid collation,
 					int search_start, int n)
 {
 	text	   *ret_text;
 	regex_t    *re;
 	int			src_text_len = VARSIZE_ANY_EXHDR(src_text);
 	int			nmatches = 0;
 	StringInfoData buf;
 	regmatch_t	pmatch[10];		/* main match, plus \1 to \9 */
 	pg_wchar   *data;
 	int			data_len;		/* length of data, in characters */
 	int			data_pos;		/* character position matching start_ptr */
 	char	   *start_ptr;
 	int			escape_status;

 	initStringInfo(&buf);

 	/* Convert data string to wide characters. */
 	data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
 	data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);

 	/* Check whether replace_text has escapes, especially regexp submatches. */
 	escape_status = check_replace_text_has_escape(replace_text);

 	/*
 	 * This function is only compiled for servers older than PostgreSQL 15,
 	 * where REG_NOSUB doesn't work well.  So we never set REG_NOSUB here and
 	 * always ask pg_regexec for all the submatch locations, even when
 	 * replace_text contains no submatch specifiers.
 	 */

 	/* Prepare the regexp. */
 	re = RE_compile_and_cache(pattern_text, cflags, collation);

 	/* start_ptr points to the data_pos'th character of src_text */
 	start_ptr = (char *) VARDATA_ANY(src_text);
 	data_pos = 0;

 	while (search_start <= data_len)
 	{
 		int			regexec_result;

 		CHECK_FOR_INTERRUPTS();

 		regexec_result = pg_regexec(re,
 									data,
 									data_len,
 									search_start,
 									NULL,	/* no details */
 									lengthof(pmatch),
 									pmatch,
 									0);

 		if (regexec_result == REG_NOMATCH)
 			break;

 		if (regexec_result != REG_OKAY)
 		{
 			char		errMsg[100];

 			/* Don't blame the pattern if the engine was merely cancelled. */
 			CHECK_FOR_INTERRUPTS();
 			pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
 			ereport(ERROR,
 					(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
 					 errmsg("regular expression failed: %s", errMsg)));
 		}

 		/*
 		 * Count matches, and decide whether to replace this match.
 		 */
 		nmatches++;
 		if (n > 0 && nmatches != n)
 		{
 			/*
 			 * No, so advance search_start, but not start_ptr/data_pos. (Thus,
 			 * we treat the matched text as if it weren't matched, and copy it
 			 * to the output later.)
 			 */
 			search_start = pmatch[0].rm_eo;
 			if (pmatch[0].rm_so == pmatch[0].rm_eo)
 				search_start++;
 			continue;
 		}

 		/*
 		 * Copy the text to the left of the match position.  Note we are given
 		 * character not byte indexes.
 		 */
 		if (pmatch[0].rm_so > data_pos)
 		{
 			int			chunk_len;

 			chunk_len = charlen_to_bytelen(start_ptr,
 										   pmatch[0].rm_so - data_pos);
 			appendBinaryStringInfo(&buf, start_ptr, chunk_len);

 			/*
 			 * Advance start_ptr over that text, to avoid multiple rescans of
 			 * it if the replace_text contains multiple back-references.
 			 */
 			start_ptr += chunk_len;
 			data_pos = pmatch[0].rm_so;
 		}

 		/*
 		 * Copy the replace_text, processing escapes if any are present.
 		 */
 		if (escape_status > 0)
 			appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
 										 start_ptr, data_pos);
 		else
 			appendStringInfoText(&buf, replace_text);

 		/* Advance start_ptr and data_pos over the matched text. */
 		start_ptr += charlen_to_bytelen(start_ptr,
 										pmatch[0].rm_eo - data_pos);
 		data_pos = pmatch[0].rm_eo;

 		/*
 		 * If we only want to replace one occurrence, we're done.
 		 */
 		if (n > 0)
 			break;

 		/*
 		 * Advance search position.  Normally we start the next search at the
 		 * end of the previous match; but if the match was of zero length, we
 		 * have to advance by one character, or we'd just find the same match
 		 * again.
 		 */
 		search_start = data_pos;
 		if (pmatch[0].rm_so == pmatch[0].rm_eo)
 			search_start++;
 	}

 	/*
 	 * Copy the text to the right of the last match.
 	 */
 	if (data_pos < data_len)
 	{
 		int			chunk_len;

 		chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
 		appendBinaryStringInfo(&buf, start_ptr, chunk_len);
 	}

 	ret_text = cstring_to_text_with_len(buf.data, buf.len);
 	pfree(buf.data);
 	pfree(data);

 	return ret_text;
 }

 #else

 #define orafce_replace_text_regexp replace_text_regexp

 #endif

 /*
  * RE_wchar_execute - run a compiled RE against pg_wchar data
  *
  * Returns true on match, false on no match; any other outcome of the regex
  * engine is reported as an ERROR.
  *
  *	re --- the compiled pattern as returned by RE_compile_and_cache
  *	data --- the data to match against (need not be null-terminated)
  *	data_len --- the length of the data string
  *	start_search -- the offset in the data to start searching
  *	nmatch, pmatch	--- optional return area for match details
  *
  * Data is given as array of pg_wchar which is what Spencer's regex package
  * wants.
  */
 static bool
 RE_wchar_execute(regex_t *re, pg_wchar *data, int data_len,
 				 int start_search, int nmatch, regmatch_t *pmatch)
 {
 	char		errbuf[100];
 	int			rc;

 	rc = pg_regexec(re, data, data_len, start_search,
 					NULL,		/* no details */
 					nmatch, pmatch, 0);

 	if (rc == REG_OKAY)
 		return true;
 	if (rc == REG_NOMATCH)
 		return false;

 	/*
 	 * The engine itself failed.  Service interrupts first so that a
 	 * cancelled search is not reported as a regex failure.
 	 */
 	CHECK_FOR_INTERRUPTS();
 	pg_regerror(rc, re, errbuf, sizeof(errbuf));
 	ereport(ERROR,
 			(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
 			 errmsg("regular expression failed: %s", errbuf)));

 	return false;				/* keep compiler quiet */
 }


 /*
  * setup_regexp_matches --- do the initial matching for regexp_match,
  *		regexp_split, and related functions
  *
  * To avoid having to re-find the compiled pattern on each call, we do
  * all the matching in one swoop.  The returned regexp_matches_ctx contains
  * the locations of all the substrings matching the pattern.
  *
  * orig_str: the string to search; the pointer is stored in the result.
  * pattern: the regular expression, as TEXT.
  * re_flags: parsed options; cflags is used for compilation, and glob says
  * whether to keep searching after the first match.
  * start_search: the character (not byte) offset in orig_str at which to
  * begin the search.  Returned positions are relative to orig_str anyway.
  * collation: collation passed on to the regex compiler.
  * use_subpatterns: collect data about matches to parenthesized subexpressions.
  * ignore_degenerate: ignore zero-length matches.
  * fetching_unmatched: caller wants to fetch unmatched substrings.
  *
  * The result is palloc'd.  Its match_locs array holds a start/end pair for
  * each recorded (sub)match, followed by one final entry containing the
  * length of the string in characters.
  *
  * We don't currently assume that fetching_unmatched is exclusive of fetching
  * the matched text too; if it's set, the conversion buffer is large enough to
  * fetch any single matched or unmatched string, but not any larger
  * substring.  (In practice, when splitting the matches are usually small
  * anyway, and it didn't seem worth complicating the code further.)
  */
 static regexp_matches_ctx *
 setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
 					 int start_search,
 					 Oid collation,
 					 bool use_subpatterns,
 					 bool ignore_degenerate,
 					 bool fetching_unmatched)
 {
 	regexp_matches_ctx *matchctx = palloc0(sizeof(regexp_matches_ctx));
 	int			eml = pg_database_encoding_max_length();
 	int			orig_len;
 	pg_wchar   *wide_str;
 	int			wide_len;
 	regex_t    *cpattern;
 	regmatch_t *pmatch;
 	int			pmatch_len;
 	int			array_len;
 	int			array_idx;
 	int			prev_match_end;
 	int			prev_valid_match_end;
 	int			maxlen = 0;		/* largest fetch length in characters */
 	int			cflags;

 	/* save original string --- we'll extract result substrings from it */
 	matchctx->orig_str = orig_str;

 	/* convert string to pg_wchar form for matching */
 	orig_len = VARSIZE_ANY_EXHDR(orig_str);
 	wide_str = (pg_wchar *) palloc(sizeof(pg_wchar) * (orig_len + 1));
 	wide_len = pg_mb2wchar_with_len(VARDATA_ANY(orig_str), wide_str, orig_len);

 	/* set up the compiled pattern */
 	cflags = re_flags->cflags;

 #if PG_VERSION_NUM >=  150000

 	/* REG_NOSUB doesn't work well in pre PostgreSQL 15 */

 	/* so only request it there, and only if submatches are not wanted */
 	if (!use_subpatterns)
 		cflags |= REG_NOSUB;

 #endif

 	cpattern = RE_compile_and_cache(pattern, cflags, collation);

 	/* do we want to remember subpatterns? */
 	if (use_subpatterns && cpattern->re_nsub > 0)
 	{
 		matchctx->npatterns = cpattern->re_nsub;
 		pmatch_len = cpattern->re_nsub + 1;
 	}
 	else
 	{
 		/* record just the whole match, as a single "pattern" */
 		use_subpatterns = false;
 		matchctx->npatterns = 1;
 		pmatch_len = 1;
 	}

 	/* temporary output space for RE package */
 	pmatch = palloc(sizeof(regmatch_t) * pmatch_len);

 	/*
 	 * the real output space (grown dynamically if needed)
 	 *
 	 * use values 2^n-1, not 2^n, so that we hit the limit at 2^28-1 rather
 	 * than at 2^27
 	 */
 	array_len = re_flags->glob ? 255 : 31;
 	matchctx->match_locs = (int *) palloc(sizeof(int) * array_len);
 	array_idx = 0;

 	/* search for the pattern, perhaps repeatedly */
 	prev_match_end = 0;
 	prev_valid_match_end = 0;
 	while (RE_wchar_execute(cpattern, wide_str, wide_len, start_search,
 							pmatch_len, pmatch))
 	{
 		/*
 		 * If requested, ignore degenerate matches, which are zero-length
 		 * matches occurring at the start or end of a string or just after a
 		 * previous match.
 		 */
 		if (!ignore_degenerate ||
 			(pmatch[0].rm_so < wide_len &&
 			 pmatch[0].rm_eo > prev_match_end))
 		{
 			/*
 			 * enlarge output space if needed; the "+ 1" reserves room for
 			 * the end-of-string entry stored after the loop
 			 */
 			while (array_idx + matchctx->npatterns * 2 + 1 > array_len)
 			{
 				array_len += array_len + 1; /* 2^n-1 => 2^(n+1)-1 */
 				if (array_len > (int) (MaxAllocSize / sizeof(int)))
 					ereport(ERROR,
 							(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 							 errmsg("too many regular expression matches")));
 				matchctx->match_locs = (int *) repalloc(matchctx->match_locs,
 														sizeof(int) * array_len);
 			}

 			/* save this match's locations */
 			if (use_subpatterns)
 			{
 				int			i;

 				/* pmatch[0] is the whole match; subexpressions start at 1 */
 				for (i = 1; i <= matchctx->npatterns; i++)
 				{
 					int			so = pmatch[i].rm_so;
 					int			eo = pmatch[i].rm_eo;

 					matchctx->match_locs[array_idx++] = so;
 					matchctx->match_locs[array_idx++] = eo;
 					if (so >= 0 && eo >= 0 && (eo - so) > maxlen)
 						maxlen = (eo - so);
 				}
 			}
 			else
 			{
 				int			so = pmatch[0].rm_so;
 				int			eo = pmatch[0].rm_eo;

 				matchctx->match_locs[array_idx++] = so;
 				matchctx->match_locs[array_idx++] = eo;
 				if (so >= 0 && eo >= 0 && (eo - so) > maxlen)
 					maxlen = (eo - so);
 			}
 			matchctx->nmatches++;

 			/*
 			 * check length of unmatched portion between end of previous valid
 			 * (nondegenerate, or degenerate but not ignored) match and start
 			 * of current one
 			 */
 			if (fetching_unmatched &&
 				pmatch[0].rm_so >= 0 &&
 				(pmatch[0].rm_so - prev_valid_match_end) > maxlen)
 				maxlen = (pmatch[0].rm_so - prev_valid_match_end);
 			prev_valid_match_end = pmatch[0].rm_eo;
 		}
 		prev_match_end = pmatch[0].rm_eo;

 		/* if not glob, stop after one match */
 		if (!re_flags->glob)
 			break;

 		/*
 		 * Advance search position.  Normally we start the next search at the
 		 * end of the previous match; but if the match was of zero length, we
 		 * have to advance by one character, or we'd just find the same match
 		 * again.
 		 */
 		start_search = prev_match_end;
 		if (pmatch[0].rm_so == pmatch[0].rm_eo)
 			start_search++;
 		if (start_search > wide_len)
 			break;
 	}

 	/*
 	 * check length of unmatched portion between end of last match and end of
 	 * input string
 	 */
 	if (fetching_unmatched &&
 		(wide_len - prev_valid_match_end) > maxlen)
 		maxlen = (wide_len - prev_valid_match_end);

 	/*
 	 * Keep a note of the end position of the string for the benefit of
 	 * splitting code.
 	 */
 	matchctx->match_locs[array_idx] = wide_len;

 	if (eml > 1)
 	{
 		/* multibyte encoding: keep wide_str and set up a conversion buffer */
 		int64		maxsiz = eml * (int64) maxlen;
 		int			conv_bufsiz;

 		/*
 		 * Make the conversion buffer large enough for any substring of
 		 * interest.
 		 *
 		 * Worst case: assume we need the maximum size (maxlen*eml), but take
 		 * advantage of the fact that the original string length in bytes is
 		 * an upper bound on the byte length of any fetched substring (and we
 		 * know that len+1 is safe to allocate because the varlena header is
 		 * longer than 1 byte).
 		 */
 		if (maxsiz > orig_len)
 			conv_bufsiz = orig_len + 1;
 		else
 			conv_bufsiz = maxsiz + 1;	/* safe since maxsiz < 2^30 */

 		matchctx->conv_buf = palloc(conv_bufsiz);
 		matchctx->conv_bufsiz = conv_bufsiz;
 		matchctx->wide_str = wide_str;
 	}
 	else
 	{
 		/* No need to keep the wide string if we're in a single-byte charset. */
 		pfree(wide_str);
 		matchctx->wide_str = NULL;
 		matchctx->conv_buf = NULL;
 		matchctx->conv_bufsiz = 0;
 	}

 	/* Clean up temp storage */
 	pfree(pmatch);

 	return matchctx;
 }

 /*
  * parse_re_flags - decode the options argument of regexp_match and friends
  *
  *	flags --- output argument, filled with desired options
  *	opts --- TEXT object, or NULL for defaults
  *
  * The defaults are REG_ADVANCED compile flags and no global matching.
  * Every option letter known to any caller is accepted here; a caller that
  * does not support some option has to reject it afterwards.  An unknown
  * letter raises an ERROR.
  */
 static void
 parse_re_flags(pg_re_flags *flags, text *opts)
 {
 	char	   *cur;
 	char	   *end;

 	/* regex flavor is always folded into the compile flags */
 	flags->cflags = REG_ADVANCED;
 	flags->glob = false;

 	if (opts == NULL)
 		return;					/* defaults only */

 	cur = VARDATA_ANY(opts);
 	end = cur + VARSIZE_ANY_EXHDR(opts);

 	/* Option letters are applied left to right, so later ones win. */
 	for (; cur < end; cur++)
 	{
 		switch (*cur)
 		{
 			case 'g':			/* match globally */
 				flags->glob = true;
 				break;
 			case 'b':			/* BREs (but why???) */
 				flags->cflags &= ~(REG_ADVANCED | REG_EXTENDED | REG_QUOTE);
 				break;
 			case 'c':			/* case sensitive */
 				flags->cflags &= ~REG_ICASE;
 				break;
 			case 'e':			/* plain EREs */
 				flags->cflags |= REG_EXTENDED;
 				flags->cflags &= ~(REG_ADVANCED | REG_QUOTE);
 				break;
 			case 'i':			/* case insensitive */
 				flags->cflags |= REG_ICASE;
 				break;
 			case 'm':			/* Perloid synonym for n */
 			case 'n':			/* \n affects ^ $ . [^ */
 				flags->cflags |= REG_NEWLINE;
 				break;
 			case 'p':			/* ~Perl, \n affects . [^ */
 				flags->cflags |= REG_NLSTOP;
 				flags->cflags &= ~REG_NLANCH;
 				break;
 			case 'q':			/* literal string */
 				flags->cflags |= REG_QUOTE;
 				flags->cflags &= ~(REG_ADVANCED | REG_EXTENDED);
 				break;
 			case 's':			/* single line, \n ordinary */
 				flags->cflags &= ~REG_NEWLINE;
 				break;
 			case 't':			/* tight syntax */
 				flags->cflags &= ~REG_EXPANDED;
 				break;
 			case 'w':			/* weird, \n affects ^ $ only */
 				flags->cflags &= ~REG_NLSTOP;
 				flags->cflags |= REG_NLANCH;
 				break;
 			case 'x':			/* expanded syntax */
 				flags->cflags |= REG_EXPANDED;
 				break;
 			default:
 				/* Report the whole (possibly multibyte) offending character. */
 				ereport(ERROR,
 						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 						 errmsg("invalid regular expression option: \"%.*s\"",
 								pg_mblen(cur), cur)));
 				break;
 		}
 	}
 }

 /*
  * regexp_instr()
  *		Return the position of a regexp match within a string.
  *
  * Arguments, of which only the first two are mandatory:
  *	0 --- source string
  *	1 --- pattern
  *	2 --- 1-based start position (default 1, must be > 0)
  *	3 --- which occurrence to report (default 1, must be > 0)
  *	4 --- 0 to return the match start, 1 to return the position just
  *		  after the match (default 0)
  *	5 --- regexp option letters (NULL means defaults)
  *	6 --- subexpression number, 0 for the whole match (default 0)
  *
  * A NULL in any argument other than the flags yields NULL.  The result is
  * 0 when the requested occurrence or subexpression does not exist or has
  * no identifiable position.
  */
 Datum
 orafce_regexp_instr(PG_FUNCTION_ARGS)
 {
 	const int	nargs = PG_NARGS();
 	text	   *str;
 	text	   *pattern;
 	text	   *flags = NULL;
 	int			start = 1;
 	int			n = 1;
 	int			endoption = 0;
 	int			subexpr = 0;
 	int			slot;
 	int			loc;
 	pg_re_flags re_flags;
 	regexp_matches_ctx *matchctx;

 	if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
 		PG_RETURN_NULL();

 	str = PG_GETARG_TEXT_PP(0);
 	pattern = PG_GETARG_TEXT_PP(1);

 	/* Pick up whichever optional parameters this signature provides. */
 	if (nargs > 2)
 	{
 		if (PG_ARGISNULL(2))
 			PG_RETURN_NULL();

 		start = PG_GETARG_INT32(2);
 		if (start <= 0)
 			ereport(ERROR,
 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 					 errmsg("argument 'position' must be a number greater than 0")));
 	}

 	if (nargs > 3)
 	{
 		if (PG_ARGISNULL(3))
 			PG_RETURN_NULL();

 		n = PG_GETARG_INT32(3);
 		if (n <= 0)
 			ereport(ERROR,
 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 					 errmsg("argument 'occurence' must be a number greater than 0")));
 	}

 	if (nargs > 4)
 	{
 		if (PG_ARGISNULL(4))
 			PG_RETURN_NULL();

 		endoption = PG_GETARG_INT32(4);
 		if (endoption != 0 && endoption != 1)
 			ereport(ERROR,
 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 					 errmsg("argument 'return_opt' must be 0 or 1")));
 	}

 	/* A NULL flags argument just means "use the defaults". */
 	if (nargs > 5 && !PG_ARGISNULL(5))
 		flags = PG_GETARG_TEXT_PP(5);

 	if (nargs > 6)
 	{
 		if (PG_ARGISNULL(6))
 			PG_RETURN_NULL();

 		subexpr = PG_GETARG_INT32(6);
 		if (subexpr < 0)
 			ereport(ERROR,
 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 					 errmsg("argument 'group' must be a positive number")));
 	}

 	/* Decode the options, then force a global search: we need every match. */
 	parse_re_flags(&re_flags, flags);
 	re_flags.glob = true;

 	/* Collect all matches, with submatches only if one was asked for. */
 	matchctx = setup_regexp_matches(str, pattern, &re_flags, start - 1,
 									PG_GET_COLLATION(),
 									(subexpr > 0),
 									false, false);

 	/*
 	 * Report 0 when there are fewer than n matches (including none at all)
 	 * or when the pattern has fewer subexpressions than requested.
 	 */
 	if (n > matchctx->nmatches || subexpr > matchctx->npatterns)
 		PG_RETURN_INT32(0);

 	/*
 	 * Each match owns npatterns (start, end) pairs in match_locs.  Find the
 	 * pair for the wanted occurrence and subexpression, then choose its
 	 * start or end element.
 	 */
 	slot = (n - 1) * matchctx->npatterns;
 	if (subexpr > 0)
 		slot += subexpr - 1;
 	slot = slot * 2 + (endoption == 1 ? 1 : 0);

 	loc = matchctx->match_locs[slot];
 	if (loc < 0)
 		PG_RETURN_INT32(0);		/* position not identifiable */

 	/* Stored positions are 0-based; the SQL result is 1-based. */
 	PG_RETURN_INT32(loc + 1);
 }

 /*
  * Forwards unchanged to orafce_regexp_instr(), which works out from
  * PG_NARGS() which optional arguments are present.  It is a separate symbol
  * to keep the opr_sanity regression test from complaining.
  */
 Datum
 orafce_regexp_instr_no_start(PG_FUNCTION_ARGS)
 {
 	Datum		result = orafce_regexp_instr(fcinfo);

 	return result;
 }

 /*
  * Forwards unchanged to orafce_regexp_instr(), which works out from
  * PG_NARGS() which optional arguments are present.  It is a separate symbol
  * to keep the opr_sanity regression test from complaining.
  */
 Datum
 orafce_regexp_instr_no_n(PG_FUNCTION_ARGS)
 {
 	Datum		result = orafce_regexp_instr(fcinfo);

 	return result;
 }

 /*
  * Forwards unchanged to orafce_regexp_instr(), which works out from
  * PG_NARGS() which optional arguments are present.  It is a separate symbol
  * to keep the opr_sanity regression test from complaining.
  */
 Datum
 orafce_regexp_instr_no_endoption(PG_FUNCTION_ARGS)
 {
 	Datum		result = orafce_regexp_instr(fcinfo);

 	return result;
 }

 /*
  * Forwards unchanged to orafce_regexp_instr(), which works out from
  * PG_NARGS() which optional arguments are present.  It is a separate symbol
  * to keep the opr_sanity regression test from complaining.
  */
 Datum
 orafce_regexp_instr_no_flags(PG_FUNCTION_ARGS)
 {
 	Datum		result = orafce_regexp_instr(fcinfo);

 	return result;
 }

 /*
  * Forwards unchanged to orafce_regexp_instr(), which works out from
  * PG_NARGS() which optional arguments are present.  It is a separate symbol
  * to keep the opr_sanity regression test from complaining.
  */
 Datum
 orafce_regexp_instr_no_subexpr(PG_FUNCTION_ARGS)
 {
 	Datum		result = orafce_regexp_instr(fcinfo);

 	return result;
 }

 /*
  * textregexreplace_noopt()
  *		Return the source string with regexp matches replaced.
  *
  * This version has no option argument: it uses the default REG_ADVANCED
  * flags, searches from the start of the string, and replaces every match
  * (n = 0 is passed to orafce_replace_text_regexp).
  *
  * NULL handling: a NULL pattern returns the source string unchanged when
  * the source is not NULL; any other NULL argument yields NULL.
  */
 Datum
 orafce_textregexreplace_noopt(PG_FUNCTION_ARGS)
 {
 	text	   *source;
 	text	   *pattern;
 	text	   *replacement;
 	text	   *result;

 	if (PG_ARGISNULL(1))
 	{
 		/* No pattern: hand back the source as is, or NULL if it is NULL. */
 		if (PG_ARGISNULL(0))
 			PG_RETURN_NULL();
 		PG_RETURN_TEXT_P(PG_GETARG_TEXT_PP(0));
 	}

 	if (PG_ARGISNULL(0) || PG_ARGISNULL(2))
 		PG_RETURN_NULL();

 	source = PG_GETARG_TEXT_PP(0);
 	pattern = PG_GETARG_TEXT_PP(1);
 	replacement = PG_GETARG_TEXT_PP(2);

 	result = orafce_replace_text_regexp(source, pattern, replacement,
 										REG_ADVANCED, PG_GET_COLLATION(),
 										0, 0);

 	PG_RETURN_TEXT_P(result);
 }

 /*
  * textregexreplace()
  *		Return the source string with regexp matches replaced, honoring the
  *		option letters given as fourth argument.
  *
  * Every match is replaced, searching from the start of the string (0, 0 is
  * passed to orafce_replace_text_regexp); only the compile flags derived
  * from the options are used.
  *
  * NULL handling: NULL in argument 3 (or 4, should one be supplied) yields
  * NULL.  Otherwise a NULL pattern returns the source unchanged when the
  * source is not NULL, and any other NULL argument yields NULL.
  */
 Datum
 orafce_textregexreplace(PG_FUNCTION_ARGS)
 {
 	const int	nargs = PG_NARGS();
 	text	   *source;
 	text	   *pattern;
 	text	   *replacement;
 	text	   *opt = NULL;
 	text	   *result;
 	pg_re_flags flags;

 	/* These NULLs win over the NULL-pattern special case below. */
 	if (nargs > 3 && PG_ARGISNULL(3))
 		PG_RETURN_NULL();
 	if (nargs > 4 && PG_ARGISNULL(4))
 		PG_RETURN_NULL();

 	/*
 	 * REGEXP_REPLACE special case: a NULL pattern gives back the original
 	 * value, provided that value is not NULL itself.
 	 */
 	if (PG_ARGISNULL(1))
 	{
 		if (PG_ARGISNULL(0))
 			PG_RETURN_NULL();
 		PG_RETURN_TEXT_P(PG_GETARG_TEXT_PP(0));
 	}

 	if (PG_ARGISNULL(0) || PG_ARGISNULL(2))
 		PG_RETURN_NULL();

 	source = PG_GETARG_TEXT_PP(0);
 	pattern = PG_GETARG_TEXT_PP(1);
 	replacement = PG_GETARG_TEXT_PP(2);

 	if (!PG_ARGISNULL(3))
 		opt = PG_GETARG_TEXT_PP(3);

 	/*
 	 * A four-argument regexp_replace() call resolves to this form when the
 	 * fourth argument is of type UNKNOWN, although the user may have meant
 	 * textregexreplace_extended_no_n.  If the options start with a digit,
 	 * raise the error parse_re_flags would raise, plus a HINT on the fix.
 	 */
 	if (opt && VARSIZE_ANY_EXHDR(opt) > 0)
 	{
 		char	   *first = VARDATA_ANY(opt);

 		if (*first >= '0' && *first <= '9')
 			ereport(ERROR,
 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 					 errmsg("invalid regular expression option: \"%.*s\"",
 							pg_mblen(first), first),
 					 errhint("If you meant to use regexp_replace() with a start parameter, cast the fourth argument to integer explicitly.")));
 	}

 	parse_re_flags(&flags, opt);

 	result = orafce_replace_text_regexp(source, pattern, replacement,
 										flags.cflags, PG_GET_COLLATION(),
 										0, 0);

 	PG_RETURN_TEXT_P(result);
 }

 /*
  * textregexreplace_extended()
  *		Return the source string with regexp matches replaced.
  *		Extends textregexreplace by accepting a start position and the
  *		occurrence to replace (0 means all occurrences).
  *
  * Arguments beyond the first three are optional:
  *	3 --- 1-based start position (default 1, must be > 0)
  *	4 --- occurrence to replace, must be >= 0; all occurrences are replaced
  *		  when this argument is not part of the call
  *	5 --- regexp option letters; the 'g' modifier is rejected
  *
  * NULL handling: a NULL start position or occurrence yields NULL.
  * Otherwise a NULL pattern returns the source unchanged when the source is
  * not NULL, and any other NULL among the first three arguments yields NULL.
  */
 Datum
 orafce_textregexreplace_extended(PG_FUNCTION_ARGS)
 {
 	const int	nargs = PG_NARGS();
 	text	   *source;
 	text	   *pattern;
 	text	   *replacement;
 	text	   *flags = NULL;
 	text	   *result;
 	int			start = 1;
 	int			n = 0;			/* replace all unless an occurrence is given */
 	pg_re_flags re_flags;

 	/* These NULLs win over the NULL-pattern special case below. */
 	if (nargs > 3 && PG_ARGISNULL(3))
 		PG_RETURN_NULL();
 	if (nargs > 4 && PG_ARGISNULL(4))
 		PG_RETURN_NULL();

 	/*
 	 * REGEXP_REPLACE special case: a NULL pattern gives back the original
 	 * value, provided that value is not NULL itself.
 	 */
 	if (PG_ARGISNULL(1))
 	{
 		if (PG_ARGISNULL(0))
 			PG_RETURN_NULL();
 		PG_RETURN_TEXT_P(PG_GETARG_TEXT_PP(0));
 	}

 	if (PG_ARGISNULL(0) || PG_ARGISNULL(2))
 		PG_RETURN_NULL();

 	source = PG_GETARG_TEXT_PP(0);
 	pattern = PG_GETARG_TEXT_PP(1);
 	replacement = PG_GETARG_TEXT_PP(2);

 	/* Pick up whichever optional parameters this signature provides. */
 	if (nargs > 3)
 	{
 		start = PG_GETARG_INT32(3);
 		if (start <= 0)
 			ereport(ERROR,
 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 					 errmsg("argument 'position' must be a number greater than 0")));
 	}

 	/*
 	 * When no occurrence is specified, n stays 0 and all matches are
 	 * replaced, which is the default in Oracle.
 	 */
 	if (nargs > 4)
 	{
 		n = PG_GETARG_INT32(4);
 		if (n < 0)
 			ereport(ERROR,
 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 					 errmsg("argument 'occurrence' must be a positive number")));
 	}

 	if (nargs > 5 && !PG_ARGISNULL(5))
 		flags = PG_GETARG_TEXT_PP(5);

 	/* Determine options */
 	parse_re_flags(&re_flags, flags);

 	/* The global modifier is not allowed with Oracle */
 	if (re_flags.glob)
 		ereport(ERROR,
 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 				 errmsg("modifier 'g' is not supported by this function")));

 	/* Do the replacement(s); the start position becomes 0-based here. */
 	result = orafce_replace_text_regexp(source, pattern, replacement,
 										re_flags.cflags, PG_GET_COLLATION(),
 										start - 1, n);

 	PG_RETURN_TEXT_P(result);
 }

 /*
  * Forwards unchanged to orafce_textregexreplace_extended(), which works out
  * from PG_NARGS() which optional arguments are present.  It is a separate
  * symbol to keep the opr_sanity regression test from complaining.
  */
 Datum
 orafce_textregexreplace_extended_no_n(PG_FUNCTION_ARGS)
 {
 	Datum		result = orafce_textregexreplace_extended(fcinfo);

 	return result;
 }

 /*
  * Forwards unchanged to orafce_textregexreplace_extended(), which works out
  * from PG_NARGS() which optional arguments are present.  It is a separate
  * symbol to keep the opr_sanity regression test from complaining.
  */
 Datum
 orafce_textregexreplace_extended_no_flags(PG_FUNCTION_ARGS)
 {
 	Datum		result = orafce_textregexreplace_extended(fcinfo);

 	return result;
 }