regexp.c

#include "postgres.h"

#include "catalog/pg_type.h"
#include "funcapi.h"
#include "miscadmin.h"
#include "regex/regex.h"
#include "utils/array.h"
#include "utils/builtins.h"
#include "utils/memutils.h"

#if PG_VERSION_NUM >= 150000

#include "utils/varlena.h"

#endif

#include "orafce.h"
#include "builtins.h"

/*
 * Options that matter to the regex functions in this file.
 */
typedef struct pg_re_flags
{
	/* compile flags for Spencer's regex code */
	int			cflags;

	/* do it globally (for each occurrence) */
	bool		glob;
} pg_re_flags;

/*
 * State kept across calls by the regexp_match and regexp_split functions.
 *
 * match_locs stores, for each match, a start char index and an end+1 char
 * index, so it holds nmatches * npatterns * 2 entries.  elems and nulls are
 * workspace for build_regexp_match_result().
 */
typedef struct regexp_matches_ctx
{
	/* data string in original TEXT form */
	text	   *orig_str;

	/* number of places where pattern matched */
	int			nmatches;

	/* number of capturing subpatterns */
	int			npatterns;

	/* 0-based character indexes */
	int		   *match_locs;

	/* 0-based index of next match to process */
	int			next_match;

	/* has npatterns elements */
	Datum	   *elems;

	/* has npatterns elements */
	bool	   *nulls;

	/* wide-char version of original string */
	pg_wchar   *wide_str;

	/* conversion buffer, if needed */
	char	   *conv_buf;

	/* size thereof */
	int			conv_bufsiz;
} regexp_matches_ctx;

/*
 * Backport code from PostgreSQL 15
 */

/*
 * SQL-callable entry points defined in this file.  Each one is announced
 * with PostgreSQL's PG_FUNCTION_INFO_V1 macro.
 */

/* regexp_instr() and its reduced-argument variants */
PG_FUNCTION_INFO_V1(orafce_regexp_instr);
PG_FUNCTION_INFO_V1(orafce_regexp_instr_no_start);
PG_FUNCTION_INFO_V1(orafce_regexp_instr_no_n);
PG_FUNCTION_INFO_V1(orafce_regexp_instr_no_endoption);
PG_FUNCTION_INFO_V1(orafce_regexp_instr_no_flags);
PG_FUNCTION_INFO_V1(orafce_regexp_instr_no_subexpr);
/* regexp_replace() and its variants */
PG_FUNCTION_INFO_V1(orafce_textregexreplace_noopt);
PG_FUNCTION_INFO_V1(orafce_textregexreplace);
PG_FUNCTION_INFO_V1(orafce_textregexreplace_extended);
PG_FUNCTION_INFO_V1(orafce_textregexreplace_extended_no_n);
PG_FUNCTION_INFO_V1(orafce_textregexreplace_extended_no_flags);

#if PG_VERSION_NUM <  120000


/*
 * This is the maximum number of cached regular expressions.  It can be
 * overridden at build time by defining MAX_CACHED_RES beforehand.
 */
#ifndef MAX_CACHED_RES
#define MAX_CACHED_RES	32
#endif

/*
 * This structure describes one cached regular expression.  An entry is
 * identified by the combination of pattern bytes, compile flags and
 * collation (see the lookup loop in RE_compile_and_cache).
 */
typedef struct cached_re_str
{
	char	   *cre_pat;		/* original RE (not null terminated!) */
	int			cre_pat_len;	/* length of original RE, in bytes */
	int			cre_flags;		/* compile flags: extended,icase etc */
	Oid			cre_collation;	/* collation to use */
	regex_t		cre_re;			/* the compiled regular expression */
} cached_re_str;

/*
 * The cache itself: re_array[0 .. num_res - 1] are the valid entries.
 * RE_compile_and_cache keeps the most recently used entry at index 0.
 */
static int	num_res = 0;		/* # of cached re's */
static cached_re_str re_array[MAX_CACHED_RES];	/* cached re's */


/*
 * RE_compile_and_cache - return a compiled RE, taken from the cache when
 * possible and compiled (then cached) otherwise
 *
 * Returns regex_t *
 *
 *	text_re --- the pattern, expressed as a TEXT object
 *	cflags --- compile options for the pattern
 *	collation --- collation to use for LC_CTYPE-dependent behavior
 *
 * The pattern arrives in the database encoding.  It is converted to an
 * array of pg_wchar before compilation, which is what Spencer's regex
 * package wants.  An invalid pattern or an out-of-memory condition is
 * reported through ereport(ERROR).
 */
static regex_t *
RE_compile_and_cache(text *text_re, int cflags, Oid collation)
{
	char	   *pat_bytes = VARDATA_ANY(text_re);
	int			pat_nbytes = VARSIZE_ANY_EXHDR(text_re);
	cached_re_str entry;
	pg_wchar   *wide_pat;
	int			wide_len;
	int			rc;
	int			slot;
	char		errbuf[100];

	/*
	 * Probe the cache front to back.  Hits are moved to the front, so the
	 * most-used entries are the first ones we look at.
	 */
	for (slot = 0; slot < num_res; slot++)
	{
		cached_re_str *cand = &re_array[slot];

		if (cand->cre_pat_len != pat_nbytes)
			continue;
		if (cand->cre_flags != cflags)
			continue;
		if (cand->cre_collation != collation)
			continue;
		if (memcmp(cand->cre_pat, pat_bytes, pat_nbytes) != 0)
			continue;

		/* Cache hit: promote the entry to slot 0 unless it is there already */
		if (slot > 0)
		{
			entry = *cand;
			memmove(&re_array[1], &re_array[0], slot * sizeof(cached_re_str));
			re_array[0] = entry;
		}

		return &re_array[0].cre_re;
	}

	/*
	 * Cache miss.  Everything is assembled in the local "entry" first, so
	 * that a failure part-way through leaves the cache array untouched.
	 */

	/* Widen the pattern, compile it, and drop the widened copy again */
	wide_pat = (pg_wchar *) palloc((pat_nbytes + 1) * sizeof(pg_wchar));
	wide_len = pg_mb2wchar_with_len(pat_bytes, wide_pat, pat_nbytes);
	rc = pg_regcomp(&entry.cre_re, wide_pat, wide_len, cflags, collation);
	pfree(wide_pat);

	if (rc != REG_OKAY)
	{
		/* compilation failed, so there is nothing to pg_regfree */

		/*
		 * As elsewhere in this file, run CHECK_FOR_INTERRUPTS before
		 * reporting a regex error: if the regex library bailed out with
		 * REG_CANCEL we must not claim that the pattern was invalid.
		 */
		CHECK_FOR_INTERRUPTS();

		pg_regerror(rc, &entry.cre_re, errbuf, sizeof(errbuf));
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
				 errmsg("invalid regular expression: %s", errbuf)));
	}

	/*
	 * The pattern copy is malloc'd rather than palloc'd: it has to outlive
	 * the transaction, and we want control back on out-of-memory.  Max()
	 * guards against malloc implementations that return NULL for malloc(0).
	 */
	entry.cre_pat = malloc(Max(pat_nbytes, 1));
	if (entry.cre_pat == NULL)
	{
		pg_regfree(&entry.cre_re);
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory")));
	}
	memcpy(entry.cre_pat, pat_bytes, pat_nbytes);
	entry.cre_pat_len = pat_nbytes;
	entry.cre_flags = cflags;
	entry.cre_collation = collation;

	/* Cache full?  Then release the entry in the last slot to make room. */
	if (num_res >= MAX_CACHED_RES)
	{
		cached_re_str *victim;

		num_res--;
		Assert(num_res < MAX_CACHED_RES);
		victim = &re_array[num_res];
		pg_regfree(&victim->cre_re);
		free(victim->cre_pat);
	}

	/* Shift the surviving entries down by one and install ours at the front */
	if (num_res > 0)
		memmove(&re_array[1], &re_array[0], num_res * sizeof(cached_re_str));

	re_array[0] = entry;
	num_res++;

	return &re_array[0].cre_re;
}

#endif

#if PG_VERSION_NUM <  150000

/*
 * check_replace_text_has_escape
 *
 * Classify the backslash usage in a replacement text:
 *	0 - no backslashes that need processing
 *	1 - backslashes present, but no regexp submatch specifiers
 *	2 - at least one regexp submatch specifier (\1 .. \9)
 */
static int
check_replace_text_has_escape(const text *replace_text)
{
	const char *cur = VARDATA_ANY(replace_text);
	const char *limit = cur + VARSIZE_ANY_EXHDR(replace_text);
	int			status = 0;

	for (;;)
	{
		const char *backslash;
		char		follower;

		if (cur >= limit)
			break;

		/* Jump to the next escape character; stop if none is left. */
		backslash = memchr(cur, '\\', limit - cur);
		if (backslash == NULL)
			break;
		cur = backslash + 1;

		/* A backslash at the very end needs no extra processing. */
		if (cur >= limit)
			break;

		follower = *cur++;
		if (follower >= '1' && follower <= '9')
			return 2;			/* submatch specifier: nothing more to learn */

		status = 1;				/* some other sequence; keep scanning */
	}

	return status;
}

/*
 * charlen_to_bytelen()
 *	Return how many bytes the n characters starting at *p occupy.
 *
 * The caller must guarantee that n characters really are available; the
 * string does not have to be null-terminated.
 */
static int
charlen_to_bytelen(const char *p, int n)
{
	const char *cursor = p;

	/* In a single-byte encoding the character count is the byte count. */
	if (pg_database_encoding_max_length() == 1)
		return n;

	/* Otherwise step over the characters one at a time. */
	while (n > 0)
	{
		cursor += pg_mblen(cursor);
		n--;
	}

	return cursor - p;
}

/*
 * appendStringInfoText
 *
 * Append the payload bytes of the text value t to str.
 * Same effect as appendStringInfoString(str, text_to_cstring(t)), but
 * faster.
 */
static void
appendStringInfoText(StringInfo str, const text *t)
{
	const char *payload = VARDATA_ANY(t);
	int			nbytes = VARSIZE_ANY_EXHDR(t);

	appendBinaryStringInfo(str, payload, nbytes);
}

/*
 * appendStringInfoRegexpSubstr
 *
 * Append replace_text to str, expanding the escapes it contains:
 *	\1 .. \9	text matched by the corresponding subexpression
 *	\&			the entire matched text
 *	\\			a single backslash
 * A backslash followed by anything else (or by nothing) is copied through
 * as ordinary data.
 *
 * pmatch holds the match locations, counted in characters.  start_ptr
 * points into the source string at logical character position data_pos.
 */
static void
appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
							 regmatch_t *pmatch,
							 char *start_ptr, int data_pos)
{
	const char *cur = VARDATA_ANY(replace_text);
	const char *limit = cur + VARSIZE_ANY_EXHDR(replace_text);

	while (cur < limit)
	{
		const char *literal = cur;
		const regmatch_t *ref;
		char	   *sub_start;
		int			sub_len;
		int			so;
		int			eo;

		/* Locate the next escape character (or the end of the text). */
		cur = memchr(literal, '\\', limit - literal);
		if (cur == NULL)
			cur = limit;

		/* Emit the literal run we just skipped over, if there was one. */
		if (cur > literal)
			appendBinaryStringInfo(str, literal, cur - literal);

		/* Nothing left?  Otherwise step past the backslash. */
		if (cur >= limit)
			break;
		cur++;

		if (cur >= limit)
		{
			/* Lone backslash at the very end: same as an unexpected char */
			appendStringInfoChar(str, '\\');
			break;
		}

		if (*cur == '\\')
		{
			/* \\ stands for one literal backslash */
			appendStringInfoChar(str, '\\');
			cur++;
			continue;
		}

		if (*cur == '&')
		{
			/* the whole match */
			ref = &pmatch[0];
		}
		else if (*cur >= '1' && *cur <= '9')
		{
			/* a numbered back reference */
			ref = &pmatch[*cur - '0'];
		}
		else
		{
			/*
			 * Not an escape we know.  Emit the backslash and leave the
			 * following character to be copied as ordinary data on the next
			 * pass.  (XXX would it be better to throw an error?)
			 */
			appendStringInfoChar(str, '\\');
			continue;
		}
		cur++;

		so = ref->rm_so;
		eo = ref->rm_eo;

		/* Nothing to copy when either offset is negative. */
		if (so < 0 || eo < 0)
			continue;

		/*
		 * Copy the referenced text.  so and eo count characters, not bytes,
		 * so both must be converted relative to start_ptr.
		 */
		Assert(so >= data_pos);
		sub_start = start_ptr + charlen_to_bytelen(start_ptr, so - data_pos);
		sub_len = charlen_to_bytelen(sub_start, eo - so);
		appendBinaryStringInfo(str, sub_start, sub_len);
	}
}

/*
 * orafce_replace_text_regexp
 *
 * Replace substring(s) in src_text that match pattern_text with
 * replace_text.  The replace_text can contain backslash markers to
 * substitute (parts of) the matched text.
 *
 * This definition is only compiled for PostgreSQL releases before 15.
 *
 * src_text: the string to search.
 * pattern_text: the regular expression.
 * replace_text: the replacement, possibly containing \1..\9, \& and \\.
 * cflags: regexp compile flags.
 * collation: collation to use.
 * search_start: the character (not byte) offset in src_text at which to
 * begin searching.
 * n: if 0, replace all matches; if > 0, replace only the N'th match.
 *
 * Returns a newly built text value.  Regex execution failures are reported
 * through ereport(ERROR).
 */
static text *
orafce_replace_text_regexp(text *src_text, text *pattern_text,
					text *replace_text,
					int cflags, Oid collation,
					int search_start, int n)
{
	text	   *ret_text;
	regex_t    *re;
	int			src_text_len = VARSIZE_ANY_EXHDR(src_text);
	int			nmatches = 0;
	StringInfoData buf;
	regmatch_t	pmatch[10];		/* main match, plus \1 to \9 */
	int			nmatch = lengthof(pmatch);
	pg_wchar   *data;
	size_t		data_len;
	size_t		data_pos;
	char	   *start_ptr;
	int			escape_status;

	initStringInfo(&buf);

	/* Convert data string to wide characters. */
	data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
	data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);

	/* Check whether replace_text has escapes, especially regexp submatches. */
	escape_status = check_replace_text_has_escape(replace_text);

	/*
	 * REG_NOSUB doesn't work well in pre PostgreSQL 15, and this function
	 * is only built for those releases, so we always compile without it and
	 * always ask pg_regexec for all of pmatch[].
	 */

	/* Prepare the regexp. */
	re = RE_compile_and_cache(pattern_text, cflags, collation);

	/* start_ptr points to the data_pos'th character of src_text */
	start_ptr = (char *) VARDATA_ANY(src_text);
	data_pos = 0;

	while (search_start <= (int) data_len)
	{
		int			regexec_result;

		CHECK_FOR_INTERRUPTS();

		regexec_result = pg_regexec(re,
									data,
									data_len,
									search_start,
									NULL,	/* no details */
									nmatch,
									pmatch,
									0);

		if (regexec_result == REG_NOMATCH)
			break;

		if (regexec_result != REG_OKAY)
		{
			char		errMsg[100];

			CHECK_FOR_INTERRUPTS();
			pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
					 errmsg("regular expression failed: %s", errMsg)));
		}

		/*
		 * Count matches, and decide whether to replace this match.
		 */
		nmatches++;
		if (n > 0 && nmatches != n)
		{
			/*
			 * No, so advance search_start, but not start_ptr/data_pos. (Thus,
			 * we treat the matched text as if it weren't matched, and copy it
			 * to the output later.)
			 */
			search_start = pmatch[0].rm_eo;
			if (pmatch[0].rm_so == pmatch[0].rm_eo)
				search_start++;
			continue;
		}

		/*
		 * Copy the text to the left of the match position.  Note we are given
		 * character not byte indexes.
		 */
		if (pmatch[0].rm_so - data_pos > 0)
		{
			int			chunk_len;

			chunk_len = charlen_to_bytelen(start_ptr,
										   pmatch[0].rm_so - data_pos);
			appendBinaryStringInfo(&buf, start_ptr, chunk_len);

			/*
			 * Advance start_ptr over that text, to avoid multiple rescans of
			 * it if the replace_text contains multiple back-references.
			 */
			start_ptr += chunk_len;
			data_pos = pmatch[0].rm_so;
		}

		/*
		 * Copy the replace_text, processing escapes if any are present.
		 */
		if (escape_status > 0)
			appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
										 start_ptr, data_pos);
		else
			appendStringInfoText(&buf, replace_text);

		/* Advance start_ptr and data_pos over the matched text. */
		start_ptr += charlen_to_bytelen(start_ptr,
										pmatch[0].rm_eo - data_pos);
		data_pos = pmatch[0].rm_eo;

		/*
		 * If we only want to replace one occurrence, we're done.
		 */
		if (n > 0)
			break;

		/*
		 * Advance search position.  Normally we start the next search at the
		 * end of the previous match; but if the match was of zero length, we
		 * have to advance by one character, or we'd just find the same match
		 * again.
		 */
		search_start = data_pos;
		if (pmatch[0].rm_so == pmatch[0].rm_eo)
			search_start++;
	}

	/*
	 * Copy the text to the right of the last match.  start_ptr marks the
	 * first byte not yet copied, so everything up to the end of the varlena
	 * is appended.
	 */
	if (data_pos < data_len)
	{
		int			chunk_len;

		chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
		appendBinaryStringInfo(&buf, start_ptr, chunk_len);
	}

	ret_text = cstring_to_text_with_len(buf.data, buf.len);
	pfree(buf.data);
	pfree(data);

	return ret_text;
}

#else

/*
 * On PostgreSQL 15 and later no local copy is compiled; calls made under
 * the orafce_ name are redirected to replace_text_regexp instead.
 */
#define orafce_replace_text_regexp replace_text_regexp

#endif

/*
 * RE_wchar_execute - run a compiled RE against pg_wchar data
 *
 * Returns true when the pattern matched and false when it did not.  Any
 * other outcome of pg_regexec is reported through ereport(ERROR).
 *
 *	re --- the compiled pattern as returned by RE_compile_and_cache
 *	data --- the data to match against (need not be null-terminated)
 *	data_len --- the length of the data string
 *	start_search -- the offset in the data to start searching
 *	nmatch, pmatch	--- optional return area for match details
 *
 * The data is an array of pg_wchar because that is what Spencer's regex
 * package wants.
 */
static bool
RE_wchar_execute(regex_t *re, pg_wchar *data, int data_len,
				 int start_search, int nmatch, regmatch_t *pmatch)
{
	char		errbuf[100];
	int			rc;

	rc = pg_regexec(re, data, data_len, start_search,
					NULL,		/* no details */
					nmatch, pmatch, 0);

	if (rc == REG_OKAY)
		return true;
	if (rc == REG_NOMATCH)
		return false;

	/* The regex engine itself failed. */
	CHECK_FOR_INTERRUPTS();
	pg_regerror(rc, re, errbuf, sizeof(errbuf));
	ereport(ERROR,
			(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
			 errmsg("regular expression failed: %s", errbuf)));

	return false;
}


/*
 * setup_regexp_matches --- do the initial matching for regexp_match,
 *		regexp_split, and related functions
 *
 * To avoid having to re-find the compiled pattern on each call, we do
 * all the matching in one swoop.  The returned regexp_matches_ctx contains
 * the locations of all the substrings matching the pattern.
 *
 * orig_str: the data string; the returned context keeps a pointer to it.
 * pattern: the regular expression, as a TEXT object.
 * re_flags: parsed options; cflags seeds the compile flags, and glob
 * decides whether matching continues after the first match.
 * start_search: the character (not byte) offset in orig_str at which to
 * begin the search.  Returned positions are relative to orig_str anyway.
 * collation: handed to RE_compile_and_cache.
 * use_subpatterns: collect data about matches to parenthesized subexpressions.
 * ignore_degenerate: ignore zero-length matches.
 * fetching_unmatched: caller wants to fetch unmatched substrings.
 *
 * We don't currently assume that fetching_unmatched is exclusive of fetching
 * the matched text too; if it's set, the conversion buffer is large enough to
 * fetch any single matched or unmatched string, but not any larger
 * substring.  (In practice, when splitting the matches are usually small
 * anyway, and it didn't seem worth complicating the code further.)
 *
 * The result is palloc0'd.  This function fills in orig_str, nmatches,
 * npatterns, match_locs, wide_str, conv_buf and conv_bufsiz; the remaining
 * fields are left zeroed.  match_locs holds a (start, end+1) pair for every
 * stored location, followed by one extra entry containing the length of the
 * string in characters.
 */
static regexp_matches_ctx *
setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
					 int start_search,
					 Oid collation,
					 bool use_subpatterns,
					 bool ignore_degenerate,
					 bool fetching_unmatched)
{
	regexp_matches_ctx *matchctx = palloc0(sizeof(regexp_matches_ctx));
	int			eml = pg_database_encoding_max_length();
	int			orig_len;
	pg_wchar   *wide_str;
	int			wide_len;
	regex_t    *cpattern;
	regmatch_t *pmatch;
	int			pmatch_len;
	int			array_len;
	int			array_idx;
	int			prev_match_end;
	int			prev_valid_match_end;
	int			maxlen = 0;		/* largest fetch length in characters */
	int			cflags;

	/* save original string --- we'll extract result substrings from it */
	matchctx->orig_str = orig_str;

	/* convert string to pg_wchar form for matching */
	orig_len = VARSIZE_ANY_EXHDR(orig_str);
	wide_str = (pg_wchar *) palloc(sizeof(pg_wchar) * (orig_len + 1));
	wide_len = pg_mb2wchar_with_len(VARDATA_ANY(orig_str), wide_str, orig_len);

	/* set up the compiled pattern */
	cflags = re_flags->cflags;

#if PG_VERSION_NUM >=  150000

	/* REG_NOSUB doesn't work well in pre PostgreSQL 15 */

	/* so only on 15+ do we skip submatch tracking when it isn't needed */
	if (!use_subpatterns)
		cflags |= REG_NOSUB;

#endif

	cpattern = RE_compile_and_cache(pattern, cflags, collation);

	/* do we want to remember subpatterns? */
	if (use_subpatterns && cpattern->re_nsub > 0)
	{
		matchctx->npatterns = cpattern->re_nsub;
		pmatch_len = cpattern->re_nsub + 1;
	}
	else
	{
		/* no subexpressions to report: track the whole match only */
		use_subpatterns = false;
		matchctx->npatterns = 1;
		pmatch_len = 1;
	}

	/* temporary output space for RE package */
	pmatch = palloc(sizeof(regmatch_t) * pmatch_len);

	/*
	 * the real output space (grown dynamically if needed)
	 *
	 * use values 2^n-1, not 2^n, so that we hit the limit at 2^28-1 rather
	 * than at 2^27
	 */
	array_len = re_flags->glob ? 255 : 31;
	matchctx->match_locs = (int *) palloc(sizeof(int) * array_len);
	array_idx = 0;

	/* search for the pattern, perhaps repeatedly */
	prev_match_end = 0;
	prev_valid_match_end = 0;
	while (RE_wchar_execute(cpattern, wide_str, wide_len, start_search,
							pmatch_len, pmatch))
	{
		/*
		 * If requested, ignore degenerate matches, which are zero-length
		 * matches occurring at the start or end of a string or just after a
		 * previous match.
		 */
		if (!ignore_degenerate ||
			(pmatch[0].rm_so < wide_len &&
			 pmatch[0].rm_eo > prev_match_end))
		{
			/*
			 * enlarge output space if needed; the "+ 1" keeps a slot free
			 * for the end-of-string entry stored after the loop
			 */
			while (array_idx + matchctx->npatterns * 2 + 1 > array_len)
			{
				array_len += array_len + 1; /* 2^n-1 => 2^(n+1)-1 */
				if (array_len > (int) (MaxAllocSize / sizeof(int)))
					ereport(ERROR,
							(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
							 errmsg("too many regular expression matches")));
				matchctx->match_locs = (int *) repalloc(matchctx->match_locs,
														sizeof(int) * array_len);
			}

			/* save this match's locations */
			if (use_subpatterns)
			{
				int			i;

				/* one (start, end) pair per subexpression; pmatch[0] is skipped */
				for (i = 1; i <= matchctx->npatterns; i++)
				{
					int			so = pmatch[i].rm_so;
					int			eo = pmatch[i].rm_eo;

					matchctx->match_locs[array_idx++] = so;
					matchctx->match_locs[array_idx++] = eo;
					if (so >= 0 && eo >= 0 && (eo - so) > maxlen)
						maxlen = (eo - so);
				}
			}
			else
			{
				/* a single (start, end) pair for the whole match */
				int			so = pmatch[0].rm_so;
				int			eo = pmatch[0].rm_eo;

				matchctx->match_locs[array_idx++] = so;
				matchctx->match_locs[array_idx++] = eo;
				if (so >= 0 && eo >= 0 && (eo - so) > maxlen)
					maxlen = (eo - so);
			}
			matchctx->nmatches++;

			/*
			 * check length of unmatched portion between end of previous valid
			 * (nondegenerate, or degenerate but not ignored) match and start
			 * of current one
			 */
			if (fetching_unmatched &&
				pmatch[0].rm_so >= 0 &&
				(pmatch[0].rm_so - prev_valid_match_end) > maxlen)
				maxlen = (pmatch[0].rm_so - prev_valid_match_end);
			prev_valid_match_end = pmatch[0].rm_eo;
		}
		prev_match_end = pmatch[0].rm_eo;

		/* if not glob, stop after one match */
		if (!re_flags->glob)
			break;

		/*
		 * Advance search position.  Normally we start the next search at the
		 * end of the previous match; but if the match was of zero length, we
		 * have to advance by one character, or we'd just find the same match
		 * again.
		 */
		start_search = prev_match_end;
		if (pmatch[0].rm_so == pmatch[0].rm_eo)
			start_search++;
		if (start_search > wide_len)
			break;
	}

	/*
	 * check length of unmatched portion between end of last match and end of
	 * input string
	 */
	if (fetching_unmatched &&
		(wide_len - prev_valid_match_end) > maxlen)
		maxlen = (wide_len - prev_valid_match_end);

	/*
	 * Keep a note of the end position of the string for the benefit of
	 * splitting code.
	 */
	matchctx->match_locs[array_idx] = wide_len;

	if (eml > 1)
	{
		int64		maxsiz = eml * (int64) maxlen;
		int			conv_bufsiz;

		/*
		 * Make the conversion buffer large enough for any substring of
		 * interest.
		 *
		 * Worst case: assume we need the maximum size (maxlen*eml), but take
		 * advantage of the fact that the original string length in bytes is
		 * an upper bound on the byte length of any fetched substring (and we
		 * know that len+1 is safe to allocate because the varlena header is
		 * longer than 1 byte).
		 */
		if (maxsiz > orig_len)
			conv_bufsiz = orig_len + 1;
		else
			conv_bufsiz = maxsiz + 1;	/* safe since maxsiz < 2^30 */

		matchctx->conv_buf = palloc(conv_bufsiz);
		matchctx->conv_bufsiz = conv_bufsiz;
		matchctx->wide_str = wide_str;
	}
	else
	{
		/* No need to keep the wide string if we're in a single-byte charset. */
		pfree(wide_str);
		matchctx->wide_str = NULL;
		matchctx->conv_buf = NULL;
		matchctx->conv_bufsiz = 0;
	}

	/* Clean up temp storage */
	pfree(pmatch);

	return matchctx;
}

/*
 * parse_re_flags - parse the options argument of regexp_match and friends
 *
 *	flags --- output argument, filled with desired options
 *	opts --- TEXT object, or NULL for defaults
 *
 * The defaults are REG_ADVANCED compile flags and no global matching.  Each
 * byte of opts is then applied in order, so a later letter overrides an
 * earlier conflicting one.  An unrecognized letter is reported through
 * ereport(ERROR).
 *
 * This accepts all the options allowed by any of the callers; callers that
 * don't want some have to reject them after the fact.
 */
static void
parse_re_flags(pg_re_flags *flags, text *opts)
{
	/* regex flavor is always folded into the compile flags */
	flags->cflags = REG_ADVANCED;
	flags->glob = false;

	if (opts)
	{
		char	   *opt_p = VARDATA_ANY(opts);
		int			opt_len = VARSIZE_ANY_EXHDR(opts);
		int			i;

		for (i = 0; i < opt_len; i++)
		{
			switch (opt_p[i])
			{
				case 'g':
					flags->glob = true;
					break;
				case 'b':		/* BREs (but why???) */
					flags->cflags &= ~(REG_ADVANCED | REG_EXTENDED | REG_QUOTE);
					break;
				case 'c':		/* case sensitive */
					flags->cflags &= ~REG_ICASE;
					break;
				case 'e':		/* plain EREs */

					/*
					 * Clear the flavor bits first and set REG_EXTENDED
					 * afterwards.  In PostgreSQL's regex.h REG_ADVANCED
					 * contains the REG_EXTENDED bit, so doing it the other
					 * way round would wipe out the bit just set and leave
					 * us with a BRE.
					 */
					flags->cflags &= ~(REG_ADVANCED | REG_QUOTE);
					flags->cflags |= REG_EXTENDED;
					break;
				case 'i':		/* case insensitive */
					flags->cflags |= REG_ICASE;
					break;
				case 'm':		/* Perloid synonym for n */
				case 'n':		/* \n affects ^ $ . [^ */
					flags->cflags |= REG_NEWLINE;
					break;
				case 'p':		/* ~Perl, \n affects . [^ */
					flags->cflags |= REG_NLSTOP;
					flags->cflags &= ~REG_NLANCH;
					break;
				case 'q':		/* literal string */
					flags->cflags |= REG_QUOTE;
					flags->cflags &= ~(REG_ADVANCED | REG_EXTENDED);
					break;
				case 's':		/* single line, \n ordinary */
					flags->cflags &= ~REG_NEWLINE;
					break;
				case 't':		/* tight syntax */
					flags->cflags &= ~REG_EXPANDED;
					break;
				case 'w':		/* weird, \n affects ^ $ only */
					flags->cflags &= ~REG_NLSTOP;
					flags->cflags |= REG_NLANCH;
					break;
				case 'x':		/* expanded syntax */
					flags->cflags |= REG_EXPANDED;
					break;
				default:
					/* print the whole (possibly multibyte) offending character */
					ereport(ERROR,
							(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
							 errmsg("invalid regular expression option: \"%.*s\"",
									pg_mblen(opt_p + i), opt_p + i)));
					break;
			}
		}
	}
}

/*
 * regexp_instr()
 *		Return the match's position within the string
 *
 * Arguments, in order: string, pattern, start position (default 1),
 * occurrence (default 1), return option (default 0; 1 asks for the position
 * just after the match), flags, subexpression number (default 0, meaning
 * the whole match).
 *
 * A NULL string, pattern, start, occurrence, return option or subexpression
 * yields NULL; NULL flags just mean "no options".  Positions are 1-based,
 * and 0 is returned when the requested match or subexpression cannot be
 * located.
 */
Datum
orafce_regexp_instr(PG_FUNCTION_ARGS)
{
	const int	nargs = PG_NARGS();
	text	   *source;
	text	   *regex;
	text	   *opts = NULL;
	int			from = 1;
	int			occurrence = 1;
	int			want_end = 0;
	int			group = 0;
	int			slot;
	int			loc;
	pg_re_flags parsed;
	regexp_matches_ctx *ctx;

	if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
		PG_RETURN_NULL();

	source = PG_GETARG_TEXT_PP(0);
	regex = PG_GETARG_TEXT_PP(1);

	/* Pick up whichever optional arguments this call supplies */
	if (nargs > 2)
	{
		if (PG_ARGISNULL(2))
			PG_RETURN_NULL();

		from = PG_GETARG_INT32(2);
		if (from <= 0)
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("argument 'position' must be a number greater than 0")));
	}

	if (nargs > 3)
	{
		if (PG_ARGISNULL(3))
			PG_RETURN_NULL();

		occurrence = PG_GETARG_INT32(3);
		if (occurrence <= 0)
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("argument 'occurence' must be a number greater than 0")));
	}

	if (nargs > 4)
	{
		if (PG_ARGISNULL(4))
			PG_RETURN_NULL();

		want_end = PG_GETARG_INT32(4);
		if (want_end != 0 && want_end != 1)
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("argument 'return_opt' must be 0 or 1")));
	}

	if (nargs > 5 && !PG_ARGISNULL(5))
		opts = PG_GETARG_TEXT_PP(5);

	if (nargs > 6)
	{
		if (PG_ARGISNULL(6))
			PG_RETURN_NULL();

		group = PG_GETARG_INT32(6);
		if (group < 0)
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("argument 'group' must be a positive number")));
	}

	/* Work out the options; all matches are collected whatever they say */
	parse_re_flags(&parsed, opts);
	parsed.glob = true;

	/* Run the matching; submatches are only needed for a specific group */
	ctx = setup_regexp_matches(source, regex, &parsed, from - 1,
							   PG_GET_COLLATION(),
							   (group > 0),
							   false, false);

	/* Fewer matches than requested (this covers "no match at all") */
	if (occurrence > ctx->nmatches)
		PG_RETURN_INT32(0);

	/* Fewer subexpressions than requested */
	if (group > ctx->npatterns)
		PG_RETURN_INT32(0);

	/*
	 * Locate the wanted entry: npatterns pairs per match, two ints per
	 * pair, the end position being the second int of the pair.
	 */
	slot = (occurrence - 1) * ctx->npatterns;
	if (group > 0)
		slot += group - 1;
	slot = 2 * slot + (want_end == 1 ? 1 : 0);

	loc = ctx->match_locs[slot];
	if (loc < 0)
		PG_RETURN_INT32(0);		/* position not identifiable */

	PG_RETURN_INT32(loc + 1);
}

/*
 * Separate entry point that just forwards to orafce_regexp_instr; it exists
 * to keep the opr_sanity regression test from complaining.
 */
Datum
orafce_regexp_instr_no_start(PG_FUNCTION_ARGS)
{
	Datum		result = orafce_regexp_instr(fcinfo);

	return result;
}

/*
 * Separate entry point that just forwards to orafce_regexp_instr; it exists
 * to keep the opr_sanity regression test from complaining.
 */
Datum
orafce_regexp_instr_no_n(PG_FUNCTION_ARGS)
{
	Datum		result = orafce_regexp_instr(fcinfo);

	return result;
}

/*
 * Separate entry point that just forwards to orafce_regexp_instr; it exists
 * to keep the opr_sanity regression test from complaining.
 */
Datum
orafce_regexp_instr_no_endoption(PG_FUNCTION_ARGS)
{
	Datum		result = orafce_regexp_instr(fcinfo);

	return result;
}

/*
 * Separate entry point that just forwards to orafce_regexp_instr; it exists
 * to keep the opr_sanity regression test from complaining.
 */
Datum
orafce_regexp_instr_no_flags(PG_FUNCTION_ARGS)
{
	Datum		result = orafce_regexp_instr(fcinfo);

	return result;
}

/*
 * Separate entry point that just forwards to orafce_regexp_instr; it exists
 * to keep the opr_sanity regression test from complaining.
 */
Datum
orafce_regexp_instr_no_subexpr(PG_FUNCTION_ARGS)
{
	Datum		result = orafce_regexp_instr(fcinfo);

	return result;
}

/*
 * textregexreplace_noopt()
 *		Return a string matched by a regular expression, with replacement.
 *
 * This version has no option argument: the pattern is compiled with plain
 * REG_ADVANCED flags, the search starts at the beginning of the string, and
 * the occurrence argument passed down is 0, i.e. every match is replaced.
 *
 * A NULL pattern with a non-NULL source returns the source unchanged; any
 * other NULL among the three arguments yields NULL.
 */
Datum
orafce_textregexreplace_noopt(PG_FUNCTION_ARGS)
{
	bool		src_is_null = PG_ARGISNULL(0);
	bool		pat_is_null = PG_ARGISNULL(1);
	text	   *src;
	text	   *pat;
	text	   *repl;
	text	   *result;

	if (pat_is_null && !src_is_null)
		PG_RETURN_TEXT_P(PG_GETARG_TEXT_PP(0));

	if (src_is_null || pat_is_null || PG_ARGISNULL(2))
		PG_RETURN_NULL();

	src = PG_GETARG_TEXT_PP(0);
	pat = PG_GETARG_TEXT_PP(1);
	repl = PG_GETARG_TEXT_PP(2);

	result = orafce_replace_text_regexp(src, pat, repl,
										REG_ADVANCED, PG_GET_COLLATION(),
										0, 0);

	PG_RETURN_TEXT_P(result);
}

/*
 * textregexreplace()
 *		Return a string matched by a regular expression, with replacement.
 *
 * Arguments, in order: source string, pattern, replacement, flags.  The
 * search starts at the beginning of the string and the occurrence argument
 * passed down is 0, i.e. every match is replaced.
 *
 * NULL handling: a NULL fourth (or fifth) argument yields NULL; otherwise a
 * NULL pattern with a non-NULL source returns the source unchanged; any
 * other NULL among the first three arguments yields NULL.
 */
Datum
orafce_textregexreplace(PG_FUNCTION_ARGS)
{
	text	   *s;
	text	   *p;
	text	   *r;
	text	   *opt = NULL;
	pg_re_flags flags;

	/* Always return NULL when a supplied argument past the third is NULL */
	if (PG_NARGS() > 3 && PG_ARGISNULL(3))
		PG_RETURN_NULL();
	if (PG_NARGS() > 4 && PG_ARGISNULL(4))
		PG_RETURN_NULL();

	/*
	 * Special case for second parameter in REGEXP_REPLACE, when NULL
	 * returns the original value unless one of the arguments checked above
	 * is NULL too. In this case, it returns NULL (see instruction above).
	 */
	if (PG_ARGISNULL(1) && !PG_ARGISNULL(0))
		PG_RETURN_TEXT_P(PG_GETARG_TEXT_PP(0));

	if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2))
		PG_RETURN_NULL();

	s = PG_GETARG_TEXT_PP(0);
	p = PG_GETARG_TEXT_PP(1);
	r = PG_GETARG_TEXT_PP(2);

	/*
	 * Only look at the flags argument when the call actually carries one,
	 * consistent with the PG_NARGS() guard used for the NULL check above.
	 */
	if (PG_NARGS() > 3 && !PG_ARGISNULL(3))
		opt = PG_GETARG_TEXT_PP(3);

	/*
	 * regexp_replace() with four arguments will be preferentially resolved as
	 * this form when the fourth argument is of type UNKNOWN.  However, the
	 * user might have intended to call textregexreplace_extended_no_n.  If we
	 * see flags that look like an integer, emit the same error that
	 * parse_re_flags would, but add a HINT about how to fix it.
	 */
	if (opt && VARSIZE_ANY_EXHDR(opt) > 0)
	{
		char	   *opt_p = VARDATA_ANY(opt);

		if (*opt_p >= '0' && *opt_p <= '9')
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("invalid regular expression option: \"%.*s\"",
							pg_mblen(opt_p), opt_p),
					 errhint("If you meant to use regexp_replace() with a start parameter, cast the fourth argument to integer explicitly.")));
	}

	parse_re_flags(&flags, opt);

	PG_RETURN_TEXT_P(orafce_replace_text_regexp(s, p, r,
										 flags.cflags, PG_GET_COLLATION(),
										 0, 0));
}

/*
 * textregexreplace_extended()
 *		Return a string matched by a regular expression, with replacement.
 *		Extends textregexreplace by allowing a start position and the
 *		choice of the occurrence to replace (0 means all occurrences).
 *
 * Arguments, in order: source string, pattern, replacement, start position
 * (default 1), occurrence (all matches when not supplied), flags.
 *
 * NULL handling: a NULL start position or occurrence yields NULL; otherwise
 * a NULL pattern with a non-NULL source returns the source unchanged; any
 * other NULL among the first three arguments yields NULL.  NULL flags just
 * mean "no options".
 */
Datum
orafce_textregexreplace_extended(PG_FUNCTION_ARGS)
{
	const int	nargs = PG_NARGS();
	text	   *src;
	text	   *pat;
	text	   *repl;
	text	   *opts = NULL;
	int			from = 1;
	int			occurrence = 0; /* stays 0 (= all) unless the caller gives one */
	pg_re_flags parsed;

	/* A NULL start position or occurrence always produces NULL */
	if (nargs > 3 && PG_ARGISNULL(3))
		PG_RETURN_NULL();
	if (nargs > 4 && PG_ARGISNULL(4))
		PG_RETURN_NULL();

	/*
	 * REGEXP_REPLACE special case: with a NULL pattern the original value
	 * comes back unchanged (the start position and occurrence cannot be NULL
	 * any more at this point, see above).
	 */
	if (PG_ARGISNULL(1) && !PG_ARGISNULL(0))
		PG_RETURN_TEXT_P(PG_GETARG_TEXT_PP(0));

	if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2))
		PG_RETURN_NULL();

	src = PG_GETARG_TEXT_PP(0);
	pat = PG_GETARG_TEXT_PP(1);
	repl = PG_GETARG_TEXT_PP(2);

	/* Pick up whichever optional arguments this call supplies */
	if (nargs > 3)
	{
		from = PG_GETARG_INT32(3);
		if (from <= 0)
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("argument 'position' must be a number greater than 0")));
	}

	if (nargs > 4)
	{
		occurrence = PG_GETARG_INT32(4);
		if (occurrence < 0)
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("argument 'occurrence' must be a positive number")));
	}

	if (nargs > 5 && !PG_ARGISNULL(5))
		opts = PG_GETARG_TEXT_PP(5);

	/* Work out the options */
	parse_re_flags(&parsed, opts);

	/* Oracle has no global modifier, so refuse it here */
	if (parsed.glob)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("modifier 'g' is not supported by this function")));

	/*
	 * Without an explicit occurrence argument, occurrence is still 0 here,
	 * so every match gets replaced.  This is the default in Oracle when no
	 * occurrence is specified.
	 */
	PG_RETURN_TEXT_P(orafce_replace_text_regexp(src, pat, repl,
												parsed.cflags,
												PG_GET_COLLATION(),
												from - 1, occurrence));
}

/*
 * Separate entry point that just forwards to
 * orafce_textregexreplace_extended; it exists to keep the opr_sanity
 * regression test from complaining.
 */
Datum
orafce_textregexreplace_extended_no_n(PG_FUNCTION_ARGS)
{
	Datum		result = orafce_textregexreplace_extended(fcinfo);

	return result;
}

/*
 * Separate entry point that just forwards to
 * orafce_textregexreplace_extended; it exists to keep the opr_sanity
 * regression test from complaining.
 */
Datum
orafce_textregexreplace_extended_no_flags(PG_FUNCTION_ARGS)
{
	Datum		result = orafce_textregexreplace_extended(fcinfo);

	return result;
}