/* * textregexreplace() * Return a string matched by a regular expression, with replacement. */ datum_t textregexreplace(PG_FUNC_ARGS) { text *s = ARG_TEXT_PP(0); text *p = ARG_TEXT_PP(1); text *r = ARG_TEXT_PP(2); text *opt = ARG_TEXT_PP(3); regex_t *re; pg_re_flags flags; parse_re_flags(&flags, opt); re = RE_compile_and_cache(p, flags.cflags, PG_COLLATION()); RET_TEXT_P(replace_text_regexp(s, (void *)re, r, flags.glob)); }
/* * textregexreplace() * Return a string matched by a regular expression, with replacement. */ Datum textregexreplace(PG_FUNCTION_ARGS) { text *s = PG_GETARG_TEXT_PP(0); text *p = PG_GETARG_TEXT_PP(1); text *r = PG_GETARG_TEXT_PP(2); text *opt = PG_GETARG_TEXT_PP(3); regex_t *re; pg_re_flags flags; parse_re_flags(&flags, opt); re = RE_compile_and_cache(p, flags.cflags); PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, flags.glob)); }
/* * setup_regexp_matches --- do the initial matching for regexp_matches() * or regexp_split() * * To avoid having to re-find the compiled pattern on each call, we do * all the matching in one swoop. The returned regexp_matches_ctx contains * the locations of all the substrings matching the pattern. * * The three bool parameters have only two patterns (one for each caller) * but it seems clearer to distinguish the functionality this way than to * key it all off one "is_split" flag. */ static regexp_matches_ctx * setup_regexp_matches(text *orig_str, text *pattern, text *flags, Oid collation, bool force_glob, bool use_subpatterns, bool ignore_degenerate) { regexp_matches_ctx *matchctx = palloc0(sizeof(regexp_matches_ctx)); int orig_len; pg_wchar *wide_str; int wide_len; pg_re_flags re_flags; regex_t *cpattern; regmatch_t *pmatch; int pmatch_len; int array_len; int array_idx; int prev_match_end; int start_search; /* save original string --- we'll extract result substrings from it */ matchctx->orig_str = orig_str; /* convert string to pg_wchar form for matching */ orig_len = VARSIZE_ANY_EXHDR(orig_str); wide_str = (pg_wchar *) palloc(sizeof(pg_wchar) * (orig_len + 1)); wide_len = pg_mb2wchar_with_len(VARDATA_ANY(orig_str), wide_str, orig_len); /* determine options */ parse_re_flags(&re_flags, flags); if (force_glob) { /* user mustn't specify 'g' for regexp_split */ if (re_flags.glob) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("regexp_split does not support the global option"))); /* but we find all the matches anyway */ re_flags.glob = true; } /* set up the compiled pattern */ cpattern = RE_compile_and_cache(pattern, re_flags.cflags, collation); /* do we want to remember subpatterns? */ if (use_subpatterns && cpattern->re_nsub > 0) { matchctx->npatterns = cpattern->re_nsub; pmatch_len = cpattern->re_nsub + 1; } else { use_subpatterns = false; matchctx->npatterns = 1; pmatch_len = 1; } /* temporary output space for RE package */ pmatch = palloc(sizeof(regmatch_t) * pmatch_len); /* the real output space (grown dynamically if needed) */ array_len = re_flags.glob ? 256 : 32; matchctx->match_locs = (int *) palloc(sizeof(int) * array_len); array_idx = 0; /* search for the pattern, perhaps repeatedly */ prev_match_end = 0; start_search = 0; while (RE_wchar_execute(cpattern, wide_str, wide_len, start_search, pmatch_len, pmatch)) { /* * If requested, ignore degenerate matches, which are zero-length * matches occurring at the start or end of a string or just after a * previous match. */ if (!ignore_degenerate || (pmatch[0].rm_so < wide_len && pmatch[0].rm_eo > prev_match_end)) { /* enlarge output space if needed */ while (array_idx + matchctx->npatterns * 2 > array_len) { array_len *= 2; matchctx->match_locs = (int *) repalloc(matchctx->match_locs, sizeof(int) * array_len); } /* save this match's locations */ if (use_subpatterns) { int i; for (i = 1; i <= matchctx->npatterns; i++) { matchctx->match_locs[array_idx++] = pmatch[i].rm_so; matchctx->match_locs[array_idx++] = pmatch[i].rm_eo; } } else { matchctx->match_locs[array_idx++] = pmatch[0].rm_so; matchctx->match_locs[array_idx++] = pmatch[0].rm_eo; } matchctx->nmatches++; } prev_match_end = pmatch[0].rm_eo; /* if not glob, stop after one match */ if (!re_flags.glob) break; /* * Advance search position. Normally we start the next search at the * end of the previous match; but if the match was of zero length, we * have to advance by one character, or we'd just find the same match * again. */ start_search = prev_match_end; if (pmatch[0].rm_so == pmatch[0].rm_eo) start_search++; if (start_search > wide_len) break; } /* Clean up temp storage */ pfree(wide_str); pfree(pmatch); return matchctx; }
/* * setup_regexp_matches --- do the initial matching for regexp_matches() * or regexp_split() * * To avoid having to re-find the compiled pattern on each call, we do * all the matching in one swoop. The returned regexp_matches_ctx contains * the locations of all the substrings matching the pattern. * * The four bool parameters have only two patterns (one for matching, one for * splitting) but it seems clearer to distinguish the functionality this way * than to key it all off one "is_split" flag. We don't currently assume that * fetching_unmatched is exclusive of fetching the matched text too; if it's * set, the conversion buffer is large enough to fetch any single matched or * unmatched string, but not any larger substring. (In practice, when splitting * the matches are usually small anyway, and it didn't seem worth complicating * the code further.) */ static regexp_matches_ctx * setup_regexp_matches(text *orig_str, text *pattern, text *flags, Oid collation, bool force_glob, bool use_subpatterns, bool ignore_degenerate, bool fetching_unmatched) { regexp_matches_ctx *matchctx = palloc0(sizeof(regexp_matches_ctx)); int eml = pg_database_encoding_max_length(); int orig_len; pg_wchar *wide_str; int wide_len; pg_re_flags re_flags; regex_t *cpattern; regmatch_t *pmatch; int pmatch_len; int array_len; int array_idx; int prev_match_end; int prev_valid_match_end; int start_search; int maxlen = 0; /* largest fetch length in characters */ /* save original string --- we'll extract result substrings from it */ matchctx->orig_str = orig_str; /* convert string to pg_wchar form for matching */ orig_len = VARSIZE_ANY_EXHDR(orig_str); wide_str = (pg_wchar *) palloc(sizeof(pg_wchar) * (orig_len + 1)); wide_len = pg_mb2wchar_with_len(VARDATA_ANY(orig_str), wide_str, orig_len); /* determine options */ parse_re_flags(&re_flags, flags); if (force_glob) { /* user mustn't specify 'g' for regexp_split */ if (re_flags.glob) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("regexp_split does not support the global option"))); /* but we find all the matches anyway */ re_flags.glob = true; } /* set up the compiled pattern */ cpattern = RE_compile_and_cache(pattern, re_flags.cflags, collation); /* do we want to remember subpatterns? */ if (use_subpatterns && cpattern->re_nsub > 0) { matchctx->npatterns = cpattern->re_nsub; pmatch_len = cpattern->re_nsub + 1; } else { use_subpatterns = false; matchctx->npatterns = 1; pmatch_len = 1; } /* temporary output space for RE package */ pmatch = palloc(sizeof(regmatch_t) * pmatch_len); /* * the real output space (grown dynamically if needed) * * use values 2^n-1, not 2^n, so that we hit the limit at 2^28-1 rather * than at 2^27 */ array_len = re_flags.glob ? 255 : 31; matchctx->match_locs = (int *) palloc(sizeof(int) * array_len); array_idx = 0; /* search for the pattern, perhaps repeatedly */ prev_match_end = 0; prev_valid_match_end = 0; start_search = 0; while (RE_wchar_execute(cpattern, wide_str, wide_len, start_search, pmatch_len, pmatch)) { /* * If requested, ignore degenerate matches, which are zero-length * matches occurring at the start or end of a string or just after a * previous match. */ if (!ignore_degenerate || (pmatch[0].rm_so < wide_len && pmatch[0].rm_eo > prev_match_end)) { /* enlarge output space if needed */ while (array_idx + matchctx->npatterns * 2 + 1 > array_len) { array_len += array_len + 1; /* 2^n-1 => 2^(n+1)-1 */ if (array_len > MaxAllocSize/sizeof(int)) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("too many regular expression matches"))); matchctx->match_locs = (int *) repalloc(matchctx->match_locs, sizeof(int) * array_len); } /* save this match's locations */ if (use_subpatterns) { int i; for (i = 1; i <= matchctx->npatterns; i++) { int so = pmatch[i].rm_so; int eo = pmatch[i].rm_eo; matchctx->match_locs[array_idx++] = so; matchctx->match_locs[array_idx++] = eo; if (so >= 0 && eo >= 0 && (eo - so) > maxlen) maxlen = (eo - so); } } else { int so = pmatch[0].rm_so; int eo = pmatch[0].rm_eo; matchctx->match_locs[array_idx++] = so; matchctx->match_locs[array_idx++] = eo; if (so >= 0 && eo >= 0 && (eo - so) > maxlen) maxlen = (eo - so); } matchctx->nmatches++; /* * check length of unmatched portion between end of previous valid * (nondegenerate, or degenerate but not ignored) match and start * of current one */ if (fetching_unmatched && pmatch[0].rm_so >= 0 && (pmatch[0].rm_so - prev_valid_match_end) > maxlen) maxlen = (pmatch[0].rm_so - prev_valid_match_end); prev_valid_match_end = pmatch[0].rm_eo; } prev_match_end = pmatch[0].rm_eo; /* if not glob, stop after one match */ if (!re_flags.glob) break; /* * Advance search position. Normally we start the next search at the * end of the previous match; but if the match was of zero length, we * have to advance by one character, or we'd just find the same match * again. */ start_search = prev_match_end; if (pmatch[0].rm_so == pmatch[0].rm_eo) start_search++; if (start_search > wide_len) break; } /* * check length of unmatched portion between end of last match and end of * input string */ if (fetching_unmatched && (wide_len - prev_valid_match_end) > maxlen) maxlen = (wide_len - prev_valid_match_end); /* * Keep a note of the end position of the string for the benefit of * splitting code. */ matchctx->match_locs[array_idx] = wide_len; if (eml > 1) { int64 maxsiz = eml * (int64) maxlen; int conv_bufsiz; /* * Make the conversion buffer large enough for any substring of * interest. * * Worst case: assume we need the maximum size (maxlen*eml), but take * advantage of the fact that the original string length in bytes is an * upper bound on the byte length of any fetched substring (and we know * that len+1 is safe to allocate because the varlena header is longer * than 1 byte). */ if (maxsiz > orig_len) conv_bufsiz = orig_len + 1; else conv_bufsiz = maxsiz + 1; /* safe since maxsiz < 2^30 */ matchctx->conv_buf = palloc(conv_bufsiz); matchctx->conv_bufsiz = conv_bufsiz; matchctx->wide_str = wide_str; } else { /* No need to keep the wide string if we're in a single-byte charset. */ pfree(wide_str); matchctx->wide_str = NULL; matchctx->conv_buf = NULL; matchctx->conv_bufsiz = 0; } /* Clean up temp storage */ pfree(pmatch); return matchctx; }