/* * RE_execute - execute a RE * * Returns TRUE on match, FALSE on no match * * re --- the compiled pattern as returned by RE_compile_and_cache * dat --- the data to match against (need not be null-terminated) * dat_len --- the length of the data string * nmatch, pmatch --- optional return area for match details * * Data is given in the database encoding. We internally * convert to array of pg_wchar which is what Spencer's regex package wants. */ static bool RE_execute(regex_t* re, char *dat, int dat_len, int nmatch, regmatch_t* pmatch) { pg_wchar *data; int data_len; bool match; /* Convert data string to wide characters */ data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar)); data_len = pg_mb2wchar_with_len(dat, data, dat_len); /* Perform RE match and return result */ match = RE_wchar_execute(re, data, data_len, 0, nmatch, pmatch); pfree(data); return match; }
/* * RE_compile_and_execute - compile and execute a RE * * Returns TRUE on match, FALSE on no match * * text_re --- the pattern, expressed as an *untoasted* TEXT object * dat --- the data to match against (need not be null-terminated) * dat_len --- the length of the data string * cflags --- compile options for the pattern * nmatch, pmatch --- optional return area for match details * * Both pattern and data are given in the database encoding. We internally * convert to array of pg_wchar which is what Spencer's regex package wants. */ static bool RE_compile_and_execute(text *text_re, char *dat, int dat_len, int cflags, int nmatch, regmatch_t *pmatch) { pg_wchar *data; size_t data_len; int regexec_result; regex_t *re; char errMsg[100]; /* Convert data string to wide characters */ data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar)); data_len = pg_mb2wchar_with_len(dat, data, dat_len); /* Compile RE */ re = RE_compile_and_cache(text_re, cflags); /* Perform RE match and return result */ regexec_result = pg_regexec(re, data, data_len, 0, NULL, /* no details */ nmatch, pmatch, 0); pfree(data); if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH) { /* re failed??? */ pg_regerror(regexec_result, re, errMsg, sizeof(errMsg)); ereport(ERROR, (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION), errmsg("regular expression failed: %s", errMsg))); } return (regexec_result == REG_OKAY); }
size_t char2wchar(wchar_t *to, const char *from, size_t len) { if (len == 0) return 0; #ifdef WIN32 if (GetDatabaseEncoding() == PG_UTF8) { int r; r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len); if (!r) { pg_verifymbstr(from, strlen(from), false); ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), errmsg("invalid multibyte character for locale"), errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding."))); } Assert(r <= len); return r; } else #endif /* WIN32 */ if ( lc_ctype_is_c() ) { /* * pg_mb2wchar_with_len always adds trailing '\0', so * 'to' should be allocated with sufficient space */ return pg_mb2wchar_with_len(from, (pg_wchar *)to, len); } return mbstowcs(to, from, len); }
/* * RE_compile_and_cache - compile a RE, caching if possible * * Returns regex_t * * * text_re --- the pattern, expressed as an *untoasted* TEXT object * cflags --- compile options for the pattern * * Pattern is given in the database encoding. We internally convert to * array of pg_wchar which is what Spencer's regex package wants. */ static regex_t * RE_compile_and_cache(text *text_re, int cflags) { int text_re_len = VARSIZE(text_re); pg_wchar *pattern; size_t pattern_len; int i; int regcomp_result; cached_re_str re_temp; char errMsg[100]; /* * Look for a match among previously compiled REs. Since the data * structure is self-organizing with most-used entries at the front, our * search strategy can just be to scan from the front. */ for (i = 0; i < num_res; i++) { if (VARSIZE(re_array[i].cre_pat) == text_re_len && memcmp(re_array[i].cre_pat, text_re, text_re_len) == 0 && re_array[i].cre_flags == cflags) { /* * Found a match; move it to front if not there already. */ if (i > 0) { re_temp = re_array[i]; memmove(&re_array[1], &re_array[0], i * sizeof(cached_re_str)); re_array[0] = re_temp; } return &re_array[0].cre_re; } } /* * Couldn't find it, so try to compile the new RE. To avoid leaking * resources on failure, we build into the re_temp local. */ /* Convert pattern string to wide characters */ pattern = (pg_wchar *) palloc((text_re_len - VARHDRSZ + 1) * sizeof(pg_wchar)); pattern_len = pg_mb2wchar_with_len(VARDATA(text_re), pattern, text_re_len - VARHDRSZ); regcomp_result = pg_regcomp(&re_temp.cre_re, pattern, pattern_len, cflags); pfree(pattern); if (regcomp_result != REG_OKAY) { /* re didn't compile */ pg_regerror(regcomp_result, &re_temp.cre_re, errMsg, sizeof(errMsg)); /* XXX should we pg_regfree here? */ ereport(ERROR, (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION), errmsg("invalid regular expression: %s", errMsg))); } /* * use malloc/free for the cre_pat field because the storage has to * persist across transactions */ re_temp.cre_pat = malloc(text_re_len); if (re_temp.cre_pat == NULL) { pg_regfree(&re_temp.cre_re); ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"))); } memcpy(re_temp.cre_pat, text_re, text_re_len); re_temp.cre_flags = cflags; /* * Okay, we have a valid new item in re_temp; insert it into the storage * array. Discard last entry if needed. */ if (num_res >= MAX_CACHED_RES) { --num_res; Assert(num_res < MAX_CACHED_RES); pg_regfree(&re_array[num_res].cre_re); free(re_array[num_res].cre_pat); } if (num_res > 0) memmove(&re_array[1], &re_array[0], num_res * sizeof(cached_re_str)); re_array[0] = re_temp; num_res++; return &re_array[0].cre_re; }
/* * setup_regexp_matches --- do the initial matching for regexp_matches() * or regexp_split() * * To avoid having to re-find the compiled pattern on each call, we do * all the matching in one swoop. The returned regexp_matches_ctx contains * the locations of all the substrings matching the pattern. * * The three bool parameters have only two patterns (one for each caller) * but it seems clearer to distinguish the functionality this way than to * key it all off one "is_split" flag. */ static regexp_matches_ctx * setup_regexp_matches(text *orig_str, text *pattern, text *flags, Oid collation, bool force_glob, bool use_subpatterns, bool ignore_degenerate) { regexp_matches_ctx *matchctx = palloc0(sizeof(regexp_matches_ctx)); int orig_len; pg_wchar *wide_str; int wide_len; pg_re_flags re_flags; regex_t *cpattern; regmatch_t *pmatch; int pmatch_len; int array_len; int array_idx; int prev_match_end; int start_search; /* save original string --- we'll extract result substrings from it */ matchctx->orig_str = orig_str; /* convert string to pg_wchar form for matching */ orig_len = VARSIZE_ANY_EXHDR(orig_str); wide_str = (pg_wchar *) palloc(sizeof(pg_wchar) * (orig_len + 1)); wide_len = pg_mb2wchar_with_len(VARDATA_ANY(orig_str), wide_str, orig_len); /* determine options */ parse_re_flags(&re_flags, flags); if (force_glob) { /* user mustn't specify 'g' for regexp_split */ if (re_flags.glob) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("regexp_split does not support the global option"))); /* but we find all the matches anyway */ re_flags.glob = true; } /* set up the compiled pattern */ cpattern = RE_compile_and_cache(pattern, re_flags.cflags, collation); /* do we want to remember subpatterns? */ if (use_subpatterns && cpattern->re_nsub > 0) { matchctx->npatterns = cpattern->re_nsub; pmatch_len = cpattern->re_nsub + 1; } else { use_subpatterns = false; matchctx->npatterns = 1; pmatch_len = 1; } /* temporary output space for RE package */ pmatch = palloc(sizeof(regmatch_t) * pmatch_len); /* the real output space (grown dynamically if needed) */ array_len = re_flags.glob ? 256 : 32; matchctx->match_locs = (int *) palloc(sizeof(int) * array_len); array_idx = 0; /* search for the pattern, perhaps repeatedly */ prev_match_end = 0; start_search = 0; while (RE_wchar_execute(cpattern, wide_str, wide_len, start_search, pmatch_len, pmatch)) { /* * If requested, ignore degenerate matches, which are zero-length * matches occurring at the start or end of a string or just after a * previous match. */ if (!ignore_degenerate || (pmatch[0].rm_so < wide_len && pmatch[0].rm_eo > prev_match_end)) { /* enlarge output space if needed */ while (array_idx + matchctx->npatterns * 2 > array_len) { array_len *= 2; matchctx->match_locs = (int *) repalloc(matchctx->match_locs, sizeof(int) * array_len); } /* save this match's locations */ if (use_subpatterns) { int i; for (i = 1; i <= matchctx->npatterns; i++) { matchctx->match_locs[array_idx++] = pmatch[i].rm_so; matchctx->match_locs[array_idx++] = pmatch[i].rm_eo; } } else { matchctx->match_locs[array_idx++] = pmatch[0].rm_so; matchctx->match_locs[array_idx++] = pmatch[0].rm_eo; } matchctx->nmatches++; } prev_match_end = pmatch[0].rm_eo; /* if not glob, stop after one match */ if (!re_flags.glob) break; /* * Advance search position. Normally we start the next search at the * end of the previous match; but if the match was of zero length, we * have to advance by one character, or we'd just find the same match * again. */ start_search = prev_match_end; if (pmatch[0].rm_so == pmatch[0].rm_eo) start_search++; if (start_search > wide_len) break; } /* Clean up temp storage */ pfree(wide_str); pfree(pmatch); return matchctx; }
/* * RE_compile_and_cache - compile a RE, caching if possible * * Returns regex_t * * * text_re --- the pattern, expressed as a TEXT object * cflags --- compile options for the pattern * collation --- collation to use for LC_CTYPE-dependent behavior * * Pattern is given in the database encoding. We internally convert to * an array of pg_wchar, which is what Spencer's regex package wants. */ static regex_t * RE_compile_and_cache(text *text_re, int cflags, Oid collation) { int text_re_len = VARSIZE_ANY_EXHDR(text_re); char *text_re_val = VARDATA_ANY(text_re); pg_wchar *pattern; int pattern_len; int i; int regcomp_result; cached_re_str re_temp; char errMsg[100]; /* * Look for a match among previously compiled REs. Since the data * structure is self-organizing with most-used entries at the front, our * search strategy can just be to scan from the front. */ for (i = 0; i < num_res; i++) { if (re_array[i].cre_pat_len == text_re_len && re_array[i].cre_flags == cflags && re_array[i].cre_collation == collation && memcmp(re_array[i].cre_pat, text_re_val, text_re_len) == 0) { /* * Found a match; move it to front if not there already. */ if (i > 0) { re_temp = re_array[i]; memmove(&re_array[1], &re_array[0], i * sizeof(cached_re_str)); re_array[0] = re_temp; } return &re_array[0].cre_re; } } /* * Couldn't find it, so try to compile the new RE. To avoid leaking * resources on failure, we build into the re_temp local. */ /* Convert pattern string to wide characters */ pattern = (pg_wchar *) palloc((text_re_len + 1) * sizeof(pg_wchar)); pattern_len = pg_mb2wchar_with_len(text_re_val, pattern, text_re_len); regcomp_result = pg_regcomp(&re_temp.cre_re, pattern, pattern_len, cflags, collation); pfree(pattern); if (regcomp_result != REG_OKAY) { /* re didn't compile (no need for pg_regfree, if so) */ /* * Here and in other places in this file, do CHECK_FOR_INTERRUPTS * before reporting a regex error. This is so that if the regex * library aborts and returns REG_CANCEL, we don't print an error * message that implies the regex was invalid. */ CHECK_FOR_INTERRUPTS(); pg_regerror(regcomp_result, &re_temp.cre_re, errMsg, sizeof(errMsg)); ereport(ERROR, (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION), errmsg("invalid regular expression: %s", errMsg))); } /* * We use malloc/free for the cre_pat field because the storage has to * persist across transactions, and because we want to get control back on * out-of-memory. The Max() is because some malloc implementations return * NULL for malloc(0). */ re_temp.cre_pat = malloc(Max(text_re_len, 1)); if (re_temp.cre_pat == NULL) { pg_regfree(&re_temp.cre_re); ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"))); } memcpy(re_temp.cre_pat, text_re_val, text_re_len); re_temp.cre_pat_len = text_re_len; re_temp.cre_flags = cflags; re_temp.cre_collation = collation; /* * Okay, we have a valid new item in re_temp; insert it into the storage * array. Discard last entry if needed. */ if (num_res >= MAX_CACHED_RES) { --num_res; Assert(num_res < MAX_CACHED_RES); pg_regfree(&re_array[num_res].cre_re); free(re_array[num_res].cre_pat); } if (num_res > 0) memmove(&re_array[1], &re_array[0], num_res * sizeof(cached_re_str)); re_array[0] = re_temp; num_res++; return &re_array[0].cre_re; }
/* * setup_regexp_matches --- do the initial matching for regexp_matches() * or regexp_split() * * To avoid having to re-find the compiled pattern on each call, we do * all the matching in one swoop. The returned regexp_matches_ctx contains * the locations of all the substrings matching the pattern. * * The four bool parameters have only two patterns (one for matching, one for * splitting) but it seems clearer to distinguish the functionality this way * than to key it all off one "is_split" flag. We don't currently assume that * fetching_unmatched is exclusive of fetching the matched text too; if it's * set, the conversion buffer is large enough to fetch any single matched or * unmatched string, but not any larger substring. (In practice, when splitting * the matches are usually small anyway, and it didn't seem worth complicating * the code further.) */ static regexp_matches_ctx * setup_regexp_matches(text *orig_str, text *pattern, text *flags, Oid collation, bool force_glob, bool use_subpatterns, bool ignore_degenerate, bool fetching_unmatched) { regexp_matches_ctx *matchctx = palloc0(sizeof(regexp_matches_ctx)); int eml = pg_database_encoding_max_length(); int orig_len; pg_wchar *wide_str; int wide_len; pg_re_flags re_flags; regex_t *cpattern; regmatch_t *pmatch; int pmatch_len; int array_len; int array_idx; int prev_match_end; int prev_valid_match_end; int start_search; int maxlen = 0; /* largest fetch length in characters */ /* save original string --- we'll extract result substrings from it */ matchctx->orig_str = orig_str; /* convert string to pg_wchar form for matching */ orig_len = VARSIZE_ANY_EXHDR(orig_str); wide_str = (pg_wchar *) palloc(sizeof(pg_wchar) * (orig_len + 1)); wide_len = pg_mb2wchar_with_len(VARDATA_ANY(orig_str), wide_str, orig_len); /* determine options */ parse_re_flags(&re_flags, flags); if (force_glob) { /* user mustn't specify 'g' for regexp_split */ if (re_flags.glob) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("regexp_split does not support the global option"))); /* but we find all the matches anyway */ re_flags.glob = true; } /* set up the compiled pattern */ cpattern = RE_compile_and_cache(pattern, re_flags.cflags, collation); /* do we want to remember subpatterns? */ if (use_subpatterns && cpattern->re_nsub > 0) { matchctx->npatterns = cpattern->re_nsub; pmatch_len = cpattern->re_nsub + 1; } else { use_subpatterns = false; matchctx->npatterns = 1; pmatch_len = 1; } /* temporary output space for RE package */ pmatch = palloc(sizeof(regmatch_t) * pmatch_len); /* * the real output space (grown dynamically if needed) * * use values 2^n-1, not 2^n, so that we hit the limit at 2^28-1 rather * than at 2^27 */ array_len = re_flags.glob ? 255 : 31; matchctx->match_locs = (int *) palloc(sizeof(int) * array_len); array_idx = 0; /* search for the pattern, perhaps repeatedly */ prev_match_end = 0; prev_valid_match_end = 0; start_search = 0; while (RE_wchar_execute(cpattern, wide_str, wide_len, start_search, pmatch_len, pmatch)) { /* * If requested, ignore degenerate matches, which are zero-length * matches occurring at the start or end of a string or just after a * previous match. */ if (!ignore_degenerate || (pmatch[0].rm_so < wide_len && pmatch[0].rm_eo > prev_match_end)) { /* enlarge output space if needed */ while (array_idx + matchctx->npatterns * 2 + 1 > array_len) { array_len += array_len + 1; /* 2^n-1 => 2^(n+1)-1 */ if (array_len > MaxAllocSize/sizeof(int)) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("too many regular expression matches"))); matchctx->match_locs = (int *) repalloc(matchctx->match_locs, sizeof(int) * array_len); } /* save this match's locations */ if (use_subpatterns) { int i; for (i = 1; i <= matchctx->npatterns; i++) { int so = pmatch[i].rm_so; int eo = pmatch[i].rm_eo; matchctx->match_locs[array_idx++] = so; matchctx->match_locs[array_idx++] = eo; if (so >= 0 && eo >= 0 && (eo - so) > maxlen) maxlen = (eo - so); } } else { int so = pmatch[0].rm_so; int eo = pmatch[0].rm_eo; matchctx->match_locs[array_idx++] = so; matchctx->match_locs[array_idx++] = eo; if (so >= 0 && eo >= 0 && (eo - so) > maxlen) maxlen = (eo - so); } matchctx->nmatches++; /* * check length of unmatched portion between end of previous valid * (nondegenerate, or degenerate but not ignored) match and start * of current one */ if (fetching_unmatched && pmatch[0].rm_so >= 0 && (pmatch[0].rm_so - prev_valid_match_end) > maxlen) maxlen = (pmatch[0].rm_so - prev_valid_match_end); prev_valid_match_end = pmatch[0].rm_eo; } prev_match_end = pmatch[0].rm_eo; /* if not glob, stop after one match */ if (!re_flags.glob) break; /* * Advance search position. Normally we start the next search at the * end of the previous match; but if the match was of zero length, we * have to advance by one character, or we'd just find the same match * again. */ start_search = prev_match_end; if (pmatch[0].rm_so == pmatch[0].rm_eo) start_search++; if (start_search > wide_len) break; } /* * check length of unmatched portion between end of last match and end of * input string */ if (fetching_unmatched && (wide_len - prev_valid_match_end) > maxlen) maxlen = (wide_len - prev_valid_match_end); /* * Keep a note of the end position of the string for the benefit of * splitting code. */ matchctx->match_locs[array_idx] = wide_len; if (eml > 1) { int64 maxsiz = eml * (int64) maxlen; int conv_bufsiz; /* * Make the conversion buffer large enough for any substring of * interest. * * Worst case: assume we need the maximum size (maxlen*eml), but take * advantage of the fact that the original string length in bytes is an * upper bound on the byte length of any fetched substring (and we know * that len+1 is safe to allocate because the varlena header is longer * than 1 byte). */ if (maxsiz > orig_len) conv_bufsiz = orig_len + 1; else conv_bufsiz = maxsiz + 1; /* safe since maxsiz < 2^30 */ matchctx->conv_buf = palloc(conv_bufsiz); matchctx->conv_bufsiz = conv_bufsiz; matchctx->wide_str = wide_str; } else { /* No need to keep the wide string if we're in a single-byte charset. */ pfree(wide_str); matchctx->wide_str = NULL; matchctx->conv_buf = NULL; matchctx->conv_bufsiz = 0; } /* Clean up temp storage */ pfree(pmatch); return matchctx; }