/*
 * parser_errposition
 *		Attach a cursor position to an error raised during parse analysis.
 *
 * Meant to be invoked from inside an ereport() call; the return value is a
 * dummy and carries no information for the caller.
 *
 * "location" is a byte offset into pstate->p_sourcetext, which is how raw
 * parse trees record token positions.  Clients expect a 1-based character
 * index instead, so the conversion is done here, on the error path only;
 * doing it for every token up front would cost far more than remembering
 * byte offsets.
 */
int
parser_errposition(ParseState *pstate, int location)
{
	const char *source;

	/* No location supplied, or no parse state to look the text up in */
	if (location < 0 || pstate == NULL)
		return 0;

	/* Without the source text the offset cannot be converted */
	source = pstate->p_sourcetext;
	if (source == NULL)
		return 0;

	/*
	 * Count the characters that precede the byte offset, add one to make the
	 * index 1-based, and hand the result to the ereport machinery.
	 */
	return errposition(pg_mbstrlen_with_len(source, location) + 1);
}
/*
 * ora_instr_mb
 *		Multibyte-aware search for the nth occurrence of pattern in txt.
 *
 * txt		- text to search in
 * pattern	- text to search for
 * start	- character position to begin at.  When positive, it is a 1-based
 *			  position and the scan runs forward; otherwise the scan runs
 *			  backward, starting "start" characters relative to the end of
 *			  txt (but never so far right that the pattern could not fit).
 * nth		- which match to report, counted in scan order
 *
 * Returns the 1-based character position of the requested match, or 0 if the
 * start position is out of range or fewer than nth matches exist.
 *
 * ora_mb_strlen() is defined elsewhere; from the way its output is used
 * here, pos_txt[i] is taken to be the byte offset of character i within the
 * data of txt -- TODO confirm against that helper.
 */
static int
ora_instr_mb(text *txt, text *pattern, int start, int nth)
{
	int			c_len_txt,
				c_len_pat;
	int			b_len_txt,
				b_len_pat;
	int		   *pos_txt;
	const char *str_txt,
			   *str_pat;
	int			beg,
				end,
				i,
				dx;

	str_txt = VARDATA_ANY(txt);
	b_len_txt = VARSIZE_ANY_EXHDR(txt);
	c_len_txt = ora_mb_strlen(txt, NULL, &pos_txt);

	str_pat = VARDATA_ANY(pattern);
	b_len_pat = VARSIZE_ANY_EXHDR(pattern);
	c_len_pat = pg_mbstrlen_with_len(str_pat, b_len_pat);

	if (start > 0)
	{
		/* Forward scan: candidates are [start - 1, c_len_txt - c_len_pat] */
		dx = 1;
		beg = start - 1;
		end = c_len_txt - c_len_pat + 1;
		if (beg >= end)
			return 0;			/* out of range */
	}
	else
	{
		/* Backward scan down to character 0; "end" is one step past it */
		dx = -1;
		beg = Min(c_len_txt + start, c_len_txt - c_len_pat);
		end = -1;
		if (beg <= end)
			return 0;			/* out of range */
	}

	for (i = beg; i != end; i += dx)
	{
		/*
		 * The range above is bounded in characters, not bytes.  If the
		 * pattern's characters are wider than the text's remaining ones, the
		 * pattern's bytes would extend past the end of txt; such a position
		 * cannot match, and comparing there would read beyond the datum.
		 */
		if (pos_txt[i] + b_len_pat > b_len_txt)
			continue;

		if (memcmp(str_txt + pos_txt[i], str_pat, b_len_pat) == 0)
		{
			if (--nth == 0)
				return i + 1;
		}
	}

	return 0;
}
levenshtein_internal(text *s, text *t,
					 int ins_c, int del_c, int sub_c)
#endif
{
	/*
	 * Computes the weighted edit distance needed to turn s into t, where
	 * ins_c, del_c and sub_c are the costs of one insertion, deletion and
	 * substitution.  Only two rows of the notional (m+1) x (n+1) matrix are
	 * kept at a time (prev and curr); the result is read from the last cell
	 * of the final row.
	 *
	 * When compiled with LEVENSHTEIN_LESS_EQUAL, the computation is bounded
	 * by max_d: only the columns between start_column and stop_column are
	 * evaluated, and max_d + 1 is returned as soon as the bound is known to
	 * be exceeded.
	 *
	 * NOTE(review): the beginning of this definition (return type and the
	 * conditional that the #endif above closes) is not part of this excerpt.
	 * max_d is not declared in the visible code; presumably it is a
	 * parameter of the alternative signature used under
	 * LEVENSHTEIN_LESS_EQUAL -- confirm against the full file.
	 */
	int			m, n, s_bytes, t_bytes;
	int		   *prev;
	int		   *curr;
	int		   *s_char_len = NULL;
	int			i, j;
	const char *s_data;
	const char *t_data;
	const char *y;

	/*
	 * For levenshtein_less_equal_internal, we have real variables called
	 * start_column and stop_column; otherwise it's just short-hand for 0 and
	 * m.
	 */
#ifdef LEVENSHTEIN_LESS_EQUAL
	int			start_column, stop_column;

#undef START_COLUMN
#undef STOP_COLUMN
#define START_COLUMN start_column
#define STOP_COLUMN stop_column
#else
#undef START_COLUMN
#undef STOP_COLUMN
#define START_COLUMN 0
#define STOP_COLUMN m
#endif

	/* Extract a pointer to the actual character data. */
	s_data = VARDATA_ANY(s);
	t_data = VARDATA_ANY(t);

	/* Determine length of each string in bytes and characters. */
	s_bytes = VARSIZE_ANY_EXHDR(s);
	t_bytes = VARSIZE_ANY_EXHDR(t);
	m = pg_mbstrlen_with_len(s_data, s_bytes);
	n = pg_mbstrlen_with_len(t_data, t_bytes);

	/*
	 * We can transform an empty s into t with n insertions, or a non-empty t
	 * into an empty s with m deletions.
	 */
	if (!m)
		return n * ins_c;
	if (!n)
		return m * del_c;

	/*
	 * For security concerns, restrict excessive CPU+RAM usage. (This
	 * implementation uses O(m) memory and has O(mn) complexity.)
	 *
	 * NOTE(review): m and n are character counts at this point (they come
	 * from pg_mbstrlen_with_len), yet the message below speaks of "bytes" --
	 * confirm which unit is intended.
	 */
	if (m > MAX_LEVENSHTEIN_STRLEN ||
		n > MAX_LEVENSHTEIN_STRLEN)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("argument exceeds the maximum length of %d bytes",
						MAX_LEVENSHTEIN_STRLEN)));

#ifdef LEVENSHTEIN_LESS_EQUAL
	/* Initialize start and stop columns. */
	start_column = 0;
	stop_column = m + 1;

	/*
	 * If max_d >= 0, determine whether the bound is impossibly tight.  If so,
	 * return max_d + 1 immediately.  Otherwise, determine whether it's tight
	 * enough to limit the computation we must perform.  If so, figure out
	 * initial stop column.
	 */
	if (max_d >= 0)
	{
		int			min_theo_d; /* Theoretical minimum distance. */
		int			max_theo_d; /* Theoretical maximum distance. */
		int			net_inserts = n - m;

		min_theo_d = net_inserts < 0 ?
			-net_inserts * del_c : net_inserts * ins_c;
		if (min_theo_d > max_d)
			return max_d + 1;
		/* A substitution never costs more than a delete plus an insert. */
		if (ins_c + del_c < sub_c)
			sub_c = ins_c + del_c;
		max_theo_d = min_theo_d + sub_c * Min(m, n);
		/* The bound cannot be exceeded, so run as if there were no bound. */
		if (max_d >= max_theo_d)
			max_d = -1;
		else if (ins_c + del_c > 0)
		{
			/*
			 * Figure out how much of the first row of the notional matrix we
			 * need to fill in.  If the string is growing, the theoretical
			 * minimum distance already incorporates the cost of deleting the
			 * number of characters necessary to make the two strings equal in
			 * length.  Each additional deletion forces another insertion, so
			 * the best-case total cost increases by ins_c + del_c. If the
			 * string is shrinking, the minimum theoretical cost assumes no
			 * excess deletions; that is, we're starting no further right than
			 * column n - m.  If we do start further right, the best-case
			 * total cost increases by ins_c + del_c for each move right.
			 */
			int			slack_d = max_d - min_theo_d;
			int			best_column = net_inserts < 0 ? -net_inserts : 0;

			stop_column = best_column + (slack_d / (ins_c + del_c)) + 1;
			if (stop_column > m)
				stop_column = m + 1;
		}
	}
#endif

	/*
	 * In order to avoid calling pg_mblen() repeatedly on each character in s,
	 * we cache all the lengths before starting the main loop -- but if all
	 * the characters in both strings are single byte, then we skip this and
	 * use a fast-path in the main loop.  If only one string contains
	 * multi-byte characters, we still build the array, so that the fast-path
	 * needn't deal with the case where the array hasn't been initialized.
	 */
	if (m != s_bytes || n != t_bytes)
	{
		/* This "i" deliberately-or-not shadows the function-level one. */
		int			i;
		const char *cp = s_data;

		s_char_len = (int *) palloc((m + 1) * sizeof(int));
		for (i = 0; i < m; ++i)
		{
			s_char_len[i] = pg_mblen(cp);
			cp += s_char_len[i];
		}
		/* Terminating entry: no character follows the last one. */
		s_char_len[i] = 0;
	}

	/* One more cell for initialization column and row. */
	++m;
	++n;

	/*
	 * Previous and current rows of notional array.  Both live in a single
	 * allocation of 2 * m ints; curr is simply the second half.
	 */
	prev = (int *) palloc(2 * m * sizeof(int));
	curr = prev + m;

	/*
	 * To transform the first i characters of s into the first 0 characters of
	 * t, we must perform i deletions.
	 */
	for (i = START_COLUMN; i < STOP_COLUMN; i++)
		prev[i] = i * del_c;

	/* Loop through rows of the notional array */
	for (y = t_data, j = 1; j < n; j++)
	{
		int		   *temp;
		const char *x = s_data;
		/* n was incremented above, so n == t_bytes + 1 means t is single-byte */
		int			y_char_len = n != t_bytes + 1 ? pg_mblen(y) : 1;

#ifdef LEVENSHTEIN_LESS_EQUAL

		/*
		 * In the best case, values percolate down the diagonal unchanged, so
		 * we must increment stop_column unless it's already on the right end
		 * of the array.  The inner loop will read prev[stop_column], so we
		 * have to initialize it even though it shouldn't affect the result.
		 */
		if (stop_column < m)
		{
			prev[stop_column] = max_d + 1;
			++stop_column;
		}

		/*
		 * The main loop fills in curr, but curr[0] needs a special case: to
		 * transform the first 0 characters of s into the first j characters
		 * of t, we must perform j insertions.  However, if start_column > 0,
		 * this special case does not apply.
		 */
		if (start_column == 0)
		{
			curr[0] = j * ins_c;
			i = 1;
		}
		else
			i = start_column;
#else
		curr[0] = j * ins_c;
		i = 1;
#endif

		/*
		 * This inner loop is critical to performance, so we include a
		 * fast-path to handle the (fairly common) case where no multibyte
		 * characters are in the mix.  The fast-path is entitled to assume
		 * that if s_char_len is not initialized then BOTH strings contain
		 * only single-byte characters.
		 */
		if (s_char_len != NULL)
		{
			for (; i < STOP_COLUMN; i++)
			{
				int			ins;
				int			del;
				int			sub;
				int			x_char_len = s_char_len[i - 1];

				/*
				 * Calculate costs for insertion, deletion, and substitution.
				 *
				 * When calculating cost for substitution, we compare the last
				 * character of each possibly-multibyte character first,
				 * because that's enough to rule out most mis-matches.  If we
				 * get past that test, then we compare the lengths and the
				 * remaining bytes.
				 *
				 * rest_of_char_same() is defined outside this excerpt;
				 * presumably it compares the bytes not already checked here.
				 */
				ins = prev[i] + ins_c;
				del = curr[i - 1] + del_c;
				if (x[x_char_len - 1] == y[y_char_len - 1]
					&& x_char_len == y_char_len &&
					(x_char_len == 1 || rest_of_char_same(x, y, x_char_len)))
					sub = prev[i - 1];
				else
					sub = prev[i - 1] + sub_c;

				/* Take the one with minimum cost. */
				curr[i] = Min(ins, del);
				curr[i] = Min(curr[i], sub);

				/* Point to next character. */
				x += x_char_len;
			}
		}
		else
		{
			for (; i < STOP_COLUMN; i++)
			{
				int			ins;
				int			del;
				int			sub;

				/* Calculate costs for insertion, deletion, and substitution. */
				ins = prev[i] + ins_c;
				del = curr[i - 1] + del_c;
				sub = prev[i - 1] + ((*x == *y) ? 0 : sub_c);

				/* Take the one with minimum cost. */
				curr[i] = Min(ins, del);
				curr[i] = Min(curr[i], sub);

				/* Point to next character. */
				x++;
			}
		}

		/* Swap current row with previous row. */
		temp = curr;
		curr = prev;
		prev = temp;

		/* Point to next character. */
		y += y_char_len;

#ifdef LEVENSHTEIN_LESS_EQUAL

		/*
		 * This chunk of code represents a significant performance hit if used
		 * in the case where there is no max_d bound.  This is probably not
		 * because the max_d >= 0 test itself is expensive, but rather because
		 * the possibility of needing to execute this code prevents tight
		 * optimization of the loop as a whole.
		 */
		if (max_d >= 0)
		{
			/*
			 * The "zero point" is the column of the current row where the
			 * remaining portions of the strings are of equal length.  There
			 * are (n - 1) characters in the target string, of which j have
			 * been transformed.  There are (m - 1) characters in the source
			 * string, so we want to find the value for zp where (n - 1) - j =
			 * (m - 1) - zp.
			 */
			int			zp = j - (n - m);

			/* Check whether the stop column can slide left. */
			while (stop_column > 0)
			{
				int			ii = stop_column - 1;
				int			net_inserts = ii - zp;

				if (prev[ii] + (net_inserts > 0 ? net_inserts * ins_c :
								-net_inserts * del_c) <= max_d)
					break;
				stop_column--;
			}

			/* Check whether the start column can slide right. */
			while (start_column < stop_column)
			{
				int			net_inserts = start_column - zp;

				if (prev[start_column] +
					(net_inserts > 0 ? net_inserts * ins_c :
					 -net_inserts * del_c) <= max_d)
					break;

				/*
				 * We'll never again update these values, so we must make sure
				 * there's nothing here that could confuse any future
				 * iteration of the outer loop.
				 */
				prev[start_column] = max_d + 1;
				curr[start_column] = max_d + 1;
				/* Keep s_data pointing at the character for the new start column. */
				if (start_column != 0)
					s_data += (s_char_len != NULL) ? s_char_len[start_column - 1] : 1;
				start_column++;
			}

			/* If they cross, we're going to exceed the bound. */
			if (start_column >= stop_column)
				return max_d + 1;
		}
#endif
	}

	/*
	 * Because the final value was swapped from the previous row to the
	 * current row, that's where we'll find it.
	 */
	return prev[m - 1];
}
Datum rpad(PG_FUNCTION_ARGS) { text *string1 = PG_GETARG_TEXT_PP(0); int32 len = PG_GETARG_INT32(1); text *string2 = PG_GETARG_TEXT_PP(2); text *ret; char *ptr1, *ptr2, *ptr2start, *ptr2end, *ptr_ret; int m, s1len, s2len; int bytelen; /* Negative len is silently taken as zero */ if (len < 0) len = 0; s1len = VARSIZE_ANY_EXHDR(string1); if (s1len < 0) s1len = 0; /* shouldn't happen */ s2len = VARSIZE_ANY_EXHDR(string2); if (s2len < 0) s2len = 0; /* shouldn't happen */ s1len = pg_mbstrlen_with_len(VARDATA_ANY(string1), s1len); if (s1len > len) s1len = len; /* truncate string1 to len chars */ if (s2len <= 0) len = s1len; /* nothing to pad with, so don't pad */ bytelen = pg_database_encoding_max_length() * len; /* Check for integer overflow */ if (len != 0 && bytelen / pg_database_encoding_max_length() != len) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("requested length too large"))); ret = (text *) palloc(VARHDRSZ + bytelen); m = len - s1len; ptr1 = VARDATA_ANY(string1); ptr_ret = VARDATA(ret); while (s1len--) { int mlen = pg_mblen(ptr1); memcpy(ptr_ret, ptr1, mlen); ptr_ret += mlen; ptr1 += mlen; } ptr2 = ptr2start = VARDATA_ANY(string2); ptr2end = ptr2 + s2len; while (m--) { int mlen = pg_mblen(ptr2); memcpy(ptr_ret, ptr2, mlen); ptr_ret += mlen; ptr2 += mlen; if (ptr2 == ptr2end) /* wrap around at end of s2 */ ptr2 = ptr2start; } SET_VARSIZE(ret, ptr_ret - (char *) ret); PG_RETURN_TEXT_P(ret); }
/*
 * bpchar_input -- shared worker for bpcharin and bpcharrecv
 *
 * s			input bytes, "len" of them; need not be null-terminated
 * len			length of s in bytes
 * atttypmod	typmod to enforce; the declared length it encodes is a count
 *				of characters, which may differ from the byte count
 *
 * Input longer than the declared length is rejected unless everything past
 * the limit is spaces, in which case the excess is dropped (per SQL).
 * Shorter input is blank-padded up to the declared length.
 */
static BpChar *
bpchar_input(const char *s, size_t len, int32 atttypmod)
{
	BpChar	   *result;
	char	   *data;
	size_t		out_bytes;		/* data bytes in the result, sans header */

	/* verify encoding */
	pg_verifymbstr(s, len, false);

	if (atttypmod < (int32) VARHDRSZ)
	{
		/* typmod is -1 (or invalid): store the input at its own length */
		out_bytes = len;
	}
	else
	{
		size_t		limit = atttypmod - VARHDRSZ;	/* allowed CHARACTERS */
		size_t		nchars = pg_mbstrlen_with_len(s, len);	/* CHARACTERS given */

		if (nchars <= limit)
		{
			/*
			 * Fits.  The result needs the input's bytes plus one space (one
			 * byte) for each character still missing.
			 */
			out_bytes = len + (limit - nchars);
		}
		else
		{
			/* Too long: find how many BYTES the first "limit" chars occupy */
			size_t		keep = pg_mbcharcliplen(s, len, limit);
			const char *p;
			const char *stop = s + len;

			/* Everything beyond that point has to be a space */
			for (p = s + keep; p < stop; p++)
			{
				if (*p != ' ')
					ereport(ERROR,
							(errcode(ERRCODE_STRING_DATA_RIGHT_TRUNCATION),
							 errmsg("value too long for type character(%d)",
									(int) limit)));
			}

			/* Clip the input; no padding will be required */
			len = keep;
			out_bytes = keep;
		}
	}

	result = (BpChar *) palloc(out_bytes + VARHDRSZ);
	VARATT_SIZEP(result) = out_bytes + VARHDRSZ;
	data = VARDATA(result);
	memcpy(data, s, len);

	/* blank pad the string if necessary */
	if (out_bytes > len)
		memset(data + len, ' ', out_bytes - len);

	return result;
}
/* * Converts a CHARACTER type to the specified size. * * maxlen is the typmod, ie, declared length plus VARHDRSZ bytes. * isExplicit is true if this is for an explicit cast to char(N). * * Truncation rules: for an explicit cast, silently truncate to the given * length; for an implicit cast, raise error unless extra characters are * all spaces. (This is sort-of per SQL: the spec would actually have us * raise a "completion condition" for the explicit cast case, but Postgres * hasn't got such a concept.) */ Datum bpchar(PG_FUNCTION_ARGS) { BpChar *source = PG_GETARG_BPCHAR_P(0); int32 maxlen = PG_GETARG_INT32(1); bool isExplicit = PG_GETARG_BOOL(2); BpChar *result; int32 len; char *r; char *s; int i; int charlen; /* number of characters in the input string + * VARHDRSZ */ /* No work if typmod is invalid */ if (maxlen < (int32) VARHDRSZ) PG_RETURN_BPCHAR_P(source); len = VARSIZE(source); charlen = pg_mbstrlen_with_len(VARDATA(source), len - VARHDRSZ) + VARHDRSZ; /* No work if supplied data matches typmod already */ if (charlen == maxlen) PG_RETURN_BPCHAR_P(source); if (charlen > maxlen) { /* Verify that extra characters are spaces, and clip them off */ size_t maxmblen; maxmblen = pg_mbcharcliplen(VARDATA(source), len - VARHDRSZ, maxlen - VARHDRSZ) + VARHDRSZ; if (!isExplicit) { for (i = maxmblen - VARHDRSZ; i < len - VARHDRSZ; i++) if (*(VARDATA(source) + i) != ' ') ereport(ERROR, (errcode(ERRCODE_STRING_DATA_RIGHT_TRUNCATION), errmsg("value too long for type character(%d)", maxlen - VARHDRSZ))); } len = maxmblen; /* * XXX: at this point, maxlen is the necessary byte length+VARHDRSZ, * not the number of CHARACTERS! */ maxlen = len; } else { /* * XXX: at this point, maxlen is the necessary byte length+VARHDRSZ, * not the number of CHARACTERS! 
*/ maxlen = len + (maxlen - charlen); } s = VARDATA(source); result = palloc(maxlen); VARATT_SIZEP(result) = maxlen; r = VARDATA(result); memcpy(r, s, len - VARHDRSZ); /* blank pad the string if necessary */ if (maxlen > len) memset(r + len - VARHDRSZ, ' ', maxlen - len); PG_RETURN_BPCHAR_P(result); }
/* * similar_escape() * Convert a SQL:2008 regexp pattern to POSIX style, so it can be used by * our regexp engine. */ Datum similar_escape(PG_FUNCTION_ARGS) { text *pat_text; text *esc_text; text *result; char *p, *e, *r; int plen, elen; bool afterescape = false; bool incharclass = false; int nquotes = 0; /* This function is not strict, so must test explicitly */ if (PG_ARGISNULL(0)) PG_RETURN_NULL(); pat_text = PG_GETARG_TEXT_PP(0); p = VARDATA_ANY(pat_text); plen = VARSIZE_ANY_EXHDR(pat_text); if (PG_ARGISNULL(1)) { /* No ESCAPE clause provided; default to backslash as escape */ e = "\\"; elen = 1; } else { esc_text = PG_GETARG_TEXT_PP(1); e = VARDATA_ANY(esc_text); elen = VARSIZE_ANY_EXHDR(esc_text); if (elen == 0) e = NULL; /* no escape character */ else { int escape_mblen = pg_mbstrlen_with_len(e, elen); if (escape_mblen > 1) ereport(ERROR, (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE), errmsg("invalid escape string"), errhint("Escape string must be empty or one character."))); } } /*---------- * We surround the transformed input string with * ^(?: ... )$ * which requires some explanation. We need "^" and "$" to force * the pattern to match the entire input string as per SQL99 spec. * The "(?:" and ")" are a non-capturing set of parens; we have to have * parens in case the string contains "|", else the "^" and "$" will * be bound into the first and last alternatives which is not what we * want, and the parens must be non capturing because we don't want them * to count when selecting output for SUBSTRING. *---------- */ /* * We need room for the prefix/postfix plus as many as 3 output bytes per * input byte; since the input is at most 1GB this can't overflow */ result = (text *) palloc(VARHDRSZ + 6 + 3 * plen); r = VARDATA(result); *r++ = '^'; *r++ = '('; *r++ = '?'; *r++ = ':'; while (plen > 0) { char pchar = *p; /* * If both the escape character and the current character from the * pattern are multi-byte, we need to take the slow path. 
* * But if one of them is single-byte, we can process the pattern one * byte at a time, ignoring multi-byte characters. (This works * because all server-encodings have the property that a valid * multi-byte character representation cannot contain the * representation of a valid single-byte character.) */ if (elen > 1) { int mblen = pg_mblen(p); if (mblen > 1) { /* slow, multi-byte path */ if (afterescape) { *r++ = '\\'; memcpy(r, p, mblen); r += mblen; afterescape = false; } else if (e && elen == mblen && memcmp(e, p, mblen) == 0) { /* SQL99 escape character; do not send to output */ afterescape = true; } else { /* * We know it's a multi-byte character, so we don't need * to do all the comparisons to single-byte characters * that we do below. */ memcpy(r, p, mblen); r += mblen; } p += mblen; plen -= mblen; continue; } } /* fast path */ if (afterescape) { if (pchar == '"' && !incharclass) /* for SUBSTRING patterns */ *r++ = ((nquotes++ % 2) == 0) ? '(' : ')'; else { *r++ = '\\'; *r++ = pchar; } afterescape = false; } else if (e && pchar == *e) { /* SQL99 escape character; do not send to output */ afterescape = true; } else if (incharclass) { if (pchar == '\\') *r++ = '\\'; *r++ = pchar; if (pchar == ']') incharclass = false; } else if (pchar == '[') { *r++ = pchar; incharclass = true; } else if (pchar == '%') { *r++ = '.'; *r++ = '*'; } else if (pchar == '_') *r++ = '.'; else if (pchar == '(') { /* convert to non-capturing parenthesis */ *r++ = '('; *r++ = '?'; *r++ = ':'; } else if (pchar == '\\' || pchar == '.' || pchar == '^' || pchar == '$') { *r++ = '\\'; *r++ = pchar; } else *r++ = pchar; p++, plen--; } *r++ = ')'; *r++ = '$'; SET_VARSIZE(result, r - ((char *) result)); PG_RETURN_TEXT_P(result); }