Esempio n. 1
0
/*
 * parser_errposition
 *		Attach a cursor position to an error raised during parse analysis.
 *
 * Intended to be called from inside an ereport() argument list.  The value
 * returned is whatever errposition() returns, or 0 when no position can be
 * reported.
 *
 * "location" is a byte offset into pstate->p_sourcetext (negative means
 * "unknown").  Clients expect a 1-based character index, so the offset is
 * converted here, on the error path only, rather than paying for the
 * conversion while parsing.
 */
int
parser_errposition(ParseState *pstate, int location)
{
	/*
	 * A position can only be reported when the location is known and the
	 * source text is still available; otherwise this is a no-op.
	 */
	if (location >= 0 && pstate != NULL && pstate->p_sourcetext != NULL)
	{
		/* Count the characters in front of the offset, then make it 1-based */
		int			charpos = pg_mbstrlen_with_len(pstate->p_sourcetext,
												   location) + 1;

		return errposition(charpos);
	}

	return 0;
}
Esempio n. 2
0
/*
 * ora_instr_mb
 *		Multibyte-aware search for the nth occurrence of "pattern" in "txt".
 *
 * start > 0 searches forward beginning at character position start
 * (1-based); start <= 0 searches backward, beginning at the character that
 * lies |start| characters before the end of the text (clamped so that the
 * pattern still fits).  Returns the 1-based character position of the nth
 * match found in the search direction, or 0 if there is no such match.
 *
 * NOTE(review): nth is only ever decremented and compared with zero here, so
 * a caller passing nth <= 0 never gets a match; presumably callers reject
 * such values beforehand -- confirm.
 */
static int
ora_instr_mb(text *txt, text *pattern, int start, int nth)
{
	int			c_len_txt, c_len_pat;
	int			b_len_txt, b_len_pat;
	int		   *pos_txt;
	const char *str_txt, *str_pat;
	int			beg, end, i, dx;

	str_txt = VARDATA_ANY(txt);
	b_len_txt = VARSIZE_ANY_EXHDR(txt);
	/* pos_txt[] receives the byte offset of each character of txt */
	c_len_txt = ora_mb_strlen(txt, NULL, &pos_txt);
	str_pat = VARDATA_ANY(pattern);
	b_len_pat = VARSIZE_ANY_EXHDR(pattern);
	c_len_pat = pg_mbstrlen_with_len(str_pat, b_len_pat);

	if (start > 0)
	{
		/* forward scan: last candidate is where the pattern still fits */
		dx = 1;
		beg = start - 1;
		end = c_len_txt - c_len_pat + 1;
		if (beg >= end)
			return 0;	/* out of range */
	}
	else
	{
		/* backward scan: stop after examining character index 0 */
		dx = -1;
		beg = Min(c_len_txt + start, c_len_txt - c_len_pat);
		end = -1;
		if (beg <= end)
			return 0;	/* out of range */
	}

	for (i = beg; i != end; i += dx)
	{
		/*
		 * An empty pattern compares equal everywhere (memcmp of zero bytes),
		 * so there is no need to look at pos_txt[i] -- which for
		 * i == c_len_txt would be an index past the last character.
		 */
		if (b_len_pat > 0)
		{
			/*
			 * The candidate range was computed in characters.  When the
			 * pattern's characters are wider (in bytes) than the remaining
			 * text characters, fewer than b_len_pat bytes may be left; the
			 * pattern cannot match there and memcmp must not read beyond the
			 * end of the text.
			 */
			if (pos_txt[i] + b_len_pat > b_len_txt)
				continue;
			if (memcmp(str_txt + pos_txt[i], str_pat, b_len_pat) != 0)
				continue;
		}

		if (--nth == 0)
			return i + 1;
	}

	return 0;
}
Esempio n. 3
0
levenshtein_internal(text *s, text *t,
					 int ins_c, int del_c, int sub_c)
#endif
{
	/*
	 * Compute the weighted edit distance needed to turn s into t, where
	 * ins_c, del_c and sub_c are the costs of one insertion, deletion and
	 * substitution.  Only two rows of the notional (m+1) x (n+1) matrix are
	 * kept in memory at any time.
	 *
	 * When compiled with LEVENSHTEIN_LESS_EQUAL the code also refers to
	 * max_d (NOTE(review): presumably a parameter of the variant whose
	 * header is not visible here -- confirm).  With max_d >= 0 the
	 * computation is restricted to a band of columns and max_d + 1 is
	 * returned as soon as the distance is known to exceed max_d.
	 */
	int			m,
				n,
				s_bytes,
				t_bytes;
	int		   *prev;
	int		   *curr;
	int		   *s_char_len = NULL;
	int			i,
				j;
	const char *s_data;
	const char *t_data;
	const char *y;

	/*
	 * For levenshtein_less_equal_internal, we have real variables called
	 * start_column and stop_column; otherwise it's just short-hand for 0 and
	 * m.
	 */
#ifdef LEVENSHTEIN_LESS_EQUAL
	int			start_column,
				stop_column;

#undef START_COLUMN
#undef STOP_COLUMN
#define START_COLUMN start_column
#define STOP_COLUMN stop_column
#else
#undef START_COLUMN
#undef STOP_COLUMN
#define START_COLUMN 0
#define STOP_COLUMN m
#endif

	/* Extract a pointer to the actual character data. */
	s_data = VARDATA_ANY(s);
	t_data = VARDATA_ANY(t);

	/* Determine length of each string in bytes and characters. */
	s_bytes = VARSIZE_ANY_EXHDR(s);
	t_bytes = VARSIZE_ANY_EXHDR(t);
	m = pg_mbstrlen_with_len(s_data, s_bytes);
	n = pg_mbstrlen_with_len(t_data, t_bytes);

	/*
	 * We can transform an empty s into t with n insertions, or a non-empty t
	 * into an empty s with m deletions.
	 */
	if (!m)
		return n * ins_c;
	if (!n)
		return m * del_c;

	/*
	 * For security concerns, restrict excessive CPU+RAM usage. (This
	 * implementation uses O(m) memory and has O(mn) complexity.)
	 *
	 * m and n are character counts, not byte counts, so the message reports
	 * the limit in characters.
	 */
	if (m > MAX_LEVENSHTEIN_STRLEN ||
		n > MAX_LEVENSHTEIN_STRLEN)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("argument exceeds the maximum length of %d characters",
						MAX_LEVENSHTEIN_STRLEN)));

#ifdef LEVENSHTEIN_LESS_EQUAL
	/* Initialize start and stop columns. */
	start_column = 0;
	stop_column = m + 1;

	/*
	 * If max_d >= 0, determine whether the bound is impossibly tight.  If so,
	 * return max_d + 1 immediately.  Otherwise, determine whether it's tight
	 * enough to limit the computation we must perform.  If so, figure out
	 * initial stop column.
	 */
	if (max_d >= 0)
	{
		int			min_theo_d; /* Theoretical minimum distance. */
		int			max_theo_d; /* Theoretical maximum distance. */
		int			net_inserts = n - m;

		min_theo_d = net_inserts < 0 ?
			-net_inserts * del_c : net_inserts * ins_c;
		if (min_theo_d > max_d)
			return max_d + 1;
		/* A substitution never needs to cost more than delete + insert. */
		if (ins_c + del_c < sub_c)
			sub_c = ins_c + del_c;
		max_theo_d = min_theo_d + sub_c * Min(m, n);
		if (max_d >= max_theo_d)
			max_d = -1;			/* bound cannot bite; treat as unbounded */
		else if (ins_c + del_c > 0)
		{
			/*
			 * Figure out how much of the first row of the notional matrix we
			 * need to fill in.  If the string is growing, the theoretical
			 * minimum distance already incorporates the cost of deleting the
			 * number of characters necessary to make the two strings equal in
			 * length.  Each additional deletion forces another insertion, so
			 * the best-case total cost increases by ins_c + del_c. If the
			 * string is shrinking, the minimum theoretical cost assumes no
			 * excess deletions; that is, we're starting no further right than
			 * column n - m.  If we do start further right, the best-case
			 * total cost increases by ins_c + del_c for each move right.
			 */
			int			slack_d = max_d - min_theo_d;
			int			best_column = net_inserts < 0 ? -net_inserts : 0;

			stop_column = best_column + (slack_d / (ins_c + del_c)) + 1;
			if (stop_column > m)
				stop_column = m + 1;
		}
	}
#endif

	/*
	 * In order to avoid calling pg_mblen() repeatedly on each character in s,
	 * we cache all the lengths before starting the main loop -- but if all
	 * the characters in both strings are single byte, then we skip this and
	 * use a fast-path in the main loop.  If only one string contains
	 * multi-byte characters, we still build the array, so that the fast-path
	 * needn't deal with the case where the array hasn't been initialized.
	 */
	if (m != s_bytes || n != t_bytes)
	{
		const char *cp = s_data;

		s_char_len = (int *) palloc((m + 1) * sizeof(int));
		for (i = 0; i < m; ++i)
		{
			s_char_len[i] = pg_mblen(cp);
			cp += s_char_len[i];
		}
		/* terminating entry, one past the last character */
		s_char_len[i] = 0;
	}

	/* One more cell for initialization column and row. */
	++m;
	++n;

	/* Previous and current rows of notional array. */
	prev = (int *) palloc(2 * m * sizeof(int));
	curr = prev + m;

	/*
	 * To transform the first i characters of s into the first 0 characters of
	 * t, we must perform i deletions.
	 */
	for (i = START_COLUMN; i < STOP_COLUMN; i++)
		prev[i] = i * del_c;

	/* Loop through rows of the notional array */
	for (y = t_data, j = 1; j < n; j++)
	{
		int		   *temp;
		const char *x = s_data;

		/*
		 * n was incremented above, so n == t_bytes + 1 means every character
		 * of t is a single byte and pg_mblen() can be skipped.
		 */
		int			y_char_len = n != t_bytes + 1 ? pg_mblen(y) : 1;

#ifdef LEVENSHTEIN_LESS_EQUAL

		/*
		 * In the best case, values percolate down the diagonal unchanged, so
		 * we must increment stop_column unless it's already on the right end
		 * of the array.  The inner loop will read prev[stop_column], so we
		 * have to initialize it even though it shouldn't affect the result.
		 */
		if (stop_column < m)
		{
			prev[stop_column] = max_d + 1;
			++stop_column;
		}

		/*
		 * The main loop fills in curr, but curr[0] needs a special case: to
		 * transform the first 0 characters of s into the first j characters
		 * of t, we must perform j insertions.  However, if start_column > 0,
		 * this special case does not apply.
		 */
		if (start_column == 0)
		{
			curr[0] = j * ins_c;
			i = 1;
		}
		else
			i = start_column;
#else
		curr[0] = j * ins_c;
		i = 1;
#endif

		/*
		 * This inner loop is critical to performance, so we include a
		 * fast-path to handle the (fairly common) case where no multibyte
		 * characters are in the mix.  The fast-path is entitled to assume
		 * that if s_char_len is not initialized then BOTH strings contain
		 * only single-byte characters.
		 */
		if (s_char_len != NULL)
		{
			for (; i < STOP_COLUMN; i++)
			{
				int			ins;
				int			del;
				int			sub;
				int			x_char_len = s_char_len[i - 1];

				/*
				 * Calculate costs for insertion, deletion, and substitution.
				 *
				 * When calculating cost for substitution, we compare the last
				 * character of each possibly-multibyte character first,
				 * because that's enough to rule out most mis-matches.  If we
				 * get past that test, then we compare the lengths and the
				 * remaining bytes.
				 */
				ins = prev[i] + ins_c;
				del = curr[i - 1] + del_c;
				if (x[x_char_len - 1] == y[y_char_len - 1]
					&& x_char_len == y_char_len &&
					(x_char_len == 1 || rest_of_char_same(x, y, x_char_len)))
					sub = prev[i - 1];
				else
					sub = prev[i - 1] + sub_c;

				/* Take the one with minimum cost. */
				curr[i] = Min(ins, del);
				curr[i] = Min(curr[i], sub);

				/* Point to next character. */
				x += x_char_len;
			}
		}
		else
		{
			for (; i < STOP_COLUMN; i++)
			{
				int			ins;
				int			del;
				int			sub;

				/* Calculate costs for insertion, deletion, and substitution. */
				ins = prev[i] + ins_c;
				del = curr[i - 1] + del_c;
				sub = prev[i - 1] + ((*x == *y) ? 0 : sub_c);

				/* Take the one with minimum cost. */
				curr[i] = Min(ins, del);
				curr[i] = Min(curr[i], sub);

				/* Point to next character. */
				x++;
			}
		}

		/* Swap current row with previous row. */
		temp = curr;
		curr = prev;
		prev = temp;

		/* Point to next character. */
		y += y_char_len;

#ifdef LEVENSHTEIN_LESS_EQUAL

		/*
		 * This chunk of code represents a significant performance hit if used
		 * in the case where there is no max_d bound.  This is probably not
		 * because the max_d >= 0 test itself is expensive, but rather because
		 * the possibility of needing to execute this code prevents tight
		 * optimization of the loop as a whole.
		 */
		if (max_d >= 0)
		{
			/*
			 * The "zero point" is the column of the current row where the
			 * remaining portions of the strings are of equal length.  There
			 * are (n - 1) characters in the target string, of which j have
			 * been transformed.  There are (m - 1) characters in the source
			 * string, so we want to find the value for zp where (n - 1) - j =
			 * (m - 1) - zp.
			 */
			int			zp = j - (n - m);

			/* Check whether the stop column can slide left. */
			while (stop_column > 0)
			{
				int			ii = stop_column - 1;
				int			net_inserts = ii - zp;

				if (prev[ii] + (net_inserts > 0 ? net_inserts * ins_c :
								-net_inserts * del_c) <= max_d)
					break;
				stop_column--;
			}

			/* Check whether the start column can slide right. */
			while (start_column < stop_column)
			{
				int			net_inserts = start_column - zp;

				if (prev[start_column] +
					(net_inserts > 0 ? net_inserts * ins_c :
					 -net_inserts * del_c) <= max_d)
					break;

				/*
				 * We'll never again update these values, so we must make sure
				 * there's nothing here that could confuse any future
				 * iteration of the outer loop.
				 */
				prev[start_column] = max_d + 1;
				curr[start_column] = max_d + 1;

				/*
				 * Keep s_data pointing at the character that belongs to the
				 * new start column, since each row restarts x from s_data.
				 */
				if (start_column != 0)
					s_data += (s_char_len != NULL) ? s_char_len[start_column - 1] : 1;
				start_column++;
			}

			/* If they cross, we're going to exceed the bound. */
			if (start_column >= stop_column)
				return max_d + 1;
		}
#endif
	}

	/*
	 * Because the final value was swapped from the previous row to the
	 * current row, that's where we'll find it.
	 */
	return prev[m - 1];
}
Esempio n. 4
0
Datum
rpad(PG_FUNCTION_ARGS)
{
	text	   *string1 = PG_GETARG_TEXT_PP(0);
	int32		len = PG_GETARG_INT32(1);
	text	   *string2 = PG_GETARG_TEXT_PP(2);
	text	   *ret;
	char	   *ptr1,
			   *ptr2,
			   *ptr2start,
			   *ptr2end,
			   *ptr_ret;
	int			m,
				s1len,
				s2len;

	int			bytelen;

	/* Negative len is silently taken as zero */
	if (len < 0)
		len = 0;

	s1len = VARSIZE_ANY_EXHDR(string1);
	if (s1len < 0)
		s1len = 0;				/* shouldn't happen */

	s2len = VARSIZE_ANY_EXHDR(string2);
	if (s2len < 0)
		s2len = 0;				/* shouldn't happen */

	s1len = pg_mbstrlen_with_len(VARDATA_ANY(string1), s1len);

	if (s1len > len)
		s1len = len;			/* truncate string1 to len chars */

	if (s2len <= 0)
		len = s1len;			/* nothing to pad with, so don't pad */

	bytelen = pg_database_encoding_max_length() * len;

	/* Check for integer overflow */
	if (len != 0 && bytelen / pg_database_encoding_max_length() != len)
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("requested length too large")));

	ret = (text *) palloc(VARHDRSZ + bytelen);
	m = len - s1len;

	ptr1 = VARDATA_ANY(string1);
	ptr_ret = VARDATA(ret);

	while (s1len--)
	{
		int			mlen = pg_mblen(ptr1);

		memcpy(ptr_ret, ptr1, mlen);
		ptr_ret += mlen;
		ptr1 += mlen;
	}

	ptr2 = ptr2start = VARDATA_ANY(string2);
	ptr2end = ptr2 + s2len;

	while (m--)
	{
		int			mlen = pg_mblen(ptr2);

		memcpy(ptr_ret, ptr2, mlen);
		ptr_ret += mlen;
		ptr2 += mlen;
		if (ptr2 == ptr2end)	/* wrap around at end of s2 */
			ptr2 = ptr2start;
	}

	SET_VARSIZE(ret, ptr_ret - (char *) ret);

	PG_RETURN_TEXT_P(ret);
}
Esempio n. 5
0
/*
 * bpchar_input -- shared implementation behind bpcharin and bpcharrecv
 *
 * s			input bytes (len of them; need not be null-terminated)
 * atttypmod	typmod to enforce; the declared width is atttypmod - VARHDRSZ
 *				and is counted in characters, not bytes
 *
 * Input longer than the declared width is rejected unless everything past
 * the limit is spaces, in which case the excess is dropped (per SQL).
 * Shorter input is blank-padded up to the declared width.  A typmod smaller
 * than VARHDRSZ (e.g. -1) means "no limit": the input is stored as is.
 */
static BpChar *
bpchar_input(const char *s, size_t len, int32 atttypmod)
{
	BpChar	   *result;
	char	   *dest;
	size_t		nbytes;			/* data bytes in the result */

	/* reject input that is not valid in the database encoding */
	pg_verifymbstr(s, len, false);

	if (atttypmod >= (int32) VARHDRSZ)
	{
		size_t		limit = atttypmod - VARHDRSZ;	/* in characters */
		size_t		nchars = pg_mbstrlen_with_len(s, len);

		if (nchars <= limit)
		{
			/*
			 * Input fits: keep all of its bytes and add one space (one byte)
			 * for every character still missing.
			 */
			nbytes = len + (limit - nchars);
		}
		else
		{
			/* byte length of the first "limit" characters */
			size_t		keep = pg_mbcharcliplen(s, len, limit);
			size_t		off = keep;

			/* everything beyond that must be spaces, else complain */
			while (off < len)
			{
				if (s[off] != ' ')
					ereport(ERROR,
							(errcode(ERRCODE_STRING_DATA_RIGHT_TRUNCATION),
							 errmsg("value too long for type character(%d)",
									(int) limit)));
				off++;
			}

			/* clip the input; the result needs no padding */
			len = keep;
			nbytes = keep;
		}
	}
	else
	{
		/* typmod is -1 (or invalid): use the actual string length */
		nbytes = len;
	}

	result = (BpChar *) palloc(nbytes + VARHDRSZ);
	VARATT_SIZEP(result) = nbytes + VARHDRSZ;
	dest = VARDATA(result);
	memcpy(dest, s, len);

	/* fill whatever is left with blanks */
	if (nbytes > len)
		memset(dest + len, ' ', nbytes - len);

	return result;
}
Esempio n. 6
0
/*
 * Coerce a CHARACTER value to a given declared length.
 *
 * Argument 1 (maxlen) is the typmod, i.e. the declared length in characters
 * plus VARHDRSZ.  Argument 2 (isExplicit) tells whether this is an explicit
 * cast to char(N).
 *
 * An over-length value is silently truncated for an explicit cast; for an
 * implicit cast an error is raised unless all the excess characters are
 * spaces.  (This is sort-of per SQL: the spec would actually have us raise a
 * "completion condition" for the explicit cast case, but Postgres hasn't got
 * such a concept.)  An under-length value is blank-padded.
 */
Datum
bpchar(PG_FUNCTION_ARGS)
{
	BpChar	   *source = PG_GETARG_BPCHAR_P(0);
	int32		maxlen = PG_GETARG_INT32(1);
	bool		isExplicit = PG_GETARG_BOOL(2);
	BpChar	   *result;
	char	   *src_data;
	char	   *dst_data;
	int32		len;			/* source size in bytes, header included */
	int			charlen;		/* source size in characters + VARHDRSZ */

	/* An invalid typmod means there is nothing to enforce */
	if (maxlen < (int32) VARHDRSZ)
		PG_RETURN_BPCHAR_P(source);

	len = VARSIZE(source);
	src_data = VARDATA(source);

	charlen = pg_mbstrlen_with_len(src_data, len - VARHDRSZ) + VARHDRSZ;

	/* Already exactly the declared length: hand the input back */
	if (charlen == maxlen)
		PG_RETURN_BPCHAR_P(source);

	if (charlen < maxlen)
	{
		/*
		 * Too short: from here on maxlen is the result size in bytes
		 * (header included), i.e. the source bytes plus one blank for each
		 * missing character.
		 */
		maxlen = len + (maxlen - charlen);
	}
	else
	{
		/* Too long: find the byte size of the part that may be kept */
		size_t		maxmblen;

		maxmblen = pg_mbcharcliplen(src_data, len - VARHDRSZ,
									maxlen - VARHDRSZ) + VARHDRSZ;

		if (!isExplicit)
		{
			int			i;

			/* implicit cast: anything cut off has to be a space */
			for (i = maxmblen - VARHDRSZ; i < len - VARHDRSZ; i++)
			{
				if (src_data[i] != ' ')
					ereport(ERROR,
							(errcode(ERRCODE_STRING_DATA_RIGHT_TRUNCATION),
							 errmsg("value too long for type character(%d)",
									maxlen - VARHDRSZ)));
			}
		}

		/*
		 * Both len and maxlen now hold the clipped byte size (header
		 * included), not a number of characters.
		 */
		len = maxmblen;
		maxlen = len;
	}

	result = palloc(maxlen);
	VARATT_SIZEP(result) = maxlen;
	dst_data = VARDATA(result);

	memcpy(dst_data, src_data, len - VARHDRSZ);

	/* Append trailing blanks when the result is longer than the source */
	if (maxlen > len)
		memset(dst_data + len - VARHDRSZ, ' ', maxlen - len);

	PG_RETURN_BPCHAR_P(result);
}
Esempio n. 7
0
/*
 * similar_escape()
 * Translate a SQL:2008 (SIMILAR TO) regular expression into the POSIX
 * syntax understood by our regexp engine.
 *
 * Argument 0 is the pattern, argument 1 the ESCAPE string.  A NULL pattern
 * yields NULL; a NULL escape defaults to backslash; an empty escape string
 * disables escaping.  An escape string of more than one character is an
 * error.
 */
Datum
similar_escape(PG_FUNCTION_ARGS)
{
	text	   *pat_text;
	text	   *result;
	char	   *src;			/* read position in the pattern */
	char	   *esc;			/* escape character bytes, or NULL if none */
	char	   *dst;			/* write position in the result */
	int			src_left;		/* pattern bytes not yet consumed */
	int			esc_len;		/* byte length of the escape character */
	bool		afterescape = false;
	bool		incharclass = false;
	int			nquotes = 0;

	/* This function is not strict, so must test explicitly */
	if (PG_ARGISNULL(0))
		PG_RETURN_NULL();

	pat_text = PG_GETARG_TEXT_PP(0);
	src = VARDATA_ANY(pat_text);
	src_left = VARSIZE_ANY_EXHDR(pat_text);

	if (!PG_ARGISNULL(1))
	{
		text	   *esc_text = PG_GETARG_TEXT_PP(1);

		esc = VARDATA_ANY(esc_text);
		esc_len = VARSIZE_ANY_EXHDR(esc_text);

		if (esc_len == 0)
		{
			/* empty ESCAPE string: no escape character at all */
			esc = NULL;
		}
		else if (pg_mbstrlen_with_len(esc, esc_len) > 1)
		{
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
					 errmsg("invalid escape string"),
					 errhint("Escape string must be empty or one character.")));
		}
	}
	else
	{
		/* No ESCAPE clause provided; default to backslash as escape */
		esc = "\\";
		esc_len = 1;
	}

	/*----------
	 * The translated pattern is wrapped as
	 *			^(?: ... )$
	 * "^" and "$" force a match against the whole input string, as the SQL
	 * spec requires.  The parentheses keep a top-level "|" from capturing
	 * the anchors into only the first and last alternatives, and they are
	 * non-capturing so that they do not count when SUBSTRING selects its
	 * output.
	 *----------
	 */

	/*
	 * Space needed: 6 bytes of prefix/postfix plus at most 3 output bytes
	 * per input byte; since the input is at most 1GB this can't overflow
	 */
	result = (text *) palloc(VARHDRSZ + 6 + 3 * src_left);
	dst = VARDATA(result);

	*dst++ = '^';
	*dst++ = '(';
	*dst++ = '?';
	*dst++ = ':';

	while (src_left > 0)
	{
		char		pchar = *src;

		/*
		 * Only when the escape character is multi-byte AND the current
		 * pattern character is multi-byte do we have to look at whole
		 * characters.  Otherwise the pattern can be processed a byte at a
		 * time.  (This works because all server-encodings have the property
		 * that a valid multi-byte character representation cannot contain
		 * the representation of a valid single-byte character.)
		 */
		if (esc_len > 1)
		{
			int			mblen = pg_mblen(src);

			if (mblen > 1)
			{
				/* slow, multi-byte path */
				if (afterescape)
				{
					/* escaped multi-byte character: emit it quoted */
					*dst++ = '\\';
					memcpy(dst, src, mblen);
					dst += mblen;
					afterescape = false;
				}
				else if (esc && esc_len == mblen &&
						 memcmp(esc, src, mblen) == 0)
				{
					/* SQL99 escape character; do not send to output */
					afterescape = true;
				}
				else
				{
					/*
					 * A multi-byte character can't be any of the
					 * single-byte metacharacters handled below, so just
					 * copy it.
					 */
					memcpy(dst, src, mblen);
					dst += mblen;
				}

				src += mblen;
				src_left -= mblen;
				continue;
			}
		}

		/* fast path: exactly one byte is consumed per iteration */
		if (afterescape)
		{
			if (pchar == '"' && !incharclass)
			{
				/* for SUBSTRING patterns: alternate "(" and ")" */
				*dst++ = ((nquotes++ % 2) == 0) ? '(' : ')';
			}
			else
			{
				*dst++ = '\\';
				*dst++ = pchar;
			}
			afterescape = false;
		}
		else if (esc && pchar == *esc)
		{
			/* SQL99 escape character; do not send to output */
			afterescape = true;
		}
		else if (incharclass)
		{
			/* inside [...]: only backslash needs quoting */
			if (pchar == '\\')
				*dst++ = '\\';
			*dst++ = pchar;
			if (pchar == ']')
				incharclass = false;
		}
		else
		{
			switch (pchar)
			{
				case '[':
					*dst++ = pchar;
					incharclass = true;
					break;

				case '%':
					*dst++ = '.';
					*dst++ = '*';
					break;

				case '_':
					*dst++ = '.';
					break;

				case '(':
					/* convert to non-capturing parenthesis */
					*dst++ = '(';
					*dst++ = '?';
					*dst++ = ':';
					break;

				case '\\':
				case '.':
				case '^':
				case '$':
					/* quote these so they are matched literally */
					*dst++ = '\\';
					*dst++ = pchar;
					break;

				default:
					*dst++ = pchar;
					break;
			}
		}

		src++;
		src_left--;
	}

	*dst++ = ')';
	*dst++ = '$';

	SET_VARSIZE(result, dst - ((char *) result));

	PG_RETURN_TEXT_P(result);
}