C++ (Cpp) pg_utf_mblen Examples

Example #1

0

Show file

File: xmlnode_util.c Project: andreypopp/pg_xnode

int
utf8cmp(char *c1, char *c2)
{
	unsigned char len1 = pg_utf_mblen((unsigned char *) c1);
	unsigned char len2 = pg_utf_mblen((unsigned char *) c2);

	Assert(len1 <= UTF_MAX_WIDTH && len2 <= UTF_MAX_WIDTH);
	if (len1 != len2)
	{
		return len1 > len2 ? 1 : -1;
	}
	else
	{
		unsigned char j;

		for (j = 0; j < len1; j++)
		{
			if (*c1 != *c2)
			{
				return *c1 > *c2 ? 1 : -1;
			}
			c1++;
			c2++;
		}
		return 0;
	}
}

Example #2

0

Show file

File: xmlnode_util.c Project: andreypopp/pg_xnode

/*
 * Test if a valid number starts at 'str'.
 * If it does, then '*end' is set to the first character after the number.
 *
 * If 'skipWhitespace' is true, then also skip all the following whitespace
 * should there be any.
 */
bool
xmlStringIsNumber(char *str, double *numValue, char **end, bool skipWhitespace)
{
	*numValue = strtod(str, end);

	if (*end == str)
	{
		return false;
	}

	if (skipWhitespace)
	{
		while (**end != '\0')
		{
			if (!XNODE_WHITESPACE(*end))
			{
				return false;
			}
			*end += pg_utf_mblen((unsigned char *) *end);
		}
		return true;
	}
	else
	{
		return true;
	}
}

Example #3

0

Show file

File: wchar.c Project: sunyangkobe/cscd43

static int
pg_utf8_verifier(const unsigned char *s, int len)
{
	int l = pg_utf_mblen(s);

	if (len < l)
		return -1;

	if (!pg_utf8_islegal(s, l))
		return -1;

	return l;
}

Example #4

0

Show file

File: utf8_and_iso8859_1.c Project: legendOfZelda/LDV

Datum
utf8_to_iso8859_1(PG_FUNCTION_ARGS)
{
	unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
	unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
	int			len = PG_GETARG_INT32(4);
	unsigned short c,
				c1;

	Assert(PG_GETARG_INT32(0) == PG_UTF8);
	Assert(PG_GETARG_INT32(1) == PG_LATIN1);
	Assert(len >= 0);

	while (len > 0)
	{
		c = *src;
		if (c == 0)
			report_invalid_encoding(PG_UTF8, (const char *) src, len);
		/* fast path for ASCII-subset characters */
		if (!IS_HIGHBIT_SET(c))
		{
			*dest++ = c;
			src++;
			len--;
		}
		else
		{
			int			l = pg_utf_mblen(src);

			if (l > len || !pg_utf8_islegal(src, l))
				report_invalid_encoding(PG_UTF8, (const char *) src, len);
			if (l != 2)
				report_untranslatable_char(PG_UTF8, PG_LATIN1,
										   (const char *) src, len);
			c1 = src[1] & 0x3f;
			c = ((c & 0x1f) << 6) | c1;
			if (c >= 0x80 && c <= 0xff)
			{
				*dest++ = (unsigned char) c;
				src += 2;
				len -= 2;
			}
			else
				report_untranslatable_char(PG_UTF8, PG_LATIN1,
										   (const char *) src, len);
		}
	}
	*dest = '\0';

	PG_RETURN_VOID();
}

Example #5

0

Show file

File: saslprep.c Project: adityavs/postgres

/*
 * Calculate the length in characters of a null-terminated UTF-8 string.
 *
 * Returns -1 if the input is not valid UTF-8.
 */
static int
pg_utf8_string_len(const char *source)
{
	const unsigned char *p = (const unsigned char *) source;
	int			l;
	int			num_chars = 0;

	while (*p)
	{
		l = pg_utf_mblen(p);

		if (!pg_utf8_islegal(p, l))
			return -1;

		p += l;
		num_chars++;
	}

	return num_chars;
}

Example #6

0

Show file

File: xmlnode.c Project: andreypopp/pg_xnode

Datum
xmlelement(PG_FUNCTION_ARGS)
{
	Datum		nameText;
	ArrayType  *attrs = NULL;
	char	   *elName;
	unsigned int nameLen,
				resSizeMax;
	unsigned int childSize = 0;
	char	   *c,
			   *result,
			   *resData,
			   *resCursor,
			   *nameDst;
	XMLCompNodeHdr element;
	XMLNodeOffset *rootOffPtr;
	bool		nameFirstChar = true;
	char	  **attrNames = NULL;
	char	  **attrValues = NULL;
	char	   *attrValFlags = NULL;
	XMLNodeHdr *attrNodes = NULL;
	XMLNodeHdr	child = NULL;
	char	  **newNds = NULL;
	char	   *newNd = NULL;
	unsigned int attrCount = 0;
	unsigned int attrsSizeTotal = 0;
	unsigned short childCount = 0;

	if (PG_ARGISNULL(0))
	{
		elog(ERROR, "invalid element name");
	}
	nameText = PG_GETARG_DATUM(0);
	elName = TextDatumGetCString(nameText);

	nameLen = strlen(elName);
	if (nameLen == 0)
	{
		elog(ERROR, "invalid element name");
	}

	if (!PG_ARGISNULL(1))
	{
		int		   *dims;
		Oid			elType,
					arrType;
		int16		arrLen,
					elLen;
		bool		elByVal,
					elIsNull;
		char		elAlign;
		unsigned int i;

		attrs = PG_GETARG_ARRAYTYPE_P(1);
		if (ARR_NDIM(attrs) != 2)
		{
			elog(ERROR, "attributes must be passed in 2 dimensional array");
		}
		dims = ARR_DIMS(attrs);
		if (dims[1] != 2)
		{
			elog(ERROR, "the second dimension of attribute array must be 2");
		}

		attrCount = dims[0];
		Assert(attrCount > 0);

		elType = attrs->elemtype;
		arrType = get_array_type(elType);
		arrLen = get_typlen(arrType);
		Assert(arrType != InvalidOid);
		get_typlenbyvalalign(elType, &elLen, &elByVal, &elAlign);
		attrNames = (char **) palloc(attrCount * sizeof(char *));
		attrValues = (char **) palloc(attrCount * sizeof(char *));
		attrValFlags = (bool *) palloc(attrCount * sizeof(char));

		for (i = 1; i <= attrCount; i++)
		{
			int			subscrName[] = {i, 1};
			int			subscrValue[] = {i, 2};
			Datum		elDatum;
			char	   *nameStr,
					   *valueStr;
			bool		valueHasRefs = false;

			elDatum = array_ref(attrs, 2, subscrName, arrLen, elLen, elByVal, elAlign, &elIsNull);
			if (elIsNull)
			{
				elog(ERROR, "attribute name must not be null");
			}
			nameStr = text_to_cstring(DatumGetTextP(elDatum));
			if (strlen(nameStr) == 0)
			{
				elog(ERROR, "attribute name must be a string of non-zero length");
			}
			else
			{					/* Check validity of characters. */
				char	   *c = nameStr;
				int			cWidth = pg_utf_mblen((unsigned char *) c);

				if (!XNODE_VALID_NAME_START(c))
				{
					elog(ERROR, "attribute name starts with invalid character");
				}
				do
				{
					c += cWidth;
					cWidth = pg_utf_mblen((unsigned char *) c);
				} while (XNODE_VALID_NAME_CHAR(c));
				if (*c != '\0')
				{
					elog(ERROR, "invalid character in attribute name");
				}
			}

			/* Check uniqueness of the attribute name. */
			if (i > 1)
			{
				unsigned short j;

				for (j = 0; j < (i - 1); j++)
				{
					if (strcmp(nameStr, attrNames[j]) == 0)
					{
						elog(ERROR, "attribute name '%s' is not unique", nameStr);
					}
				}
			}

			elDatum = array_ref(attrs, 2, subscrValue, arrLen, elLen, elByVal, elAlign, &elIsNull);
			if (elIsNull)
			{
				elog(ERROR, "attribute value must not be null");
			}
			valueStr = text_to_cstring(DatumGetTextP(elDatum));

			attrValFlags[i - 1] = 0;

			if (strlen(valueStr) > 0)
			{
				XMLNodeParserStateData state;
				char	   *valueStrOrig = valueStr;

				/* Parse the value and check validity. */
				initXMLParserState(&state, valueStr, true);
				valueStr = readXMLAttValue(&state, true, &valueHasRefs);

				/*
				 * If the value contains quotation mark, then apostrophe is
				 * the delimiter.
				 */
				if (strchr(valueStr, XNODE_CHAR_QUOTMARK) != NULL)
				{
					attrValFlags[i - 1] |= XNODE_ATTR_APOSTROPHE;
				}
				finalizeXMLParserState(&state);
				pfree(valueStrOrig);
			}

			attrNames[i - 1] = nameStr;
			attrValues[i - 1] = valueStr;
			if (valueHasRefs)
			{
				attrValFlags[i - 1] |= XNODE_ATTR_CONTAINS_REF;
			}
			attrsSizeTotal += sizeof(XMLNodeHdrData) + strlen(nameStr) + strlen(valueStr) + 2;
		}
	}

	if (!PG_ARGISNULL(2))
	{
		Datum		childNodeDatum = PG_GETARG_DATUM(2);
		xmlnode		childRaw = (xmlnode) PG_DETOAST_DATUM(childNodeDatum);

		child = XNODE_ROOT(childRaw);
		if (child->kind == XMLNODE_DOC_FRAGMENT)
		{
			childSize = getXMLNodeSize(child, true) - getXMLNodeSize(child, false);
		}
		else
		{
			childSize = getXMLNodeSize(child, true);
		}
	}

	/* Make sure the element name is valid. */
	c = elName;
	while (*c != '\0')
	{
		if ((nameFirstChar && !XNODE_VALID_NAME_START(c)) || (!nameFirstChar && !XNODE_VALID_NAME_CHAR(c)))
		{
			elog(ERROR, "unrecognized character '%c' in element name", *c);
		}
		if (nameFirstChar)
		{
			nameFirstChar = false;
		}
		c += pg_utf_mblen((unsigned char *) c);
	};

	if (child != NULL)
	{
		if (child->kind == XMLNODE_DOC_FRAGMENT)
		{
			childCount = ((XMLCompNodeHdr) child)->children;
		}
		else
		{
			childCount = 1;
		}
	}

	/*
	 * It's hard to determine the byte width of references until the copying
	 * has finished. Therefore we assume the worst case: 4 bytes per
	 * reference.
	 */
	resSizeMax = VARHDRSZ + attrsSizeTotal + childSize + (attrCount + childCount) * 4 +
		sizeof(XMLCompNodeHdrData) + nameLen + 1 + sizeof(XMLNodeOffset);
	result = (char *) palloc(resSizeMax);
	resCursor = resData = VARDATA(result);

	if (attrCount > 0)
	{							/* Copy attributes. */
		unsigned short i;

		Assert(attrNames != NULL && attrValues != NULL && attrValFlags != NULL);

		attrNodes = (XMLNodeHdr *) palloc(attrCount * sizeof(XMLNodeHdr));
		for (i = 0; i < attrCount; i++)
		{
			XMLNodeHdr	attrNode = (XMLNodeHdr) resCursor;
			char	   *name = attrNames[i];
			unsigned int nameLen = strlen(name);
			char	   *value = attrValues[i];
			unsigned int valueLen = strlen(value);

			attrNodes[i] = attrNode;
			attrNode->kind = XMLNODE_ATTRIBUTE;
			attrNode->flags = attrValFlags[i];

			if (xmlAttrValueIsNumber(value))
			{
				attrNode->flags |= XNODE_ATTR_NUMBER;
			}

			resCursor = XNODE_CONTENT(attrNode);
			memcpy(resCursor, name, nameLen);
			resCursor += nameLen;
			*(resCursor++) = '\0';
			pfree(name);

			memcpy(resCursor, value, valueLen);
			resCursor += valueLen;
			*(resCursor++) = '\0';
			pfree(value);
		}
		pfree(attrNames);
		pfree(attrValues);
		pfree(attrValFlags);
	}

	if (child != NULL)
	{
		XMLNodeKind k = child->kind;

		/*
		 * Check if the node to be inserted is of a valid kind. If the node is
		 * document fragment, its assumed that invalid node kinds are never
		 * added. Otherwise we'd have to check the node fragment (recursively)
		 * not only here.
		 */
		if (k != XMLNODE_DOC_FRAGMENT)
		{
			if (k == XMLNODE_DOC || k == XMLNODE_DTD || k == XMLNODE_ATTRIBUTE)
			{
				elog(ERROR, "the nested node must not be %s", getXMLNodeKindStr(k));
			}
		}
		copyXMLNodeOrDocFragment(child, childSize, &resCursor, &newNd, &newNds);
	}

	element = (XMLCompNodeHdr) resCursor;
	element->common.kind = XMLNODE_ELEMENT;
	element->common.flags = (child == NULL) ? XNODE_EMPTY : 0;
	element->children = attrCount + childCount;

	if (childCount > 0 || attrCount > 0)
	{
		XMLNodeOffset childOff,
					childOffMax;
		char		bwidth;
		char	   *refPtr;

		/* Save relative offset(s) of the child node(s). */

		if (attrCount > 0)
		{
			childOffMax = (char *) element - resData;
		}
		else if (childCount > 0)
		{
			if (child->kind == XMLNODE_DOC_FRAGMENT)
			{
				Assert(newNds != NULL);
				childOffMax = (char *) element - newNds[0];
			}
			else
			{
				childOffMax = (char *) element - newNd;
			}
		}
		else
		{
			childOffMax = 0;
		}
		bwidth = getXMLNodeOffsetByteWidth(childOffMax);
		XNODE_SET_REF_BWIDTH(element, bwidth);

		refPtr = XNODE_FIRST_REF(element);

		if (attrCount > 0)
		{
			unsigned short i;

			/* The attribute references first... */
			for (i = 0; i < attrCount; i++)
			{
				XMLNodeHdr	node = attrNodes[i];

				childOff = (char *) element - (char *) node;
				writeXMLNodeOffset(childOff, &refPtr, bwidth, true);
			}
			pfree(attrNodes);
		}


		if (childCount > 0)
		{
			/* ...followed by those of the other children. */
			if (child->kind == XMLNODE_DOC_FRAGMENT)
			{
				unsigned short i;

				for (i = 0; i < childCount; i++)
				{
					childOff = (char *) element - newNds[i];
					writeXMLNodeOffset(childOff, &refPtr, bwidth, true);
				}
				pfree(newNds);
			}
			else
			{
				childOff = (char *) element - newNd;
				writeXMLNodeOffset(childOff, &refPtr, bwidth, true);
			}
		}
	}

	/* And finally set the element name. */
	nameDst = XNODE_ELEMENT_NAME(element);
	memcpy(nameDst, elName, nameLen);
	nameDst[nameLen] = '\0';
	resCursor = nameDst + strlen(elName) + 1;

	SET_VARSIZE(result, (char *) resCursor - result + sizeof(XMLNodeOffset));
	rootOffPtr = XNODE_ROOT_OFFSET_PTR(result);
	*rootOffPtr = (char *) element - resData;
	PG_RETURN_POINTER(result);
}

Example #7

0

Show file

File: saslprep.c Project: adityavs/postgres

/*
 * pg_saslprep - Normalize a password with SASLprep.
 *
 * SASLprep requires the input to be in UTF-8 encoding, but PostgreSQL
 * supports many encodings, so we don't blindly assume that.  pg_saslprep
 * will check if the input looks like valid UTF-8, and returns
 * SASLPREP_INVALID_UTF8 if not.
 *
 * If the string contains prohibited characters (or more precisely, if the
 * output string would contain prohibited characters after normalization),
 * returns SASLPREP_PROHIBITED.
 *
 * On success, returns SASLPREP_SUCCESS, and the normalized string in
 * *output.
 *
 * In frontend, the normalized string is malloc'd, and the caller is
 * responsible for freeing it.  If an allocation fails, returns
 * SASLPREP_OOM.  In backend, the normalized string is palloc'd instead,
 * and a failed allocation leads to ereport(ERROR).
 */
pg_saslprep_rc
pg_saslprep(const char *input, char **output)
{
	pg_wchar   *input_chars = NULL;
	pg_wchar   *output_chars = NULL;
	int			input_size;
	char	   *result;
	int			result_size;
	int			count;
	int			i;
	bool		contains_RandALCat;
	unsigned char *p;
	pg_wchar   *wp;

	/* Ensure we return *output as NULL on failure */
	*output = NULL;

	/* Check that the password isn't stupendously long */
	if (strlen(input) > MAX_PASSWORD_LENGTH)
	{
#ifndef FRONTEND
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("password too long")));
#else
		return SASLPREP_OOM;
#endif
	}

	/*
	 * Quick check if the input is pure ASCII.  An ASCII string requires no
	 * further processing.
	 */
	if (pg_is_ascii_string(input))
	{
		*output = STRDUP(input);
		if (!(*output))
			goto oom;
		return SASLPREP_SUCCESS;
	}

	/*
	 * Convert the input from UTF-8 to an array of Unicode codepoints.
	 *
	 * This also checks that the input is a legal UTF-8 string.
	 */
	input_size = pg_utf8_string_len(input);
	if (input_size < 0)
		return SASLPREP_INVALID_UTF8;

	input_chars = ALLOC((input_size + 1) * sizeof(pg_wchar));
	if (!input_chars)
		goto oom;

	p = (unsigned char *) input;
	for (i = 0; i < input_size; i++)
	{
		input_chars[i] = utf8_to_unicode(p);
		p += pg_utf_mblen(p);
	}
	input_chars[i] = (pg_wchar) '\0';

	/*
	 * The steps below correspond to the steps listed in [RFC3454], Section
	 * "2. Preparation Overview"
	 */

	/*
	 * 1) Map -- For each character in the input, check if it has a mapping
	 * and, if so, replace it with its mapping.
	 */
	count = 0;
	for (i = 0; i < input_size; i++)
	{
		pg_wchar	code = input_chars[i];

		if (IS_CODE_IN_TABLE(code, non_ascii_space_ranges))
			input_chars[count++] = 0x0020;
		else if (IS_CODE_IN_TABLE(code, commonly_mapped_to_nothing_ranges))
		{
			/* map to nothing */
		}
		else
			input_chars[count++] = code;
	}
	input_chars[count] = (pg_wchar) '\0';
	input_size = count;

	if (input_size == 0)
		goto prohibited;		/* don't allow empty password */

	/*
	 * 2) Normalize -- Normalize the result of step 1 using Unicode
	 * normalization.
	 */
	output_chars = unicode_normalize_kc(input_chars);
	if (!output_chars)
		goto oom;

	/*
	 * 3) Prohibit -- Check for any characters that are not allowed in the
	 * output.  If any are found, return an error.
	 */
	for (i = 0; i < input_size; i++)
	{
		pg_wchar	code = input_chars[i];

		if (IS_CODE_IN_TABLE(code, prohibited_output_ranges))
			goto prohibited;
		if (IS_CODE_IN_TABLE(code, unassigned_codepoint_ranges))
			goto prohibited;
	}

	/*
	 * 4) Check bidi -- Possibly check for right-to-left characters, and if
	 * any are found, make sure that the whole string satisfies the
	 * requirements for bidirectional strings.  If the string does not satisfy
	 * the requirements for bidirectional strings, return an error.
	 *
	 * [RFC3454], Section "6. Bidirectional Characters" explains in more
	 * detail what that means:
	 *
	 * "In any profile that specifies bidirectional character handling, all
	 * three of the following requirements MUST be met:
	 *
	 * 1) The characters in section 5.8 MUST be prohibited.
	 *
	 * 2) If a string contains any RandALCat character, the string MUST NOT
	 * contain any LCat character.
	 *
	 * 3) If a string contains any RandALCat character, a RandALCat character
	 * MUST be the first character of the string, and a RandALCat character
	 * MUST be the last character of the string."
	 */
	contains_RandALCat = false;
	for (i = 0; i < input_size; i++)
	{
		pg_wchar	code = input_chars[i];

		if (IS_CODE_IN_TABLE(code, RandALCat_codepoint_ranges))
		{
			contains_RandALCat = true;
			break;
		}
	}

	if (contains_RandALCat)
	{
		pg_wchar	first = input_chars[0];
		pg_wchar	last = input_chars[input_size - 1];

		for (i = 0; i < input_size; i++)
		{
			pg_wchar	code = input_chars[i];

			if (IS_CODE_IN_TABLE(code, LCat_codepoint_ranges))
				goto prohibited;
		}

		if (!IS_CODE_IN_TABLE(first, RandALCat_codepoint_ranges) ||
			!IS_CODE_IN_TABLE(last, RandALCat_codepoint_ranges))
			goto prohibited;
	}

	/*
	 * Finally, convert the result back to UTF-8.
	 */
	result_size = 0;
	for (wp = output_chars; *wp; wp++)
	{
		unsigned char buf[4];

		unicode_to_utf8(*wp, buf);
		result_size += pg_utf_mblen(buf);
	}

	result = ALLOC(result_size + 1);
	if (!result)
		goto oom;

	/*
	 * There are no error exits below here, so the error exit paths don't need
	 * to worry about possibly freeing "result".
	 */
	p = (unsigned char *) result;
	for (wp = output_chars; *wp; wp++)
	{
		unicode_to_utf8(*wp, p);
		p += pg_utf_mblen(p);
	}
	Assert((char *) p == result + result_size);
	*p = '\0';

	FREE(input_chars);
	FREE(output_chars);

	*output = result;
	return SASLPREP_SUCCESS;

prohibited:
	if (input_chars)
		FREE(input_chars);
	if (output_chars)
		FREE(output_chars);

	return SASLPREP_PROHIBITED;

oom:
	if (input_chars)
		FREE(input_chars);
	if (output_chars)
		FREE(output_chars);

	return SASLPREP_OOM;
}

Example #8

0

Show file

File: json.c Project: sqlparser/postgres

/*
 * The next token in the input stream is known to be a string; lex it.
 */
static inline void
json_lex_string(JsonLexContext *lex)
{
	char	   *s;
	int			len;
	int			hi_surrogate = -1;

	if (lex->strval != NULL)
		resetStringInfo(lex->strval);

	Assert(lex->input_length > 0);
	s = lex->token_start;
	len = lex->token_start - lex->input;
	for (;;)
	{
		s++;
		len++;
		/* Premature end of the string. */
		if (len >= lex->input_length)
		{
			lex->token_terminator = s;
			report_invalid_token(lex);
		}
		else if (*s == '"')
			break;
		else if ((unsigned char) *s < 32)
		{
			/* Per RFC4627, these characters MUST be escaped. */
			/* Since *s isn't printable, exclude it from the context string */
			lex->token_terminator = s;
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
					 errmsg("invalid input syntax for type json"),
					 errdetail("Character with value 0x%02x must be escaped.",
							   (unsigned char) *s),
					 report_json_context(lex)));
		}
		else if (*s == '\\')
		{
			/* OK, we have an escape character. */
			s++;
			len++;
			if (len >= lex->input_length)
			{
				lex->token_terminator = s;
				report_invalid_token(lex);
			}
			else if (*s == 'u')
			{
				int			i;
				int			ch = 0;

				for (i = 1; i <= 4; i++)
				{
					s++;
					len++;
					if (len >= lex->input_length)
					{
						lex->token_terminator = s;
						report_invalid_token(lex);
					}
					else if (*s >= '0' && *s <= '9')
						ch = (ch * 16) + (*s - '0');
					else if (*s >= 'a' && *s <= 'f')
						ch = (ch * 16) + (*s - 'a') + 10;
					else if (*s >= 'A' && *s <= 'F')
						ch = (ch * 16) + (*s - 'A') + 10;
					else
					{
						lex->token_terminator = s + pg_mblen(s);
						ereport(ERROR,
								(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
								 errmsg("invalid input syntax for type json"),
								 errdetail("\"\\u\" must be followed by four hexadecimal digits."),
								 report_json_context(lex)));
					}
				}
				if (lex->strval != NULL)
				{
					char		utf8str[5];
					int			utf8len;

					if (ch >= 0xd800 && ch <= 0xdbff)
					{
						if (hi_surrogate != -1)
							ereport(ERROR,
							   (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
								errmsg("invalid input syntax for type json"),
								errdetail("Unicode high surrogate must not follow a high surrogate."),
								report_json_context(lex)));
						hi_surrogate = (ch & 0x3ff) << 10;
						continue;
					}
					else if (ch >= 0xdc00 && ch <= 0xdfff)
					{
						if (hi_surrogate == -1)
							ereport(ERROR,
							   (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
								errmsg("invalid input syntax for type json"),
								errdetail("Unicode low surrogate must follow a high surrogate."),
								report_json_context(lex)));
						ch = 0x10000 + hi_surrogate + (ch & 0x3ff);
						hi_surrogate = -1;
					}

					if (hi_surrogate != -1)
						ereport(ERROR,
								(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
								 errmsg("invalid input syntax for type json"),
								 errdetail("Unicode low surrogate must follow a high surrogate."),
								 report_json_context(lex)));

					/*
					 * For UTF8, replace the escape sequence by the actual utf8
					 * character in lex->strval. Do this also for other encodings
					 * if the escape designates an ASCII character, otherwise
					 * raise an error. We don't ever unescape a \u0000, since that
					 * would result in an impermissible nul byte.
					 */

					if (ch == 0)
					{
						appendStringInfoString(lex->strval, "\\u0000");
					}
					else if (GetDatabaseEncoding() == PG_UTF8)
					{
						unicode_to_utf8(ch, (unsigned char *) utf8str);
						utf8len = pg_utf_mblen((unsigned char *) utf8str);
						appendBinaryStringInfo(lex->strval, utf8str, utf8len);
					}
					else if (ch <= 0x007f)
					{
						/*
						 * This is the only way to designate things like a form feed
						 * character in JSON, so it's useful in all encodings.
						 */
						appendStringInfoChar(lex->strval, (char) ch);
					}
					else
					{
						ereport(ERROR,
								(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
								 errmsg("invalid input syntax for type json"),
								 errdetail("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8."),
								 report_json_context(lex)));
					}

				}
			}
			else if (lex->strval != NULL)
			{
				if (hi_surrogate != -1)
					ereport(ERROR,
							(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
							 errmsg("invalid input syntax for type json"),
							 errdetail("Unicode low surrogate must follow a high surrogate."),
							 report_json_context(lex)));

				switch (*s)
				{
					case '"':
					case '\\':
					case '/':
						appendStringInfoChar(lex->strval, *s);
						break;
					case 'b':
						appendStringInfoChar(lex->strval, '\b');
						break;
					case 'f':
						appendStringInfoChar(lex->strval, '\f');
						break;
					case 'n':
						appendStringInfoChar(lex->strval, '\n');
						break;
					case 'r':
						appendStringInfoChar(lex->strval, '\r');
						break;
					case 't':
						appendStringInfoChar(lex->strval, '\t');
						break;
					default:
						/* Not a valid string escape, so error out. */
						lex->token_terminator = s + pg_mblen(s);
						ereport(ERROR,
								(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
								 errmsg("invalid input syntax for type json"),
							errdetail("Escape sequence \"\\%s\" is invalid.",
									  extract_mb_char(s)),
								 report_json_context(lex)));
				}
			}
			else if (strchr("\"\\/bfnrt", *s) == NULL)
			{
				/*
				 * Simpler processing if we're not bothered about de-escaping
				 *
				 * It's very tempting to remove the strchr() call here and
				 * replace it with a switch statement, but testing so far has
				 * shown it's not a performance win.
				 */
				lex->token_terminator = s + pg_mblen(s);
				ereport(ERROR,
						(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
						 errmsg("invalid input syntax for type json"),
						 errdetail("Escape sequence \"\\%s\" is invalid.",
								   extract_mb_char(s)),
						 report_json_context(lex)));
			}

		}
		else if (lex->strval != NULL)
		{
			if (hi_surrogate != -1)
				ereport(ERROR,
						(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
						 errmsg("invalid input syntax for type json"),
						 errdetail("Unicode low surrogate must follow a high surrogate."),
						 report_json_context(lex)));

			appendStringInfoChar(lex->strval, *s);
		}

	}

	if (hi_surrogate != -1)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
				 errmsg("invalid input syntax for type json"),
		errdetail("Unicode low surrogate must follow a high surrogate."),
				 report_json_context(lex)));

	/* Hooray, we found the end of the string! */
	lex->prev_token_terminator = lex->token_terminator;
	lex->token_terminator = s + 1;
}

Example #9

0

Show file

File: conv.c Project: markwkm/postgres

/*
 * UTF8 ---> local code
 *
 * utf: input UTF8 string (need not be null-terminated).
 * iso: pointer to the output area (must be large enough!)
 * map: the conversion map.
 * cmap: the conversion map for combined characters.
 *		  (optional)
 * size1: the size of the conversion map.
 * size2: the size of the conversion map for combined characters
 *		  (optional)
 * encoding: the PG identifier for the local encoding.
 * len: length of input string.
 */
void
UtfToLocal(const unsigned char *utf, unsigned char *iso,
		   const pg_utf_to_local *map, const pg_utf_to_local_combined *cmap,
		   int size1, int size2, int encoding, int len)
{
	uint32		iutf;
	uint32		cutf[2];
	uint32		code;
	pg_utf_to_local *p;
	pg_utf_to_local_combined *cp;
	int			l;

	for (; len > 0; len -= l)
	{
		/* "break" cases all represent errors */
		if (*utf == '\0')
			break;

		l = pg_utf_mblen(utf);

		if (len < l)
			break;

		if (!pg_utf8_islegal(utf, l))
			break;

		if (l == 1)
		{
			/* ASCII case is easy */
			*iso++ = *utf++;
			continue;
		}
		else if (l == 2)
		{
			iutf = *utf++ << 8;
			iutf |= *utf++;
		}
		else if (l == 3)
		{
			iutf = *utf++ << 16;
			iutf |= *utf++ << 8;
			iutf |= *utf++;
		}
		else if (l == 4)
		{
			iutf = *utf++ << 24;
			iutf |= *utf++ << 16;
			iutf |= *utf++ << 8;
			iutf |= *utf++;
		}

		/*
		 * first, try with combined map if possible
		 */
		if (cmap && len > l)
		{
			const unsigned char *utf_save = utf;
			int			len_save = len;
			int			l_save = l;

			len -= l;

			l = pg_utf_mblen(utf);
			if (len < l)
				break;

			if (!pg_utf8_islegal(utf, l))
				break;

			cutf[0] = iutf;

			if (l == 1)
			{
				if (len_save > 1)
				{
					p = bsearch(&cutf[0], map, size1,
								sizeof(pg_utf_to_local), compare1);
					if (p == NULL)
						report_untranslatable_char(PG_UTF8, encoding,
							   (const char *) (utf_save - l_save), len_save);
					iso = set_iso_code(iso, p->code);
				}

				/* ASCII case is easy */
				*iso++ = *utf++;
				continue;
			}
			else if (l == 2)
			{
				iutf = *utf++ << 8;
				iutf |= *utf++;
			}
			else if (l == 3)
			{
				iutf = *utf++ << 16;
				iutf |= *utf++ << 8;
				iutf |= *utf++;
			}
			else if (l == 4)
			{
				iutf = *utf++ << 24;
				iutf |= *utf++ << 16;
				iutf |= *utf++ << 8;
				iutf |= *utf++;
			}

			cutf[1] = iutf;
			cp = bsearch(cutf, cmap, size2,
						 sizeof(pg_utf_to_local_combined), compare3);
			if (cp)
				code = cp->code;
			else
			{
				/* not found in combined map. try with ordinary map */
				p = bsearch(&cutf[0], map, size1,
							sizeof(pg_utf_to_local), compare1);
				if (p == NULL)
					report_untranslatable_char(PG_UTF8, encoding,
							   (const char *) (utf_save - l_save), len_save);
				iso = set_iso_code(iso, p->code);

				p = bsearch(&cutf[1], map, size1,
							sizeof(pg_utf_to_local), compare1);
				if (p == NULL)
					report_untranslatable_char(PG_UTF8, encoding,
											   (const char *) (utf - l), len);
				code = p->code;
			}
		}
		else	/* no cmap or no remaining data */
		{
			p = bsearch(&iutf, map, size1,
						sizeof(pg_utf_to_local), compare1);
			if (p == NULL)
				report_untranslatable_char(PG_UTF8, encoding,
										   (const char *) (utf - l), len);
			code = p->code;
		}
		iso = set_iso_code(iso, code);
	}

	if (len > 0)
		report_invalid_encoding(PG_UTF8, (const char *) utf, len);

	*iso = '\0';
}

Example #10

0

Show file

File: conv.c Project: ArgenBarbie/postgresql-9.5.0

/*
 * UTF8 ---> local code
 *
 * utf: input string in UTF8 encoding (need not be null-terminated)
 * len: length of input string (in bytes)
 * iso: pointer to the output area (must be large enough!)
		  (output string will be null-terminated)
 * map: conversion map for single characters
 * mapsize: number of entries in the conversion map
 * cmap: conversion map for combined characters
 *		  (optional, pass NULL if none)
 * cmapsize: number of entries in the conversion map for combined characters
 *		  (optional, pass 0 if none)
 * conv_func: algorithmic encoding conversion function
 *		  (optional, pass NULL if none)
 * encoding: PG identifier for the local encoding
 *
 * For each character, the cmap (if provided) is consulted first; if no match,
 * the map is consulted next; if still no match, the conv_func (if provided)
 * is applied.  An error is raised if no match is found.
 *
 * See pg_wchar.h for more details about the data structures used here.
 */
void
UtfToLocal(const unsigned char *utf, int len,
		   unsigned char *iso,
		   const pg_utf_to_local *map, int mapsize,
		   const pg_utf_to_local_combined *cmap, int cmapsize,
		   utf_local_conversion_func conv_func,
		   int encoding)
{
	uint32		iutf;
	int			l;
	const pg_utf_to_local *p;
	const pg_utf_to_local_combined *cp;

	if (!PG_VALID_ENCODING(encoding))
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("invalid encoding number: %d", encoding)));

	for (; len > 0; len -= l)
	{
		/* "break" cases all represent errors */
		if (*utf == '\0')
			break;

		l = pg_utf_mblen(utf);
		if (len < l)
			break;

		if (!pg_utf8_islegal(utf, l))
			break;

		if (l == 1)
		{
			/* ASCII case is easy, assume it's one-to-one conversion */
			*iso++ = *utf++;
			continue;
		}

		/* collect coded char of length l */
		if (l == 2)
		{
			iutf = *utf++ << 8;
			iutf |= *utf++;
		}
		else if (l == 3)
		{
			iutf = *utf++ << 16;
			iutf |= *utf++ << 8;
			iutf |= *utf++;
		}
		else if (l == 4)
		{
			iutf = *utf++ << 24;
			iutf |= *utf++ << 16;
			iutf |= *utf++ << 8;
			iutf |= *utf++;
		}
		else
		{
			elog(ERROR, "unsupported character length %d", l);
			iutf = 0;			/* keep compiler quiet */
		}

		/* First, try with combined map if possible */
		if (cmap && len > l)
		{
			const unsigned char *utf_save = utf;
			int			len_save = len;
			int			l_save = l;

			/* collect next character, same as above */
			len -= l;

			l = pg_utf_mblen(utf);
			if (len < l)
				break;

			if (!pg_utf8_islegal(utf, l))
				break;

			/* We assume ASCII character cannot be in combined map */
			if (l > 1)
			{
				uint32		iutf2;
				uint32		cutf[2];

				if (l == 2)
				{
					iutf2 = *utf++ << 8;
					iutf2 |= *utf++;
				}
				else if (l == 3)
				{
					iutf2 = *utf++ << 16;
					iutf2 |= *utf++ << 8;
					iutf2 |= *utf++;
				}
				else if (l == 4)
				{
					iutf2 = *utf++ << 24;
					iutf2 |= *utf++ << 16;
					iutf2 |= *utf++ << 8;
					iutf2 |= *utf++;
				}
				else
				{
					elog(ERROR, "unsupported character length %d", l);
					iutf2 = 0;	/* keep compiler quiet */
				}

				cutf[0] = iutf;
				cutf[1] = iutf2;

				cp = bsearch(cutf, cmap, cmapsize,
							 sizeof(pg_utf_to_local_combined), compare3);

				if (cp)
				{
					iso = store_coded_char(iso, cp->code);
					continue;
				}
			}

			/* fail, so back up to reprocess second character next time */
			utf = utf_save;
			len = len_save;
			l = l_save;
		}

		/* Now check ordinary map */
		p = bsearch(&iutf, map, mapsize,
					sizeof(pg_utf_to_local), compare1);

		if (p)
		{
			iso = store_coded_char(iso, p->code);
			continue;
		}

		/* if there's a conversion function, try that */
		if (conv_func)
		{
			uint32		converted = (*conv_func) (iutf);

			if (converted)
			{
				iso = store_coded_char(iso, converted);
				continue;
			}
		}

		/* failed to translate this character */
		report_untranslatable_char(PG_UTF8, encoding,
								   (const char *) (utf - l), len);
	}

	/* if we broke out of loop early, must be invalid input */
	if (len > 0)
		report_invalid_encoding(PG_UTF8, (const char *) utf, len);

	*iso = '\0';
}