Exemple #1
0
/*
	Titlecase conversion of UTF-8 text.

	Case mapping starts out with the titlecase tables. After each step the
	general category of the code point that was just handled selects the
	tables for the next step: once a letter has been seen the lowercase
	tables are active, once a non-letter has been seen the titlecase tables
	are active again.

	Returns the running sum of the values reported by casemapping_execute.
	Through UTF8_SET_ERROR, `errors` receives NOT_ENOUGH_SPACE when a step
	reports zero and NONE otherwise (also when initialization fails).
	UTF8_VALIDATE_PARAMETERS_CHAR is defined outside this block; presumably
	it can return early on bad arguments - confirm against its definition.
*/
size_t utf8totitle(const char* input, size_t inputSize, char* target, size_t targetSize, int32_t* errors)
{
	CaseMappingState state;
	size_t total = 0;

	/* Argument checks are delegated to the validation macro */

	UTF8_VALIDATE_PARAMETERS_CHAR(char, total);

	/* Only walk the input when the mapping state could be set up */

	if (casemapping_initialize(
		&state,
		input, inputSize,
		target, targetSize,
		TitlecaseIndex1Ptr, TitlecaseIndex2Ptr, TitlecaseDataPtr))
	{
		while (state.src_size > 0)
		{
			int in_titlecase;
			int was_letter;
			size_t produced = casemapping_execute(&state);

			if (produced == 0)
			{
				/* A step that yields nothing is reported as lack of space */

				UTF8_SET_ERROR(NOT_ENOUGH_SPACE);

				return total;
			}

			in_titlecase = (state.property_data == TitlecaseDataPtr);
			was_letter = ((state.last_general_category & GeneralCategory_Letter) != 0);

			if (in_titlecase && was_letter)
			{
				/* Word has started: remaining letters are lowercased */

				state.property_index1 = LowercaseIndex1Ptr;
				state.property_index2 = LowercaseIndex2Ptr;
				state.property_data = LowercaseDataPtr;
			}
			else if (!in_titlecase && !was_letter)
			{
				/* Word has ended: next letter is titlecased again */

				state.property_index1 = TitlecaseIndex1Ptr;
				state.property_index2 = TitlecaseIndex2Ptr;
				state.property_data = TitlecaseDataPtr;
			}

			total += produced;
		}
	}

	/* Reached after a full conversion and after a failed initialization */

	UTF8_SET_ERROR(NONE);

	return total;
}
Exemple #2
0
/*
	Convert UTF-8 text to UTF-32.

	Each iteration decodes one code point with codepoint_read and, when a
	target buffer was supplied, stores it as a single unicode_t. When
	`target` is null nothing is stored but the byte count is still
	accumulated.

	Returns the number of bytes accounted for (sizeof(unicode_t) per decoded
	code point). Through UTF8_SET_ERROR, `errors` receives:
	- NOT_ENOUGH_SPACE when the target cannot hold the next code point,
	- INVALID_DATA when codepoint_read reports that it consumed nothing,
	- NONE when all input was processed.
	UTF8_VALIDATE_PARAMETERS is defined outside this block; presumably it can
	return early on bad arguments - confirm against its definition.
*/
size_t utf8toutf32(const char* input, size_t inputSize, unicode_t* target, size_t targetSize, int32_t* errors)
{
	const char* src;
	size_t src_size;
	unicode_t* dst;
	size_t dst_size;
	size_t bytes_written = 0;

	/* Validate parameters */

	UTF8_VALIDATE_PARAMETERS(char, unicode_t, bytes_written);

	/* Setup cursors */

	src = input;
	src_size = inputSize;
	dst = target;
	dst_size = targetSize;

	/* Loop over input */

	while (src_size > 0)
	{
		unicode_t decoded;
		uint8_t decoded_length = codepoint_read(src, src_size, &decoded);

		if (decoded_length == 0)
		{
			/*
				Nothing was consumed, so the cursor cannot advance and
				`decoded` may not have been set. Stop instead of spinning
				on the same byte forever.
			*/

			UTF8_SET_ERROR(INVALID_DATA);

			return bytes_written;
		}

		if (dst != 0)
		{
			/* Write to output */

			if (dst_size < sizeof(unicode_t))
			{
				UTF8_SET_ERROR(NOT_ENOUGH_SPACE);

				return bytes_written;
			}

			*dst++ = decoded;
			dst_size -= sizeof(unicode_t);
		}

		bytes_written += sizeof(unicode_t);

		/*
			Move source cursor. The remaining size is unsigned, so a
			reported length beyond what is left must not be subtracted:
			it would wrap around and keep the loop reading past the input.
		*/

		if (src_size >= decoded_length)
		{
			src += decoded_length;
			src_size -= decoded_length;
		}
		else
		{
			src_size = 0;
		}
	}

	UTF8_SET_ERROR(NONE);

	return bytes_written;
}
Exemple #3
0
/*
	Lowercase conversion of UTF-8 text.

	Sets up a case mapping state with the lowercase tables and runs
	casemapping_execute until the state reports no remaining source bytes.

	Returns the running sum of the values reported by casemapping_execute.
	Through UTF8_SET_ERROR, `errors` receives NOT_ENOUGH_SPACE when a step
	reports zero and NONE otherwise (also when initialization fails).
	UTF8_VALIDATE_PARAMETERS_CHAR is defined outside this block; presumably
	it can return early on bad arguments - confirm against its definition.
*/
size_t utf8tolower(const char* input, size_t inputSize, char* target, size_t targetSize, int32_t* errors)
{
	CaseMappingState state;
	size_t total = 0;

	/* Argument checks are delegated to the validation macro */

	UTF8_VALIDATE_PARAMETERS_CHAR(char, total);

	/* Only walk the input when the mapping state could be set up */

	if (casemapping_initialize(
		&state,
		input, inputSize,
		target, targetSize,
		LowercaseIndex1Ptr, LowercaseIndex2Ptr, LowercaseDataPtr))
	{
		while (state.src_size > 0)
		{
			size_t produced = casemapping_execute(&state);

			if (produced == 0)
			{
				/* A step that yields nothing is reported as lack of space */

				UTF8_SET_ERROR(NOT_ENOUGH_SPACE);

				return total;
			}

			total += produced;
		}
	}

	/* Reached after a full conversion and after a failed initialization */

	UTF8_SET_ERROR(NONE);

	return total;
}
Exemple #4
0
/*
	Locale-aware lowercase conversion of UTF-8 text.

	Sets up a case mapping state with the lowercase tables, the lowercase
	quick-check flag and the caller's locale, then runs casemapping_execute
	until the state reports no remaining source bytes.

	Returns state.total_bytes_needed, which grows by the value of every
	successful step. `errors` is handed to casemapping_initialize and
	casemapping_execute, so on their failure this function does not touch
	it; NONE is only stored here after the whole input was processed.
	UTF8_VALIDATE_PARAMETERS_CHAR is defined outside this block; presumably
	it can return early on bad arguments - confirm against its definition.
*/
size_t utf8tolower(const char* input, size_t inputSize, char* target, size_t targetSize, size_t locale, int32_t* errors)
{
	CaseMappingState state;

	/* Argument checks are delegated to the validation macro */

	UTF8_VALIDATE_PARAMETERS_CHAR(char, 0);

	/* Only walk the input when the mapping state could be set up */

	if (casemapping_initialize(
		&state,
		input, inputSize,
		target, targetSize,
		LowercaseIndex1Ptr, LowercaseIndex2Ptr, LowercaseDataPtr,
		QuickCheckCaseMapped_Lowercase, locale,
		errors))
	{
		size_t step;

		while (state.src_size > 0)
		{
			step = casemapping_execute(&state, errors);
			if (step == 0)
			{
				/* The callee already had the chance to report the error */

				return state.total_bytes_needed;
			}

			state.total_bytes_needed += step;
		}

		UTF8_SET_ERROR(NONE);
	}

	return state.total_bytes_needed;
}
Exemple #5
0
/*
	Convert UTF-16 text to UTF-8.

	Code units outside the surrogate range are encoded as they are. A high
	surrogate followed by a low surrogate is combined into one code point.
	A low surrogate without a preceding high surrogate, or a high surrogate
	that is not followed by a low surrogate, is encoded as
	REPLACEMENT_CHARACTER and flagged as INVALID_DATA while the conversion
	carries on.

	When the input ends in the middle of a code unit or of a surrogate pair,
	the replacement string is emitted (if a target is present) and the
	function returns with INVALID_DATA.

	Returns the sum of the sizes reported by codepoint_write, plus
	REPLACEMENT_CHARACTER_STRING_LENGTH on the truncated-input path.
	`errors` is reset to NONE right after validation and only overwritten
	by the failure cases above or by NOT_ENOUGH_SPACE.
	UTF8_VALIDATE_PARAMETERS_CHAR is defined outside this block; presumably
	it can return early on bad arguments - confirm against its definition.
*/
size_t utf16toutf8(const utf16_t* input, size_t inputSize, char* target, size_t targetSize, int32_t* errors)
{
	const utf16_t* cursor;
	size_t remaining;
	char* out;
	size_t out_size;
	size_t total = 0;

	/* Argument checks first, then start from a clean error state */

	UTF8_VALIDATE_PARAMETERS_CHAR(utf16_t, total);
	UTF8_SET_ERROR(NONE);

	cursor = input;
	remaining = inputSize;
	out = target;
	out_size = targetSize;

	while (remaining > 0)
	{
		unicode_t codepoint;
		uint8_t encoded_size;
		size_t units = 1; /* code units consumed by this iteration */

		if (remaining < sizeof(utf16_t))
		{
			/* Trailing bytes do not form a whole code unit */

			goto truncated;
		}

		codepoint = (unicode_t)*cursor;

		if (codepoint >= SURROGATE_HIGH_START &&
			codepoint <= SURROGATE_LOW_END)
		{
			if (codepoint <= SURROGATE_HIGH_END)
			{
				/* High surrogate: a second code unit is required */

				if (remaining < 2 * sizeof(utf16_t))
				{
					goto truncated;
				}

				if (cursor[1] >= SURROGATE_LOW_START &&
					cursor[1] <= SURROGATE_LOW_END)
				{
					/* Combine both halves into one code point */

					codepoint =
						(MAX_BASIC_MULTILINGUAL_PLANE + 1) +
						(cursor[1] - SURROGATE_LOW_START) +
						((cursor[0] - SURROGATE_HIGH_START) << 10);

					units = 2;
				}
				else
				{
					/* High surrogate without its low partner */

					codepoint = REPLACEMENT_CHARACTER;

					UTF8_SET_ERROR(INVALID_DATA);
				}
			}
			else
			{
				/* Low surrogate without a preceding high surrogate */

				codepoint = REPLACEMENT_CHARACTER;

				UTF8_SET_ERROR(INVALID_DATA);
			}
		}

		encoded_size = codepoint_write(codepoint, &out, &out_size);
		if (encoded_size == 0)
		{
			UTF8_SET_ERROR(NOT_ENOUGH_SPACE);

			return total;
		}

		total += encoded_size;

		/* Step over everything this iteration consumed */

		cursor += units;
		remaining -= units * sizeof(utf16_t);
	}

	return total;

truncated:
	if (out != 0)
	{
		if (out_size >= REPLACEMENT_CHARACTER_STRING_LENGTH)
		{
			/* Stand-in for the incomplete trailing data */

			memcpy(out, REPLACEMENT_CHARACTER_STRING, REPLACEMENT_CHARACTER_STRING_LENGTH);
		}
		else
		{
			UTF8_SET_ERROR(NOT_ENOUGH_SPACE);

			return total;
		}
	}

	UTF8_SET_ERROR(INVALID_DATA);

	return total + REPLACEMENT_CHARACTER_STRING_LENGTH;
}
Exemple #6
0
/*
	Locale-aware case folding of UTF-8 text.

	Every code point is read with codepoint_read. When its case-mapping
	quick-check property has the Casefolded bit set, a replacement string is
	looked up with database_querydecomposition; otherwise the code point is
	written back unchanged with codepoint_write.

	For the UTF8_LOCALE_TURKISH_AND_AZERI_LATIN locale two code points bypass
	the table lookup: CP_LATIN_CAPITAL_LETTER_I folds to the two bytes
	C4 B1 (U+0131) and CP_LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE folds to "i".

	When state.dst is null nothing is copied, but the sizes are still added
	to the total.

	Returns state.total_bytes_needed. Through UTF8_SET_ERROR, `errors`
	receives INVALID_DATA when codepoint_read reports zero,
	NOT_ENOUGH_SPACE when the output cannot take the next piece, and NONE
	when all input was processed. A failed casemapping_initialize returns
	without touching `errors` here (it is handed `errors` itself).
	UTF8_VALIDATE_PARAMETERS_CHAR is defined outside this block; presumably
	it can return early on bad arguments - confirm against its definition.
*/
size_t utf8casefold(const char* input, size_t inputSize, char* target, size_t targetSize, size_t locale, int32_t* errors)
{
	CaseMappingState state;
	int turkic;

	/* Argument checks are delegated to the validation macro */

	UTF8_VALIDATE_PARAMETERS_CHAR(char, 0);

	/* Set up the state with the case folding tables */

	if (!casemapping_initialize(
		&state,
		input, inputSize,
		target, targetSize,
		CaseFoldingIndex1Ptr, CaseFoldingIndex2Ptr, CaseFoldingDataPtr,
		QuickCheckCaseMapped_Casefolded, locale,
		errors))
	{
		return state.total_bytes_needed;
	}

	/* The locale is decided once; it selects the dotted/dotless I rules */

	turkic = (state.locale == UTF8_LOCALE_TURKISH_AND_AZERI_LATIN);

	while (state.src_size > 0)
	{
		const char* folded = 0;
		uint8_t folded_size = 0;

		/* Decode the next code point */

		state.last_code_point_size = codepoint_read(state.src, state.src_size, &state.last_code_point);
		if (!state.last_code_point_size)
		{
			goto invaliddata;
		}

		/* Advance, never stepping beyond what is left of the input */

		if (state.src_size >= state.last_code_point_size)
		{
			state.src += state.last_code_point_size;
			state.src_size -= state.last_code_point_size;
		}
		else
		{
			state.src_size = 0;
		}

		/* Look up a folding only for code points flagged as foldable */

		if ((PROPERTY_GET_CM(state.last_code_point) & QuickCheckCaseMapped_Casefolded) != 0)
		{
			if (turkic &&
				state.last_code_point == CP_LATIN_CAPITAL_LETTER_I)
			{
				folded = "\xC4\xB1";
				folded_size = 2;
			}
			else if (
				turkic &&
				state.last_code_point == CP_LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
			{
				folded = "i";
				folded_size = 1;
			}
			else
			{
				folded = database_querydecomposition(state.last_code_point, state.property_index1, state.property_index2, state.property_data, &folded_size);
			}
		}

		if (folded == 0)
		{
			/* No folding available: emit the code point as it was read */

			folded_size = codepoint_write(state.last_code_point, &state.dst, &state.dst_size);
			if (!folded_size)
			{
				goto outofspace;
			}
		}
		else if (state.dst != 0)
		{
			/* Copy the folded bytes into the output */

			if (state.dst_size < folded_size)
			{
				goto outofspace;
			}

			memcpy(state.dst, folded, folded_size);

			state.dst += folded_size;
			state.dst_size -= folded_size;
		}

		state.total_bytes_needed += folded_size;
	}

	UTF8_SET_ERROR(NONE);

	return state.total_bytes_needed;

invaliddata:
	UTF8_SET_ERROR(INVALID_DATA);

	return state.total_bytes_needed;

outofspace:
	UTF8_SET_ERROR(NOT_ENOUGH_SPACE);

	return state.total_bytes_needed;
}
Exemple #7
0
/*
	Locale-aware titlecase conversion of UTF-8 text.

	Case mapping starts out with the titlecase tables and quick-check flag.
	After each step, and only when the code point just handled has canonical
	combining class CCC_NOT_REORDERED, its general category selects the
	tables for the next step: after a letter the lowercase tables and flag
	are active, after a non-letter the titlecase tables and flag are active.
	Code points with any other combining class leave the selection as is.

	Returns state.total_bytes_needed, which grows by the value of every
	successful step. `errors` is handed to casemapping_initialize and
	casemapping_execute, so on their failure this function does not touch
	it; NONE is only stored here after the whole input was processed.
	UTF8_VALIDATE_PARAMETERS_CHAR is defined outside this block; presumably
	it can return early on bad arguments - confirm against its definition.
*/
size_t utf8totitle(const char* input, size_t inputSize, char* target, size_t targetSize, size_t locale, int32_t* errors)
{
	CaseMappingState state;

	/* Argument checks are delegated to the validation macro */

	UTF8_VALIDATE_PARAMETERS_CHAR(char, 0);

	/* Only walk the input when the mapping state could be set up */

	if (casemapping_initialize(
		&state,
		input, inputSize,
		target, targetSize,
		TitlecaseIndex1Ptr, TitlecaseIndex2Ptr, TitlecaseDataPtr,
		QuickCheckCaseMapped_Titlecase, locale,
		errors))
	{
		while (state.src_size > 0)
		{
			size_t step = casemapping_execute(&state, errors);

			if (step == 0)
			{
				/* The callee already had the chance to report the error */

				return state.total_bytes_needed;
			}

			if (state.last_canonical_combining_class == CCC_NOT_REORDERED)
			{
				const int in_titlecase = (state.property_data == TitlecaseDataPtr);
				const int was_letter = ((state.last_general_category & UTF8_CATEGORY_LETTER) != 0);

				if (in_titlecase && was_letter)
				{
					/* Word has started: remaining letters are lowercased */

					state.property_index1 = LowercaseIndex1Ptr;
					state.property_index2 = LowercaseIndex2Ptr;
					state.property_data = LowercaseDataPtr;

					state.quickcheck_flags = QuickCheckCaseMapped_Lowercase;
				}
				else if (!in_titlecase && !was_letter)
				{
					/* Word has ended: next letter is titlecased again */

					state.property_index1 = TitlecaseIndex1Ptr;
					state.property_index2 = TitlecaseIndex2Ptr;
					state.property_data = TitlecaseDataPtr;

					state.quickcheck_flags = QuickCheckCaseMapped_Titlecase;
				}
			}

			state.total_bytes_needed += step;
		}

		UTF8_SET_ERROR(NONE);
	}

	return state.total_bytes_needed;
}
Exemple #8
0
/*
	Convert UTF-8 text to UTF-16.

	Each iteration decodes one code point with codepoint_read. Values up to
	MAX_BASIC_MULTILINGUAL_PLANE take one utf16_t; larger values are split
	into a high and a low surrogate. When `target` is null nothing is stored
	but the byte count is still accumulated.

	Returns the number of bytes accounted for. Through UTF8_SET_ERROR,
	`errors` receives:
	- NOT_ENOUGH_SPACE when the target cannot hold the next code point,
	- INVALID_DATA when codepoint_read reports that it consumed nothing,
	- NONE when all input was processed.
	UTF8_VALIDATE_PARAMETERS is defined outside this block; presumably it can
	return early on bad arguments - confirm against its definition.
*/
size_t utf8toutf16(const char* input, size_t inputSize, utf16_t* target, size_t targetSize, int32_t* errors)
{
	const char* src;
	size_t src_size;
	utf16_t* dst;
	size_t dst_size;
	size_t bytes_written = 0;

	/* Validate parameters */

	UTF8_VALIDATE_PARAMETERS(char, utf16_t, bytes_written);

	/* Setup cursors */

	src = input;
	src_size = inputSize;
	dst = target;
	dst_size = targetSize;

	/* Loop over input */

	while (src_size > 0)
	{
		unicode_t decoded;
		uint8_t decoded_size = codepoint_read(src, src_size, &decoded);

		if (decoded_size == 0)
		{
			/*
				Nothing was consumed, so the cursor cannot advance and
				`decoded` may not have been set. Stop instead of spinning
				on the same byte forever.
			*/

			UTF8_SET_ERROR(INVALID_DATA);

			return bytes_written;
		}

		if (decoded <= MAX_BASIC_MULTILINGUAL_PLANE)
		{
			/* Codepoint fits in a single UTF-16 codepoint */

			if (dst != 0)
			{
				/* Write to output */

				if (dst_size < sizeof(utf16_t))
				{
					UTF8_SET_ERROR(NOT_ENOUGH_SPACE);

					return bytes_written;
				}

				*dst++ = (utf16_t)decoded;
				dst_size -= sizeof(utf16_t);
			}

			bytes_written += sizeof(utf16_t);
		}
		else
		{
			/* Codepoint must be encoded using a surrogate pair */

			if (dst != 0)
			{
				/* Write to output */

				if (dst_size < 2 * sizeof(utf16_t))
				{
					UTF8_SET_ERROR(NOT_ENOUGH_SPACE);

					return bytes_written;
				}

				/* Encoded value is always beyond BMP */

				decoded -= (MAX_BASIC_MULTILINGUAL_PLANE + 1);
				*dst++ = (utf16_t)(SURROGATE_HIGH_START + (decoded >> 10));
				*dst++ = (utf16_t)(SURROGATE_LOW_START + (decoded & 0x03FF));

				dst_size -= 2 * sizeof(utf16_t);
			}

			bytes_written += 2 * sizeof(utf16_t);
		}

		/*
			Move source cursor. The remaining size is unsigned, so a
			reported length beyond what is left must not be subtracted:
			it would wrap around and keep the loop reading past the input.
		*/

		if (src_size >= decoded_size)
		{
			src += decoded_size;
			src_size -= decoded_size;
		}
		else
		{
			src_size = 0;
		}
	}

	UTF8_SET_ERROR(NONE);

	return bytes_written;
}
Exemple #9
0
/*
	Normalize UTF-8 text by decomposing it and, optionally, composing it
	again.

	`flags` must contain UTF8_NORMALIZE_DECOMPOSE and/or
	UTF8_NORMALIZE_COMPOSE; otherwise INVALID_FLAG is reported and zero is
	returned. This flag check runs before the parameter validation macro.
	UTF8_NORMALIZE_COMPATIBILITY is forwarded to both the decomposition and
	the composition state as a boolean.

	Returns the sum of the sizes reported by codepoint_write. Through
	UTF8_SET_ERROR, `errors` receives INVALID_FLAG, INVALID_DATA (when one of
	the initialize calls fails), NOT_ENOUGH_SPACE (when codepoint_write
	reports zero) or NONE.
	UTF8_VALIDATE_PARAMETERS_CHAR is defined outside this block; presumably
	it can return early on bad arguments - confirm against its definition.
*/
size_t utf8normalize(const char* input, size_t inputSize, char* target, size_t targetSize, size_t flags, int32_t* errors)
{
	char* dst = target;
	size_t dst_size = targetSize;
	StreamState stream[4];
	DecomposeState decompose_state;
	ComposeState compose_state;
	uint8_t compatibility = (flags & UTF8_NORMALIZE_COMPATIBILITY) != 0;
	StreamState* stream_output;
	uint8_t finished = 0;
	size_t bytes_written = 0;

	/*
		Decomposition uses the following process:

		input         -->  stream[0]  -->
		(decompose)   -->  stream[1]  -->
		(accumulate)  -->  stream[2]  -->
		output

		The accumulation step is necessary in order to prevent buffer overflow
		attacks.

		Composition adds another stream buffer:

		input         --> stream[0]  -->
		(decompose)   --> stream[1]  -->
		(accumulate)  --> stream[2]  -->
		(compose)     --> stream[3]  -->
		output

		Although four streaming buffers may seem excessive, they are necessary
		for preventing allocations on the heap.
	*/

	/* Check for valid flags */

	if ((flags & (UTF8_NORMALIZE_DECOMPOSE | UTF8_NORMALIZE_COMPOSE)) == 0)
	{
		UTF8_SET_ERROR(INVALID_FLAG);

		return bytes_written;
	}

	/* Validate parameters */

	UTF8_VALIDATE_PARAMETERS_CHAR(char, bytes_written);

	/* Initialize decomposition */

	memset(stream, 0, sizeof(stream));

	if (!stream_initialize(&stream[0], input, inputSize) ||
		!decompose_initialize(&decompose_state, &stream[0], &stream[1], compatibility))
	{
		UTF8_SET_ERROR(INVALID_DATA);

		return bytes_written;
	}

	/* Without composition the accumulation buffer is what gets written out */

	stream_output = &stream[2];

	if ((flags & UTF8_NORMALIZE_COMPOSE) != 0)
	{
		/* Initialize composition */

		if (!compose_initialize(&compose_state, &stream[2], &stream[3], compatibility))
		{
			UTF8_SET_ERROR(INVALID_DATA);

			return bytes_written;
		}

		/* With composition the composed stream is what gets written out */

		stream_output = &stream[3];
	}

	do
	{
		uint8_t write = 0;

		/*
			Accumulate decomposed input in next stream. This appends what the
			previous iteration left in stream[1]; on the first pass the
			streams were zeroed, so nothing is appended.
		*/

		if (stream[1].current > 0)
		{
			unicode_t* src_codepoint = stream[1].codepoint;
			unicode_t* dst_codepoint = stream[2].codepoint + stream[2].filled;
			uint8_t* src_qc = stream[1].quick_check;
			uint8_t* dst_qc = stream[2].quick_check + stream[2].filled;
			uint8_t* src_ccc = stream[1].canonical_combining_class;
			uint8_t* dst_ccc = stream[2].canonical_combining_class + stream[2].filled;

			if ((flags & UTF8_NORMALIZE_COMPOSE) != 0)
			{
				uint8_t i;

				/*
					Update stream properties to use composition values: the
					quick check value is looked up again through the
					composition tables, while the combining class and the
					code point are copied as they are.
				*/

				for (i = 0; i < stream[1].current; ++i)
				{
					*dst_qc++ = PROPERTY_GET(compose_state.qc_index, compose_state.qc_data, *src_codepoint);
					*dst_ccc++ = *src_ccc++;
					*dst_codepoint++ = *src_codepoint++;
				}
			}
			else
			{
				/* Copy directly */

				memcpy(dst_codepoint, src_codepoint, stream[1].current * sizeof(unicode_t));
				memcpy(dst_qc, src_qc, stream[1].current * sizeof(uint8_t));
				memcpy(dst_ccc, src_ccc, stream[1].current * sizeof(uint8_t));
			}

			stream[2].current += stream[1].current;
			stream[2].filled += stream[1].current;
		}

		/*
			Decompose input sequence into next stream. A zero result from
			decompose_execute ends the loop after this iteration.
		*/

		finished = !decompose_execute(&decompose_state);
		if (!finished)
		{
			/* Output current stream if it could overflow accumulation buffer */

			write = (stream[1].current + stream[2].filled) >= STREAM_SAFE_MAX;
		}

		/* Reorder potentially unordered decomposed stream */

		if (!stream[1].stable)
		{
			stream_reorder(&stream[1]);
		}

		/*
			Write stream to output when the accumulation buffer could overflow
			or when decompose_execute returned zero.
		*/

		if (write ||
			finished)
		{
			uint8_t i;

			/*
				Compose accumulation buffer.

				NOTE(review): when compose_execute fails, the break leaves the
				loop and the function then reports NONE below - confirm that
				this is the intended outcome.
			*/

			if ((flags & UTF8_NORMALIZE_COMPOSE) != 0 &&
				!compose_execute(&compose_state))
			{
				break;
			}

			/* Write to output buffer */

			for (i = 0; i < stream_output->current; ++i)
			{
				uint8_t encoded_size = codepoint_write(stream_output->codepoint[i], &dst, &dst_size);
				if (encoded_size == 0)
				{
					UTF8_SET_ERROR(NOT_ENOUGH_SPACE);

					return bytes_written;
				}

				bytes_written += encoded_size;
			}

			/* Reset accumulation buffer */

			stream[2].current = 0;
			stream[2].filled = 0;
		}
	}
	while (!finished);

	UTF8_SET_ERROR(NONE);

	return bytes_written;
}