/*
 * ucs4_partial_scan
 *
 * Scans a leading portion of a UCS-4 buffer, validating each character.
 *
 * On entry:
 *   bounds->bytes  number of bytes available in buf
 *   bounds->chars  maximum number of characters to scan; a negative value
 *                  means no limit beyond the available bytes
 *   bounds->delim  character value at which to stop; the delimiter itself
 *                  is included in the scanned range
 *
 * On return:
 *   bounds->bytes  number of bytes scanned (always a multiple of 4)
 *   bounds->chars  number of characters scanned
 *   bounds->delim  last character examined, or -1 if none was examined
 *
 * Trailing bytes that do not make up a whole 4-byte unit are not examined.
 *
 * Throws EXCEPTION_INVALID_CHARACTER if a character satisfies
 * UNICODE_IS_INVALID. Always returns 0.
 */
static INTVAL
ucs4_partial_scan(PARROT_INTERP, ARGIN(const char *buf),
        ARGMOD(Parrot_String_Bounds *bounds))
{
    ASSERT_ARGS(ucs4_partial_scan)
    const utf32_t * const ptr   = (const utf32_t *)buf;
    /* UCS-4 code units are 4 bytes wide, so the number of whole characters
     * is bytes / 4. This must be the inverse of the "len << 2" below;
     * dividing by 2 would read past the end of the supplied buffer. */
    UINTVAL               len   = bounds->bytes >> 2;
    const INTVAL          chars = bounds->chars;
    const INTVAL          delim = bounds->delim;
    INTVAL                c     = -1;
    UINTVAL               i;

    /* Honour the caller's character limit when it is tighter than the
     * number of characters available. */
    if (chars >= 0 && (UINTVAL)chars < len)
        len = chars;

    for (i = 0; i < len; ++i) {
        c = ptr[i];

        if (UNICODE_IS_INVALID(c))
            Parrot_ex_throw_from_c_noargs(interp, EXCEPTION_INVALID_CHARACTER,
                "Invalid character in UCS-4 string");

        if (c == delim) {
            /* Stop just after the delimiter. */
            len = i + 1;
            break;
        }
    }

    bounds->bytes = len << 2;
    bounds->chars = len;
    bounds->delim = c;

    return 0;
}
/*
 * utf8_scan
 *
 * Walks the bytes of src (src->strstart, src->bufused bytes long),
 * validating them as UTF-8, and returns the number of characters found.
 *
 * Throws EXCEPTION_MALFORMED_UTF8 when
 *   - a multi-byte sequence would extend past the end of the buffer,
 *   - a sequence is flagged by UTF8_IS_OVERLONG,
 *   - a byte following a start byte is not a continuation byte, or
 *   - a byte is neither a start byte nor an invariant (single-byte) value.
 * Throws EXCEPTION_INVALID_CHARACTER when a decoded multi-byte code point
 * satisfies UNICODE_IS_INVALID.
 */
static UINTVAL
utf8_scan(PARROT_INTERP, ARGIN(const STRING *src))
{
    ASSERT_ARGS(utf8_scan)
    const utf8_t * const limit  = (const utf8_t *)(src->strstart + src->bufused);
    const utf8_t        *cursor = (const utf8_t *)src->strstart;
    UINTVAL              total  = 0;

    /* Each iteration consumes exactly one character; the loop increment
     * steps past that character's final byte. */
    for (; cursor < limit; ++cursor, ++total) {
        UINTVAL cp = *cursor;
        size_t  seq_len;
        size_t  k;

        if (!UTF8_IS_START(cp)) {
            /* Single byte: must be an invariant value. */
            if (!UNICODE_IS_INVARIANT(cp)) {
                Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
                    "Malformed UTF-8 string\n");
            }
            continue;
        }

        seq_len = UTF8SKIP(cursor);

        /* The whole sequence has to fit in the buffer. */
        if (cursor + seq_len > limit)
            Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
                "Unaligned end in UTF-8 string\n");

        /* Reject overlong forms */
        if (UTF8_IS_OVERLONG(cp, cursor[1]))
            Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
                "Overlong form in UTF-8 string\n");

        /* Keep only the payload bits of the start byte, then fold in
         * each continuation byte. */
        cp &= UTF8_START_MASK(seq_len);

        for (k = 1; k < seq_len; ++k) {
            ++cursor;

            if (!UTF8_IS_CONTINUATION(*cursor))
                Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
                    "Malformed UTF-8 string\n");

            cp = UTF8_ACCUMULATE(cp, *cursor);
        }

        if (UNICODE_IS_INVALID(cp))
            Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_CHARACTER,
                "Invalid character in UTF-8 string\n");
    }

    return total;
}
/*
 * utf8_encode
 *
 * Writes the UTF-8 encoding of code point c into the buffer at ptr and
 * returns a pointer to the byte just past the encoded sequence. The
 * sequence is UNISKIP(c) bytes long, so the buffer must have at least
 * that much room.
 *
 * Throws EXCEPTION_INVALID_CHARACTER if c satisfies UNICODE_IS_INVALID.
 */
PARROT_CANNOT_RETURN_NULL
static utf8_t *
utf8_encode(PARROT_INTERP, ARGMOD(utf8_t *ptr), UINTVAL c)
{
    ASSERT_ARGS(utf8_encode)
    const UINTVAL   nbytes = UNISKIP(c);
    utf8_t * const  past   = ptr + nbytes;
    utf8_t         *out;

    if (UNICODE_IS_INVALID(c))
        Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_CHARACTER,
            "Invalid character for UTF-8 encoding\n");

    /* Fill the continuation bytes from the last one back towards the
     * start, peeling the low-order bits off c as we go. */
    for (out = past - 1; out > ptr; --out) {
        *out = (c & UTF8_CONTINUATION_MASK) | UTF8_CONTINUATION_MARK;
        c >>= UTF8_ACCUMULATION_SHIFT;
    }

    /* What is left of c goes into the start byte along with the
     * length marker. */
    *out = (c & UTF8_START_MASK(nbytes)) | UTF8_START_MARK(nbytes);

    return past;
}
/*
 * ucs4_scan
 *
 * Validates the contents of src as UCS-4 and stores the resulting
 * character count (src->bufused / 4) in src->strlen.
 *
 * Throws EXCEPTION_INVALID_CHARACTER if src->bufused is not a multiple
 * of 4, or if any character satisfies UNICODE_IS_INVALID.
 */
static void
ucs4_scan(PARROT_INTERP, ARGMOD(STRING *src))
{
    ASSERT_ARGS(ucs4_scan)
    const utf32_t * const first  = (utf32_t *)src->strstart;
    const UINTVAL         nchars = src->bufused >> 2;
    const utf32_t * const stop   = first + nchars;
    const utf32_t        *cur;

    /* A well-formed buffer holds a whole number of 4-byte units. */
    if ((src->bufused & 3) != 0)
        Parrot_ex_throw_from_c_noargs(interp, EXCEPTION_INVALID_CHARACTER,
            "Unaligned end in UCS-4 string");

    for (cur = first; cur < stop; ++cur) {
        const UINTVAL cp = *cur;

        if (UNICODE_IS_INVALID(cp))
            Parrot_ex_throw_from_c_noargs(interp, EXCEPTION_INVALID_CHARACTER,
                "Invalid character in UCS-4 string");
    }

    src->strlen = nchars;
}