/*
 * utf8_scan
 *
 * Validates the bytes of `src` (from src->strstart for src->bufused bytes)
 * as UTF-8 and returns the number of characters found.  Each loop iteration
 * consumes either one invariant byte or one complete multi-byte sequence and
 * counts it as one character.
 *
 * Throws (via Parrot_ex_throw_from_c_args):
 *   EXCEPTION_MALFORMED_UTF8    - a multi-byte sequence runs past the end of
 *                                 the buffer, is an overlong form, contains a
 *                                 byte that is not a continuation byte, or a
 *                                 byte is neither a start byte nor invariant.
 *   EXCEPTION_INVALID_CHARACTER - the decoded value satisfies
 *                                 UNICODE_IS_INVALID.
 */
static UINTVAL
utf8_scan(PARROT_INTERP, ARGIN(const STRING *src))
{
    ASSERT_ARGS(utf8_scan)
    const utf8_t *u8ptr      = (const utf8_t *)src->strstart;
    const utf8_t *u8end      = (const utf8_t *)(src->strstart + src->bufused);
    UINTVAL       characters = 0;

    while (u8ptr < u8end) {
        UINTVAL c = *u8ptr;

        if (UTF8_IS_START(c)) {
            const size_t len       = UTF8SKIP(u8ptr);
            /* u8ptr < u8end here, so the difference is positive. */
            const size_t remaining = (size_t)(u8end - u8ptr);
            size_t       count;

            /* Compare byte counts instead of computing u8ptr + len: forming a
             * pointer beyond one-past-the-end of the buffer is undefined
             * behaviour, which is exactly what a truncated sequence would
             * trigger. */
            if (len > remaining) {
                Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
                    "Unaligned end in UTF-8 string\n");
            }

            /* Check for overlong forms.
             * NOTE(review): reading u8ptr[1] assumes UTF8SKIP yields at least
             * 2 for every byte accepted by UTF8_IS_START - confirm against the
             * macro definitions. */
            if (UTF8_IS_OVERLONG(c, u8ptr[1])) {
                Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
                    "Overlong form in UTF-8 string\n");
            }

            /* Keep only the payload bits of the start byte, then fold in each
             * of the len - 1 following bytes. */
            c &= UTF8_START_MASK(len);

            for (count = 1; count < len; ++count) {
                ++u8ptr;

                if (!UTF8_IS_CONTINUATION(*u8ptr)) {
                    Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
                        "Malformed UTF-8 string\n");
                }

                c = UTF8_ACCUMULATE(c, *u8ptr);
            }

            if (UNICODE_IS_INVALID(c)) {
                Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_CHARACTER,
                    "Invalid character in UTF-8 string\n");
            }
        }
        else if (!UNICODE_IS_INVARIANT(c)) {
            /* Neither a start byte nor an invariant byte. */
            Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
                "Malformed UTF-8 string\n");
        }

        /* u8ptr sits on the last byte of the character just validated;
         * step past it and count the character. */
        ++u8ptr;
        ++characters;
    }

    return characters;
}
/*
 * probably_utf8_chunk
 *
 * Heuristic check of whether the `len` bytes at `s` look like UTF-8 text.
 * An incomplete multi-byte character at the very end of the buffer is left
 * out of the check; a trailing character whose byte count matches what its
 * first byte promises (per UTF8SKIP) is kept.
 *
 * Returns 0 when has_hibit() reports false for the range under test
 * (has_hibit is defined elsewhere; presumably it detects bytes outside the
 * ASCII range - verify against its definition).  Otherwise returns the
 * result of is_utf8_string() on that range.
 */
EXTERN bool
probably_utf8_chunk(pTHX_ char *s, STRLEN len)
{
    char * const buf_end  = s + len;
    char        *scan_end = buf_end;
    STRLEN       tail_len;

    /* Back up over trailing continuation bytes, then over the start byte
     * that introduces them (if there is one), so that a character split
     * across the end of the buffer is ignored. */
    while (scan_end > s && UTF8_IS_CONTINUATION((U8)scan_end[-1]))
        --scan_end;
    if (scan_end > s && UTF8_IS_START((U8)scan_end[-1]))
        --scan_end;

    /* Number of bytes that were trimmed from the end. */
    tail_len = (STRLEN)(buf_end - scan_end);

    /* If the trimmed tail has exactly the length announced by its first
     * byte, every promised continuation byte is present: restore it. */
    if (tail_len != 0 && UTF8SKIP(scan_end) == tail_len) {
        scan_end = buf_end;
    }

    if (has_hibit(s, scan_end))
        return is_utf8_string((U8*)s, scan_end - s);

    return 0;
}