STATIC char * S_scan_word(pTHX_ register char *s, char *dest, STRLEN destlen, int allow_package, STRLEN *slp) { register char *d = dest; register char * const e = d + destlen - 3; /* two-character token, ending NUL */ for (;;) { if (d >= e) Perl_croak(aTHX_ ident_too_long); if (isALNUM(*s)) /* UTF handled below */ *d++ = *s++; else if (*s == '\'' && allow_package && isIDFIRST_lazy_if(s+1,UTF)) { *d++ = ':'; *d++ = ':'; s++; } else if (*s == ':' && s[1] == ':' && allow_package && s[2] != '$') { *d++ = *s++; *d++ = *s++; } else if (UTF && UTF8_IS_START(*s) && isALNUM_utf8((U8*)s)) { char *t = s + UTF8SKIP(s); while (UTF8_IS_CONTINUED(*t) && _is_utf8_mark((U8*)t)) t += UTF8SKIP(t); if (d + (t - s) > e) Perl_croak(aTHX_ ident_too_long); Copy(s, d, t - s, char); d += t - s; s = t; } else {
/*
 * utf8_scan
 *
 * Walks the bytes of src from strstart up to strstart + bufused, checking
 * each encoded sequence, and returns how many characters were seen.
 *
 * Problems are reported through Parrot_ex_throw_from_c_args:
 *   - EXCEPTION_MALFORMED_UTF8 when a sequence would run past the end of
 *     the buffer, when UTF8_IS_OVERLONG flags the first two bytes, when a
 *     trailing byte fails UTF8_IS_CONTINUATION, or when a byte is neither
 *     a start byte nor UNICODE_IS_INVARIANT;
 *   - EXCEPTION_INVALID_CHARACTER when the assembled value satisfies
 *     UNICODE_IS_INVALID.
 */
static UINTVAL
utf8_scan(PARROT_INTERP, ARGIN(const STRING *src))
{
    ASSERT_ARGS(utf8_scan)
    const utf8_t         *cur   = (const utf8_t *)src->strstart;
    const utf8_t * const  limit = (const utf8_t *)(src->strstart + src->bufused);
    UINTVAL               total = 0;

    /* One iteration per character; the loop header steps over the final
     * (or only) byte of the character and bumps the count. */
    for (; cur < limit; ++cur, ++total) {
        UINTVAL cp = *cur;
        size_t  seq_len;
        size_t  i;

        if (!UTF8_IS_START(cp)) {
            /* A byte that does not start a sequence must stand alone. */
            if (!UNICODE_IS_INVARIANT(cp))
                Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
                    "Malformed UTF-8 string\n");
            continue;
        }

        seq_len = UTF8SKIP(cur);

        /* The whole sequence has to fit before any trailing byte is read. */
        if (cur + seq_len > limit)
            Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
                "Unaligned end in UTF-8 string\n");

        /* Check for overlong forms */
        if (UTF8_IS_OVERLONG(cp, cur[1]))
            Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
                "Overlong form in UTF-8 string\n");

        /* Keep only the payload bits of the lead byte, then fold in each
         * trailing byte after checking it. */
        cp &= UTF8_START_MASK(seq_len);

        for (i = 1; i < seq_len; ++i) {
            if (!UTF8_IS_CONTINUATION(cur[i]))
                Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
                    "Malformed UTF-8 string\n");
            cp = UTF8_ACCUMULATE(cp, cur[i]);
        }

        if (UNICODE_IS_INVALID(cp))
            Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_CHARACTER,
                "Invalid character in UTF-8 string\n");

        /* Leave cur on the last byte of the sequence. */
        cur += seq_len - 1;
    }

    return total;
}
/*
 * utf8_decode
 *
 * Returns the value encoded by the sequence beginning at ptr.
 *
 * If the first byte is not a start byte (per UTF8_IS_START) it is returned
 * unchanged.  Otherwise the lead byte is masked with UTF8_START_MASK and
 * the following UTF8SKIP(ptr) - 1 bytes are folded in with UTF8_ACCUMULATE.
 *
 * No validation or bounds checking is done here: the trailing bytes are
 * read without testing them.
 */
static UINTVAL
utf8_decode(PARROT_INTERP, ARGIN(const utf8_t *ptr))
{
    ASSERT_ARGS(utf8_decode)
    UINTVAL cp = *ptr;

    if (UTF8_IS_START(cp)) {
        const UINTVAL seq_len = UTF8SKIP(ptr);
        UINTVAL       i;

        cp &= UTF8_START_MASK(seq_len);

        for (i = 1; i < seq_len; ++i)
            cp = UTF8_ACCUMULATE(cp, ptr[i]);
    }

    return cp;
}
/*
 * probably_utf8_chunk
 *
 * Heuristic test of whether the len bytes at s look like UTF-8 text.
 *
 * Because the buffer may have been cut in the middle of a character, a
 * trailing run of continuation bytes and one start byte before it are set
 * aside first.  If the byte where the trimmed tail begins announces
 * (via UTF8SKIP) exactly as many bytes as were trimmed, the tail is a
 * complete character and is put back.
 *
 * Returns 0 when has_hibit() reports nothing for the retained range;
 * otherwise returns the result of is_utf8_string() on that range.
 */
EXTERN bool
probably_utf8_chunk(pTHX_ char *s, STRLEN len)
{
    char * const buf_end = s + len;
    char        *e       = buf_end;
    STRLEN       tail_len;

    /* ignore partial utf8 char at end of buffer */
    for (; s < e && UTF8_IS_CONTINUATION((U8)e[-1]); --e)
        ;
    if (s < e && UTF8_IS_START((U8)e[-1]))
        --e;

    tail_len = (STRLEN)(buf_end - e);
    if (tail_len && UTF8SKIP(e) == tail_len) {
        /* all promised continuation bytes are present */
        e = buf_end;
    }

    if (has_hibit(s, e))
        return is_utf8_string((U8*)s, e - s);

    return 0;
}