Esempio n. 1
0
static UINTVAL
utf8_scan(PARROT_INTERP, ARGIN(const STRING *src))
{
    ASSERT_ARGS(utf8_scan)
    const utf8_t *u8ptr = (const utf8_t *)src->strstart;
    const utf8_t *u8end = (const utf8_t *)(src->strstart + src->bufused);
    UINTVAL characters = 0;

    while (u8ptr < u8end) {
        UINTVAL c = *u8ptr;

        if (UTF8_IS_START(c)) {
            size_t len = UTF8SKIP(u8ptr);
            size_t count;

            if (u8ptr + len > u8end)
                Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
                    "Unaligned end in UTF-8 string\n");

            /* Check for overlong forms */
            if (UTF8_IS_OVERLONG(c, u8ptr[1]))
                Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
                    "Overlong form in UTF-8 string\n");

            c &= UTF8_START_MASK(len);

            for (count = 1; count < len; ++count) {
                ++u8ptr;

                if (!UTF8_IS_CONTINUATION(*u8ptr))
                    Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
                        "Malformed UTF-8 string\n");

                c = UTF8_ACCUMULATE(c, *u8ptr);
            }

            if (UNICODE_IS_INVALID(c))
                Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_CHARACTER,
                    "Invalid character in UTF-8 string\n");
        }
        else if (!UNICODE_IS_INVARIANT(c)) {
            Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
                "Malformed UTF-8 string\n");
        }

        ++u8ptr;
        ++characters;
    }

    return characters;
}
Esempio n. 2
0
EXTERN bool
probably_utf8_chunk(pTHX_ char *s, STRLEN len)
{
    char *e = s + len;
    STRLEN clen;

    /* ignore partial utf8 char at end of buffer */
    while (s < e && UTF8_IS_CONTINUATION((U8)*(e - 1)))
	e--;
    if (s < e && UTF8_IS_START((U8)*(e - 1)))
	e--;
    clen = len - (e - s);
    if (clen && UTF8SKIP(e) == clen) {
	/* all promised continuation bytes are present */
	e = s + len;
    }

    if (!has_hibit(s, e))
	return 0;

    return is_utf8_string((U8*)s, e - s);
}