Ejemplo n.º 1
0
static INTVAL
ucs4_partial_scan(PARROT_INTERP, ARGIN(const char *buf),
        ARGMOD(Parrot_String_Bounds *bounds))
{
    ASSERT_ARGS(ucs4_partial_scan)
    const utf32_t * const ptr = (const utf32_t *)buf;
    UINTVAL               len   = bounds->bytes >> 1;
    const INTVAL          chars = bounds->chars;
    const INTVAL          delim = bounds->delim;
    INTVAL                c     = -1;
    UINTVAL               i;

    if (chars >= 0 && (UINTVAL)chars < len)
        len = chars;

    for (i = 0; i < len; ++i) {
        c = ptr[i];

        if (UNICODE_IS_INVALID(c))
            Parrot_ex_throw_from_c_noargs(interp, EXCEPTION_INVALID_CHARACTER,
                    "Invalid character in UCS-4 string");

        if (c == delim) {
            len = i + 1;
            break;
        }
    }

    bounds->bytes = len << 2;
    bounds->chars = len;
    bounds->delim = c;

    return 0;
}
Ejemplo n.º 2
0
static UINTVAL
utf8_scan(PARROT_INTERP, ARGIN(const STRING *src))
{
    ASSERT_ARGS(utf8_scan)
    const utf8_t *u8ptr = (const utf8_t *)src->strstart;
    const utf8_t *u8end = (const utf8_t *)(src->strstart + src->bufused);
    UINTVAL characters = 0;

    while (u8ptr < u8end) {
        UINTVAL c = *u8ptr;

        if (UTF8_IS_START(c)) {
            size_t len = UTF8SKIP(u8ptr);
            size_t count;

            if (u8ptr + len > u8end)
                Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
                    "Unaligned end in UTF-8 string\n");

            /* Check for overlong forms */
            if (UTF8_IS_OVERLONG(c, u8ptr[1]))
                Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
                    "Overlong form in UTF-8 string\n");

            c &= UTF8_START_MASK(len);

            for (count = 1; count < len; ++count) {
                ++u8ptr;

                if (!UTF8_IS_CONTINUATION(*u8ptr))
                    Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
                        "Malformed UTF-8 string\n");

                c = UTF8_ACCUMULATE(c, *u8ptr);
            }

            if (UNICODE_IS_INVALID(c))
                Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_CHARACTER,
                    "Invalid character in UTF-8 string\n");
        }
        else if (!UNICODE_IS_INVARIANT(c)) {
            Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
                "Malformed UTF-8 string\n");
        }

        ++u8ptr;
        ++characters;
    }

    return characters;
}
Ejemplo n.º 3
0
PARROT_CANNOT_RETURN_NULL
static utf8_t *
utf8_encode(PARROT_INTERP, ARGMOD(utf8_t *ptr), UINTVAL c)
{
    ASSERT_ARGS(utf8_encode)
    const UINTVAL  len = UNISKIP(c);
    utf8_t        *end = ptr + len - 1;

    if (UNICODE_IS_INVALID(c))
        Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_CHARACTER,
                "Invalid character for UTF-8 encoding\n");

    while (end > ptr) {
        *end-- = (c & UTF8_CONTINUATION_MASK) | UTF8_CONTINUATION_MARK;
        c >>= UTF8_ACCUMULATION_SHIFT;
    }

    *end = (c & UTF8_START_MASK(len)) | UTF8_START_MARK(len);

    return ptr + len;
}
Ejemplo n.º 4
0
static void
ucs4_scan(PARROT_INTERP, ARGMOD(STRING *src))
{
    ASSERT_ARGS(ucs4_scan)
    const utf32_t * const ptr = (utf32_t *)src->strstart;
    const UINTVAL         len = src->bufused >> 2;
    UINTVAL               i;

    if (src->bufused & 3)
        Parrot_ex_throw_from_c_noargs(interp, EXCEPTION_INVALID_CHARACTER,
            "Unaligned end in UCS-4 string");

    for (i = 0; i < len; ++i) {
        UINTVAL c = ptr[i];

        if (UNICODE_IS_INVALID(c))
            Parrot_ex_throw_from_c_noargs(interp, EXCEPTION_INVALID_CHARACTER,
                    "Invalid character in UCS-4 string");
    }

    src->strlen = len;
}