Example #1
0
/* Decodes using a decodestream. Decodes as far as it can with the input
 * buffers, or until a stopper is reached. */
void MVM_string_ascii_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds,
                                   MVMint32 *stopper_chars, MVMint32 *stopper_sep) {
    MVMint32 count = 0, total = 0;
    MVMint32 bufsize;
    MVMCodepoint32 *buffer;
    MVMDecodeStreamBytes *cur_bytes;
    MVMDecodeStreamBytes *last_accept_bytes = ds->bytes_head;
    MVMint32 last_accept_pos;

    /* If there's no buffers, we're done. */
    if (!ds->bytes_head)
        return;
    last_accept_pos = ds->bytes_head_pos;

    /* If we're asked for zero chars, also done. */
    if (stopper_chars && *stopper_chars == 0)
        return;

    /* Take length of head buffer as initial guess. */
    bufsize = ds->bytes_head->length;
    buffer = malloc(bufsize * sizeof(MVMCodepoint32));

    /* Decode each of the buffers. */
    cur_bytes = ds->bytes_head;
    while (cur_bytes) {
        /* Process this buffer. */
        MVMint32  pos   = cur_bytes == ds->bytes_head ? ds->bytes_head_pos : 0;
        char     *bytes = cur_bytes->bytes;
        while (pos < cur_bytes->length) {
            MVMCodepoint32 codepoint = bytes[pos++];
            if (codepoint > 127)
                MVM_exception_throw_adhoc(tc,
                    "Will not decode invalid ASCII (code point > 127 found)");
            if (count == bufsize) {
                /* We filled the buffer. Attach this one to the buffers
                 * linked list, and continue with a new one. */
                MVM_string_decodestream_add_chars(tc, ds, buffer, bufsize);
                buffer = malloc(bufsize * sizeof(MVMCodepoint32));
                count = 0;
            }
            buffer[count++] = codepoint;
            last_accept_bytes = cur_bytes;
            last_accept_pos = pos;
            total++;
            if (stopper_chars && *stopper_chars == total)
                goto done;
            if (stopper_sep && *stopper_sep == codepoint)
                goto done;
        }
        cur_bytes = cur_bytes->next;
    }
  done:

    /* Attach what we successfully parsed as a result buffer, and trim away
     * what we chewed through. */
    if (count)
        MVM_string_decodestream_add_chars(tc, ds, buffer, count);
    MVM_string_decodestream_discard_to(tc, ds, last_accept_bytes, last_accept_pos);
}
Example #2
0
/* In situations where we have hit EOF, we need to decode what's left and flush
 * the normalization buffer also. */
static void reached_eof(MVMThreadContext *tc, MVMDecodeStream *ds) {
    /* Decode all the things. */
    if (ds->bytes_head)
        run_decode(tc, ds, NULL, NULL);

    /* If there's some things left in the normalization buffer, take them. */
    MVM_unicode_normalizer_eof(tc, &(ds->norm));
    if (MVM_unicode_normalizer_available(tc, &(ds->norm))) {
        MVMint32 ready = MVM_unicode_normalizer_available(tc, &(ds->norm));
        MVMGrapheme32 *buffer = MVM_malloc(ready * sizeof(MVMGrapheme32));
        MVMint32 count = 0;
        while (ready--)
            buffer[count++] = MVM_unicode_normalizer_get_grapheme(tc, &(ds->norm));
        MVM_string_decodestream_add_chars(tc, ds, buffer, count);
    }
}
Example #3
0
/* Decodes using a decodestream. Decodes as far as it can with the input
 * buffers, or until a stopper is reached. */
MVMuint32 MVM_string_latin1_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds,
                                    const MVMint32 *stopper_chars,
                                    MVMDecodeStreamSeparators *seps) {
    MVMint32 count = 0, total = 0;
    MVMint32 bufsize;
    MVMGrapheme32 *buffer;
    MVMDecodeStreamBytes *cur_bytes;
    MVMDecodeStreamBytes *last_accept_bytes = ds->bytes_head;
    MVMint32 last_accept_pos, last_was_cr;
    MVMuint32 reached_stopper;

    /* If there's no buffers, we're done. */
    if (!ds->bytes_head)
        return 0;
    last_accept_pos = ds->bytes_head_pos;

    /* If we're asked for zero chars, also done. */
    if (stopper_chars && *stopper_chars == 0)
        return 1;

    /* Take length of head buffer as initial guess. */
    bufsize = ds->bytes_head->length;
    buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));

    /* Decode each of the buffers. */
    cur_bytes = ds->bytes_head;
    last_was_cr = 0;
    reached_stopper = 0;
    while (cur_bytes) {
        /* Process this buffer. */
        MVMint32  pos = cur_bytes == ds->bytes_head ? ds->bytes_head_pos : 0;
        unsigned char *bytes = (unsigned char *)cur_bytes->bytes;
        while (pos < cur_bytes->length) {
            MVMCodepoint codepoint = bytes[pos++];
            MVMGrapheme32 graph;
            if (last_was_cr) {
                if (codepoint == '\n') {
                    graph = MVM_nfg_crlf_grapheme(tc);
                }
                else {
                    graph = '\r';
                    pos--;
                }
                last_was_cr = 0;
            }
            else if (codepoint == '\r') {
                last_was_cr = 1;
                continue;
            }
            else {
                graph = codepoint;
            }
            if (count == bufsize) {
                /* We filled the buffer. Attach this one to the buffers
                 * linked list, and continue with a new one. */
                MVM_string_decodestream_add_chars(tc, ds, buffer, bufsize);
                buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));
                count = 0;
            }
            buffer[count++] = graph;
            last_accept_bytes = cur_bytes;
            last_accept_pos = pos;
            total++;
            if (stopper_chars && *stopper_chars == total) {
                reached_stopper = 1;
                goto done;
            }
            if (MVM_string_decode_stream_maybe_sep(tc, seps, codepoint)) {
                reached_stopper = 1;
                goto done;
            }
        }
        cur_bytes = cur_bytes->next;
    }
  done:

    /* Attach what we successfully parsed as a result buffer, and trim away
     * what we chewed through. */
    if (count) {
        MVM_string_decodestream_add_chars(tc, ds, buffer, count);
    }
    else {
        MVM_free(buffer);
    }
    MVM_string_decodestream_discard_to(tc, ds, last_accept_bytes, last_accept_pos);

    return reached_stopper;
}
Example #4
0
/* Decodes all the buffers, producing a string containing all the decoded
 * characters. */
MVMString * MVM_string_decodestream_get_all(MVMThreadContext *tc, MVMDecodeStream *ds) {
    MVMString *result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
    result->body.storage_type = MVM_STRING_GRAPHEME_32;

    /* Decode all the things. */
    run_decode(tc, ds, NULL, NULL);

    /* If there's some things left in the normalization buffer, take them. */
    MVM_unicode_normalizer_eof(tc, &(ds->norm));
    if (MVM_unicode_normalizer_available(tc, &(ds->norm))) {
        MVMint32 ready = MVM_unicode_normalizer_available(tc, &(ds->norm));
        MVMGrapheme32 *buffer = MVM_malloc(ready * sizeof(MVMGrapheme32));
        MVMint32 count = 0;
        while (ready--)
            buffer[count++] = MVM_unicode_normalizer_get_grapheme(tc, &(ds->norm));
        MVM_string_decodestream_add_chars(tc, ds, buffer, count);
    }

    /* If there's no codepoint buffer, then return the empty string. */
    if (!ds->chars_head) {
        result->body.storage.blob_32 = NULL;
        result->body.num_graphs      = 0;
    }

    /* If there's exactly one resulting codepoint buffer and we swallowed none
     * of it, just use it. */
    else if (ds->chars_head == ds->chars_tail && ds->chars_head_pos == 0) {
        /* Set up result string. */
        result->body.storage.blob_32 = ds->chars_head->chars;
        result->body.num_graphs      = ds->chars_head->length;

        /* Don't free the buffer's memory itself, just the holder, as we
         * stole that for the buffer into the string above. */
        MVM_free(ds->chars_head);
        ds->chars_head = ds->chars_tail = NULL;
    }

    /* Otherwise, need to assemble all the things. */
    else {
        /* Calculate length. */
        MVMint32 length = 0, pos = 0;
        MVMDecodeStreamChars *cur_chars = ds->chars_head;
        while (cur_chars) {
            if (cur_chars == ds->chars_head)
                length += cur_chars->length - ds->chars_head_pos;
            else
                length += cur_chars->length;
            cur_chars = cur_chars->next;
        }

        /* Allocate a result buffer of the right size. */
        result->body.storage.blob_32 = MVM_malloc(length * sizeof(MVMGrapheme32));
        result->body.num_graphs      = length;

        /* Copy all the things into the target, freeing as we go. */
        cur_chars = ds->chars_head;
        while (cur_chars) {
            if (cur_chars == ds->chars_head) {
                MVMint32 to_copy = ds->chars_head->length - ds->chars_head_pos;
                memcpy(result->body.storage.blob_32 + pos, cur_chars->chars + ds->chars_head_pos,
                    cur_chars->length * sizeof(MVMGrapheme32));
                pos += to_copy;
            }
            else {
                memcpy(result->body.storage.blob_32 + pos, cur_chars->chars,
                    cur_chars->length * sizeof(MVMGrapheme32));
                pos += cur_chars->length;
            }
            cur_chars = cur_chars->next;
        }
        ds->chars_head = ds->chars_tail = NULL;
    }

    return result;
}