/* Decodes ASCII using a decodestream. Decodes as far as it can with the input
 * buffers, or until a stopper is reached.
 *
 *   tc            - thread context, used for throwing and passed to helpers.
 *   ds            - decode stream whose byte buffers are consumed and to which
 *                   the decoded codepoint buffers are attached.
 *   stopper_chars - if non-NULL, stop once this many characters were decoded;
 *                   a value of zero means nothing is decoded at all.
 *   stopper_sep   - if non-NULL, stop right after decoding this codepoint.
 *
 * Throws an ad-hoc exception when a byte above 127 is encountered. */
void MVM_string_ascii_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds,
                                   MVMint32 *stopper_chars, MVMint32 *stopper_sep) {
    MVMint32 count = 0, total = 0;
    MVMint32 bufsize;
    MVMCodepoint32 *buffer;
    MVMDecodeStreamBytes *cur_bytes;
    MVMDecodeStreamBytes *last_accept_bytes = ds->bytes_head;
    MVMint32 last_accept_pos;

    /* If there's no buffers, we're done. */
    if (!ds->bytes_head)
        return;
    last_accept_pos = ds->bytes_head_pos;

    /* If we're asked for zero chars, also done. */
    if (stopper_chars && *stopper_chars == 0)
        return;

    /* Take length of head buffer as initial guess. */
    bufsize = ds->bytes_head->length;
    buffer = malloc(bufsize * sizeof(MVMCodepoint32));

    /* Decode each of the buffers. */
    cur_bytes = ds->bytes_head;
    while (cur_bytes) {
        /* Process this buffer; only the head buffer may be partly consumed. */
        MVMint32 pos = cur_bytes == ds->bytes_head ? ds->bytes_head_pos : 0;
        /* Read the bytes as unsigned: where plain char is signed, a byte in
         * the range 0x80..0xFF would otherwise widen to a negative value and
         * slip past the range check below. */
        unsigned char *bytes = (unsigned char *)cur_bytes->bytes;
        while (pos < cur_bytes->length) {
            MVMCodepoint32 codepoint = bytes[pos++];
            if (codepoint > 127) {
                /* The code below relies on the throw not returning here, so
                 * release the scratch buffer first or it would be lost. */
                free(buffer);
                MVM_exception_throw_adhoc(tc,
                    "Will not decode invalid ASCII (code point > 127 found)");
            }
            if (count == bufsize) {
                /* We filled the buffer. Attach this one to the buffers
                 * linked list, and continue with a new one. */
                MVM_string_decodestream_add_chars(tc, ds, buffer, bufsize);
                buffer = malloc(bufsize * sizeof(MVMCodepoint32));
                count = 0;
            }
            buffer[count++] = codepoint;
            last_accept_bytes = cur_bytes;
            last_accept_pos = pos;
            total++;
            if (stopper_chars && *stopper_chars == total)
                goto done;
            if (stopper_sep && *stopper_sep == codepoint)
                goto done;
        }
        cur_bytes = cur_bytes->next;
    }
  done:

    /* Attach what we successfully parsed as a result buffer, and trim away
     * what we chewed through. An unused scratch buffer is released rather
     * than leaked. */
    if (count)
        MVM_string_decodestream_add_chars(tc, ds, buffer, count);
    else
        free(buffer);
    MVM_string_decodestream_discard_to(tc, ds, last_accept_bytes, last_accept_pos);
}
/* Called once EOF has been hit: decodes any bytes still waiting in the stream
 * and then drains whatever the normalizer is still holding on to. */
static void reached_eof(MVMThreadContext *tc, MVMDecodeStream *ds) {
    MVMGrapheme32 *graphs;
    MVMint32 pending;
    MVMint32 taken;

    /* Anything not yet decoded gets decoded now, with no stoppers. */
    if (ds->bytes_head)
        run_decode(tc, ds, NULL, NULL);

    /* Tell the normalizer that no more input will arrive. */
    MVM_unicode_normalizer_eof(tc, &(ds->norm));

    /* Nothing left in the normalization buffer means nothing more to do. */
    if (!MVM_unicode_normalizer_available(tc, &(ds->norm)))
        return;

    /* Pull every remaining grapheme out into a fresh chars buffer and hand
     * that buffer over to the stream. */
    pending = MVM_unicode_normalizer_available(tc, &(ds->norm));
    graphs  = MVM_malloc(pending * sizeof(MVMGrapheme32));
    for (taken = 0; taken < pending; taken++)
        graphs[taken] = MVM_unicode_normalizer_get_grapheme(tc, &(ds->norm));
    MVM_string_decodestream_add_chars(tc, ds, graphs, taken);
}
/* Latin-1 decoder for a decodestream. Works through the stream's byte buffers
 * until they are exhausted or a stopper is hit, attaching the decoded
 * graphemes to the stream and discarding the bytes that were consumed.
 *
 *   stopper_chars - if non-NULL, stop after this many graphemes.
 *   seps          - separators; decoding stops after a grapheme whose
 *                   codepoint MVM_string_decode_stream_maybe_sep accepts.
 *
 * Returns 1 if a stopper was reached (including a request for zero chars),
 * and 0 if there was no input or the input ran out first. */
MVMuint32 MVM_string_latin1_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds,
                                         const MVMint32 *stopper_chars,
                                         MVMDecodeStreamSeparators *seps) {
    MVMDecodeStreamBytes *segment;
    MVMDecodeStreamBytes *accepted_segment = ds->bytes_head;
    MVMGrapheme32 *out;
    MVMint32 capacity;
    MVMint32 accepted_pos;
    MVMint32 filled = 0;      /* graphemes in the current output buffer */
    MVMint32 emitted = 0;     /* graphemes produced by this call overall */
    MVMint32 pending_cr = 0;  /* saw '\r', waiting to see what follows it */
    MVMuint32 hit_stopper = 0;

    /* No input buffers: nothing to decode. */
    if (!ds->bytes_head)
        return 0;
    accepted_pos = ds->bytes_head_pos;

    /* A request for zero chars is satisfied immediately. */
    if (stopper_chars && *stopper_chars == 0)
        return 1;

    /* Size output buffers after the head input buffer. */
    capacity = ds->bytes_head->length;
    out = MVM_malloc(capacity * sizeof(MVMGrapheme32));

    /* Walk the chain of input buffers. */
    for (segment = ds->bytes_head; segment; segment = segment->next) {
        unsigned char *raw = (unsigned char *)segment->bytes;
        MVMint32 idx = 0;

        /* Only the head buffer can have been partly consumed already. */
        if (segment == ds->bytes_head)
            idx = ds->bytes_head_pos;

        while (idx < segment->length) {
            MVMCodepoint cp = raw[idx++];
            MVMGrapheme32 g;

            if (pending_cr) {
                pending_cr = 0;
                if (cp == '\n') {
                    /* "\r\n" collapses into the single CRLF grapheme. */
                    g = MVM_nfg_crlf_grapheme(tc);
                }
                else {
                    /* Lone '\r': emit it and revisit the current byte on
                     * the next iteration. */
                    g = '\r';
                    idx--;
                }
            }
            else if (cp != '\r') {
                g = cp;
            }
            else {
                /* Hold the '\r' back until the following byte is known; it
                 * is not marked as accepted yet, so it stays in the input. */
                pending_cr = 1;
                continue;
            }

            if (filled == capacity) {
                /* Output buffer is full: hand it to the stream and start
                 * filling a fresh one of the same size. */
                MVM_string_decodestream_add_chars(tc, ds, out, capacity);
                out = MVM_malloc(capacity * sizeof(MVMGrapheme32));
                filled = 0;
            }
            out[filled++] = g;
            accepted_segment = segment;
            accepted_pos = idx;
            emitted++;

            /* Character-count stopper is tested first, then separators. */
            if ((stopper_chars && *stopper_chars == emitted)
                    || MVM_string_decode_stream_maybe_sep(tc, seps, cp)) {
                hit_stopper = 1;
                goto finish;
            }
        }
    }
  finish:

    /* Pass on a partly filled buffer, or drop it if it stayed empty. */
    if (filled) {
        MVM_string_decodestream_add_chars(tc, ds, out, filled);
    }
    else {
        MVM_free(out);
    }

    /* Trim away the bytes that were turned into graphemes. */
    MVM_string_decodestream_discard_to(tc, ds, accepted_segment, accepted_pos);

    return hit_stopper;
}
/* Decodes all the buffers, producing a string containing all the decoded
 * characters.
 *
 * Any bytes still pending are decoded, the normalizer is flushed, and every
 * chars buffer held by the stream is consumed: on return ds->chars_head and
 * ds->chars_tail are NULL. Returns a newly allocated MVMString using 32-bit
 * grapheme storage; it has a NULL blob and zero graphemes when there was
 * nothing to decode. */
MVMString * MVM_string_decodestream_get_all(MVMThreadContext *tc, MVMDecodeStream *ds) {
    MVMString *result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
    result->body.storage_type = MVM_STRING_GRAPHEME_32;

    /* Decode all the things. */
    run_decode(tc, ds, NULL, NULL);

    /* If there's some things left in the normalization buffer, take them. */
    MVM_unicode_normalizer_eof(tc, &(ds->norm));
    if (MVM_unicode_normalizer_available(tc, &(ds->norm))) {
        MVMint32 ready = MVM_unicode_normalizer_available(tc, &(ds->norm));
        MVMGrapheme32 *buffer = MVM_malloc(ready * sizeof(MVMGrapheme32));
        MVMint32 count = 0;
        while (ready--)
            buffer[count++] = MVM_unicode_normalizer_get_grapheme(tc, &(ds->norm));
        MVM_string_decodestream_add_chars(tc, ds, buffer, count);
    }

    /* If there's no codepoint buffer, then return the empty string. */
    if (!ds->chars_head) {
        result->body.storage.blob_32 = NULL;
        result->body.num_graphs = 0;
    }

    /* If there's exactly one resulting codepoint buffer and we swallowed none
     * of it, just use it. */
    else if (ds->chars_head == ds->chars_tail && ds->chars_head_pos == 0) {
        /* Set up result string. */
        result->body.storage.blob_32 = ds->chars_head->chars;
        result->body.num_graphs = ds->chars_head->length;

        /* Don't free the buffer's memory itself, just the holder, as we
         * stole that for the buffer into the string above. */
        MVM_free(ds->chars_head);
        ds->chars_head = ds->chars_tail = NULL;
    }

    /* Otherwise, need to assemble all the things. */
    else {
        MVMint32 length = 0, pos = 0;
        /* Graphemes at the front of the head buffer that were already taken;
         * only the first buffer visited can have a non-zero skip. */
        MVMint32 skip = ds->chars_head_pos;
        MVMDecodeStreamChars *cur_chars;

        /* Calculate length: everything in every buffer, minus the part of
         * the head buffer that was already consumed. */
        for (cur_chars = ds->chars_head; cur_chars; cur_chars = cur_chars->next)
            length += cur_chars->length;
        length -= skip;

        /* Allocate a result buffer of the right size. */
        result->body.storage.blob_32 = MVM_malloc(length * sizeof(MVMGrapheme32));
        result->body.num_graphs = length;

        /* Copy all the things into the target, freeing as we go. */
        cur_chars = ds->chars_head;
        while (cur_chars) {
            /* Grab the link first; the holder is freed below. */
            MVMDecodeStreamChars *next_chars = cur_chars->next;
            /* Copy only what remains after the skipped prefix. Copying the
             * buffer's full length from the offset would run past the end
             * of both the source and its slot in the destination. */
            MVMint32 to_copy = cur_chars->length - skip;
            memcpy(result->body.storage.blob_32 + pos, cur_chars->chars + skip,
                to_copy * sizeof(MVMGrapheme32));
            pos += to_copy;
            skip = 0;

            /* The graphemes were copied, not stolen, so both the buffer and
             * its holder are released here. */
            MVM_free(cur_chars->chars);
            MVM_free(cur_chars);
            cur_chars = next_chars;
        }
        ds->chars_head = ds->chars_tail = NULL;
        /* The list is empty now, so the head offset no longer refers to
         * anything; don't leave a stale value behind. */
        ds->chars_head_pos = 0;
    }

    return result;
}