/* Collapses the buffer entries in [from, to) into graphemes, in place. Each
 * run of entries between two break points is handed to
 * MVM_nfg_codes_to_grapheme and replaced by the single value it returns; the
 * entries after `to` are then shifted down to close the gap and buffer_end is
 * reduced by the number of slots freed. Ranges with fewer than two entries
 * are left untouched. */
static void grapheme_composition(MVMThreadContext *tc, MVMNormalizer *n, MVMint32 from, MVMint32 to) {
    MVMint32 cluster_start;
    MVMint32 write_idx;
    MVMint32 idx;

    /* Nothing to combine unless there are at least two entries. */
    if (to - from < 2)
        return;

    cluster_start = from;
    write_idx     = from;
    for (idx = from; idx < to; idx++) {
        MVMint32      after = idx + 1;
        MVMGrapheme32 g;

        /* Keep extending the current run while we are not on the last entry
         * and should_break does not report a boundary after this entry. */
        if (after != to && !should_break(tc, n->buffer[idx], n->buffer[after]))
            continue;

        /* End of a run: turn it into one grapheme. */
        g = MVM_nfg_codes_to_grapheme(tc, n->buffer + cluster_start, after - cluster_start);

        /* With newline translation on, the "\r\n" grapheme becomes "\n". */
        if (n->translate_newlines && g == MVM_nfg_crlf_grapheme(tc))
            g = '\n';

        n->buffer[write_idx] = g;
        write_idx++;

        /* The following entry opens the next run (harmless when we are
         * already at the end of the range). */
        cluster_start = after;
    }

    /* Slide the tail of the buffer down over the slots we no longer use. */
    memmove(n->buffer + write_idx, n->buffer + to,
            (n->buffer_end - to) * sizeof(MVMCodepoint));
    n->buffer_end -= to - write_idx;
}
/* Returns non-zero if the result of concatenating the two strings will freely
 * leave us in NFG without any further effort, and zero if the join point may
 * need to be re-normalized. When the answer comes from
 * MVM_unicode_normalize_should_break its return value is passed through
 * unchanged, so the non-zero result can be the special value 2 described in
 * the regional indicator comment below. */
MVMint32 MVM_nfg_is_concat_stable(MVMThreadContext *tc, MVMString *a, MVMString *b) {
    MVMGrapheme32 last_a;
    MVMGrapheme32 first_b;
    MVMGrapheme32 crlf;

    /* If either string is empty, we're good. */
    if (a->body.num_graphs == 0 || b->body.num_graphs == 0)
        return 1;

    /* Get first and last graphemes of the strings. */
    last_a  = MVM_string_get_grapheme_at_nocheck(tc, a, a->body.num_graphs - 1);
    first_b = MVM_string_get_grapheme_at_nocheck(tc, b, 0);

    /* Fast path for the case where we are appending a lf line ending. */
    if (first_b == '\n')
        /* If we see \r + \n we need to renormalize. Otherwise we're good */
        return last_a == '\r' ? 0 : 1;

    crlf = MVM_nfg_crlf_grapheme(tc);
    /* The "\r\n" grapheme begins with CR and ends with LF. There is always a
     * grapheme break before CR and after LF, and neither takes part in
     * composition or reordering, so nothing on the other side of the join can
     * merge with it: the concatenation is stable. This has to be answered
     * before the synthetic guard below, which would otherwise send every
     * "\r\n" line ending down the slow re-normalization path. Checking
     * first_b first keeps appending line endings cheap. */
    if (first_b == crlf || last_a == crlf)
        return 1;

    /* If either is synthetic other than "\r\n", assume we'll have to re-normalize
     * (this is an over-estimate, most likely). Note if you optimize this that it
     * serves as a guard for what follows.
     * TODO get the last codepoint of last_a and first codepoint of first_b and call
     * MVM_unicode_normalize_should_break */
    if (last_a < 0 || first_b < 0)
        return 0;

    /* If both less than the first significant char for NFC we are good */
    if (last_a < MVM_NORMALIZE_FIRST_SIG_NFC && first_b < MVM_NORMALIZE_FIRST_SIG_NFC) {
        return 1;
    }
    else {
        /* Check if the two codepoints would be joined during normalization.
         * Returns 1 if they would break and thus is safe under concat, or 0 if
         * they would be joined. */
        MVMNormalizer norm;
        int rtrn;
        MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);
        /* Since we are only looking at two codepoints, we don't know what came
         * before. Because of special rules with Regional Indicators, pretend
         * the previous codepoint was a regional indicator. This will return the
         * special value of 2 from MVM_unicode_normalize_should_break and trigger
         * re_nfg if last_a and first_b are both regional indicators and we will
         * never break NFG regardless of what the codepoint before last_a is. */
        norm.regional_indicator = 1;
        rtrn = MVM_unicode_normalize_should_break(tc, last_a, first_b, &norm);
        MVM_unicode_normalizer_cleanup(tc, &norm);
        /* If both CCC are non-zero then it may need to be reordered. For now return 0.
         * This can be optimized. */
        if (MVM_unicode_relative_ccc(tc, last_a) != 0 && MVM_unicode_relative_ccc(tc, first_b) != 0)
            return 0;
        return rtrn;
    }
}
/* Fills in sep_spec with the default separator set: two separators, each one
 * grapheme long, namely "\n" and the "\r\n" grapheme. Both arrays are freshly
 * allocated with MVM_malloc; any previous contents of sep_spec are simply
 * overwritten. */
void MVM_string_decode_stream_sep_default(MVMThreadContext *tc, MVMDecodeStreamSeparators *sep_spec) {
    MVMint32      *lengths;
    MVMGrapheme32 *graphemes;

    sep_spec->num_seps = 2;

    /* One length slot and one grapheme slot per separator. */
    lengths = MVM_malloc(sep_spec->num_seps * sizeof(MVMint32));
    sep_spec->sep_lengths = lengths;
    graphemes = MVM_malloc(sep_spec->num_seps * sizeof(MVMGrapheme32));
    sep_spec->sep_graphemes = graphemes;

    /* Separator 0: a bare line feed. */
    lengths[0]   = 1;
    graphemes[0] = '\n';

    /* Separator 1: the single grapheme standing for "\r\n". */
    lengths[1]   = 1;
    graphemes[1] = MVM_nfg_crlf_grapheme(tc);
}
/* Decodes the specified number of bytes of latin1 into an NFG string,
 * creating a result of the specified type. The type must have the MVMString
 * REPR. Every byte becomes one grapheme with the same numeric value, except
 * that a "\r" directly followed by "\n" is folded into the single crlf
 * grapheme. Storage is sized for one grapheme per input byte. */
MVMString * MVM_string_latin1_decode(MVMThreadContext *tc, const MVMObject *result_type, char *latin1_c, size_t bytes) {
    const MVMuint8 *src = (const MVMuint8 *)latin1_c;
    MVMString      *str = (MVMString *)REPR(result_type)->allocate(tc, STABLE(result_type));
    MVMGrapheme32  *out;
    size_t          in_pos  = 0;
    size_t          out_len = 0;

    str->body.storage_type    = MVM_STRING_GRAPHEME_32;
    str->body.storage.blob_32 = MVM_malloc(sizeof(MVMint32) * bytes);
    out = str->body.storage.blob_32;

    while (in_pos < bytes) {
        MVMuint8 byte = src[in_pos++];

        /* in_pos now indexes the byte after the one just read. */
        if (byte == '\r' && in_pos < bytes && src[in_pos] == '\n') {
            out[out_len++] = MVM_nfg_crlf_grapheme(tc);
            in_pos++; /* the "\n" has been consumed as well */
        }
        else {
            out[out_len++] = byte;
        }
    }

    str->body.num_graphs = out_len;
    return str;
}
/* Decodes using a decodestream. Decodes as far as it can with the input
 * buffers, or until a stopper is reached.
 *
 * stopper_chars, when non-NULL, points at the number of graphemes after which
 * decoding stops; seps is passed to MVM_string_decode_stream_maybe_sep to
 * decide whether a just-decoded grapheme ends the run.
 *
 * Returns 0 when there are no byte buffers at all, 1 when zero chars were
 * requested, and otherwise 1 if decoding stopped because of stopper_chars or
 * a possible separator and 0 if it ran out of input.
 *
 * "\r" followed by "\n" is emitted as the single crlf grapheme, including
 * when the two bytes sit in different buffers. A "\r" with nothing after it
 * yet is held back: it is not emitted and the accepted position is not moved
 * past it. */
MVMuint32 MVM_string_latin1_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds,
                                         const MVMint32 *stopper_chars,
                                         MVMDecodeStreamSeparators *seps) {
    MVMint32 count = 0, total = 0;
    MVMint32 bufsize;
    MVMGrapheme32 *buffer;
    MVMDecodeStreamBytes *cur_bytes;
    MVMDecodeStreamBytes *last_accept_bytes = ds->bytes_head;
    MVMint32 last_accept_pos, last_was_cr;
    MVMuint32 reached_stopper;

    /* If there's no buffers, we're done. */
    if (!ds->bytes_head)
        return 0;
    last_accept_pos = ds->bytes_head_pos;

    /* If we're asked for zero chars, also done. */
    if (stopper_chars && *stopper_chars == 0)
        return 1;

    /* Take length of head buffer as initial guess. */
    bufsize = ds->bytes_head->length;
    buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));

    /* Decode each of the buffers. */
    cur_bytes = ds->bytes_head;
    last_was_cr = 0;
    reached_stopper = 0;
    while (cur_bytes) {
        /* Process this buffer; only the head buffer starts part-way in. */
        MVMint32 pos = cur_bytes == ds->bytes_head ? ds->bytes_head_pos : 0;
        unsigned char *bytes = (unsigned char *)cur_bytes->bytes;
        while (pos < cur_bytes->length) {
            MVMCodepoint codepoint = bytes[pos++];
            MVMGrapheme32 graph;
            if (last_was_cr) {
                if (codepoint == '\n') {
                    graph = MVM_nfg_crlf_grapheme(tc);
                }
                else {
                    /* Lone "\r": emit it now and push the byte we just read
                     * back so that it is decoded on the next iteration. */
                    graph = '\r';
                    pos--;
                }
                last_was_cr = 0;
            }
            else if (codepoint == '\r') {
                /* Wait for the next byte before deciding what to emit. */
                last_was_cr = 1;
                continue;
            }
            else {
                graph = codepoint;
            }
            if (count == bufsize) {
                /* We filled the buffer. Attach this one to the buffers
                 * linked list, and continue with a new one. */
                MVM_string_decodestream_add_chars(tc, ds, buffer, bufsize);
                buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));
                count = 0;
            }
            buffer[count++] = graph;
            last_accept_bytes = cur_bytes;
            last_accept_pos = pos;
            total++;
            if (stopper_chars && *stopper_chars == total) {
                reached_stopper = 1;
                goto done;
            }
            /* Test the grapheme we actually emitted, not the raw byte: after
             * a lone "\r" the byte in `codepoint` has been pushed back and is
             * not part of the output yet, and for "\r\n" the output is the
             * crlf grapheme rather than "\n". */
            if (MVM_string_decode_stream_maybe_sep(tc, seps, graph)) {
                reached_stopper = 1;
                goto done;
            }
        }
        cur_bytes = cur_bytes->next;
    }
  done:

    /* Attach what we successfully parsed as a result buffer, and trim away
     * what we chewed through. */
    if (count) {
        MVM_string_decodestream_add_chars(tc, ds, buffer, count);
    }
    else {
        MVM_free(buffer);
    }
    MVM_string_decodestream_discard_to(tc, ds, last_accept_bytes, last_accept_pos);

    return reached_stopper;
}