/* Runs a single codepoint through an NFD normalizer and, when the result
 * differs from the input (a different codepoint, or more than one codepoint),
 * pushes collation values for the decomposed sequence via collation_push_cp.
 *
 *   cp     the codepoint to decompose
 *   stack  collation stack, handed through to collation_push_cp
 *   ci     codepoint iterator, handed through to collation_push_cp
 *   name   handed through to collation_push_cp unchanged
 *
 * Returns whatever collation_push_cp returned, or 0 when nothing was pushed. */
static MVMint32 NFD_and_push_collation_values (MVMThreadContext *tc, MVMCodepoint cp, collation_stack *stack, MVMCodepointIter *ci, char *name) {
    MVMNormalizer norm;
    MVMCodepoint  cp_out;
    MVMint32      ready;
    MVMint32      result_pos  = 0;
    MVMint32      result_size = initial_collation_norm_buf_size;
    MVMCodepoint *result      = MVM_malloc(sizeof(MVMCodepoint) * result_size);
    MVMint64      rtrn        = 0;

    MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFD);

    ready = MVM_unicode_normalizer_process_codepoint(tc, &norm, cp, &cp_out);
    if (ready) {
        /* Keep enlarging until everything that is ready fits; a single
         * fixed-size bump is not guaranteed to be enough. */
        if (result_size <= result_pos + ready) {
            while (result_size <= result_pos + ready)
                result_size += initial_collation_norm_buf_size;
            result = MVM_realloc(result, sizeof(MVMCodepoint) * result_size);
        }
        /* The first ready codepoint came back through cp_out; the rest are
         * still sitting in the normalizer. */
        result[result_pos++] = cp_out;
        while (0 < --ready)
            result[result_pos++] = MVM_unicode_normalizer_get_codepoint(tc, &norm);
    }

    /* Flush whatever the normalizer was still holding back. */
    MVM_unicode_normalizer_eof(tc, &norm);
    ready = MVM_unicode_normalizer_available(tc, &norm);
    if (result_size <= result_pos + ready) {
        while (result_size <= result_pos + ready)
            result_size += initial_collation_norm_buf_size;
        result = MVM_realloc(result, sizeof(MVMCodepoint) * result_size);
    }
    while (ready--)
        result[result_pos++] = MVM_unicode_normalizer_get_codepoint(tc, &norm);

    /* Everything we need has been copied out, so release the normalizer's
     * own storage before handing control to collation_push_cp. */
    MVM_unicode_normalizer_cleanup(tc, &norm);

    /* Only push if the codepoint changed or we now have more than before.
     * The result_pos guard keeps us from reading an unwritten result[0]. */
    if (0 < result_pos && (result[0] != cp || 1 < result_pos))
        rtrn = collation_push_cp(tc, stack, ci, result, result_pos, name);

    MVM_free(result);
    return rtrn;
}
/* Called once EOF has been reached: decodes any bytes still pending and then
 * flushes the normalization buffer into the stream's decoded characters. */
static void reached_eof(MVMThreadContext *tc, MVMDecodeStream *ds) {
    MVMNormalizer *norm = &(ds->norm);
    MVMGrapheme32 *flushed;
    MVMint32       pending, i;

    /* Decode whatever bytes have not been processed yet. */
    if (ds->bytes_head)
        run_decode(tc, ds, NULL, NULL);

    /* Tell the normalizer no more input is coming. If that leaves nothing
     * available, there is nothing to hand over. */
    MVM_unicode_normalizer_eof(tc, norm);
    if (!MVM_unicode_normalizer_available(tc, norm))
        return;

    /* Drain the remaining graphemes into a fresh buffer and append it to
     * the stream's decoded characters. */
    pending = MVM_unicode_normalizer_available(tc, norm);
    flushed = MVM_malloc(pending * sizeof(MVMGrapheme32));
    for (i = 0; i < pending; i++)
        flushed[i] = MVM_unicode_normalizer_get_grapheme(tc, norm);
    MVM_string_decodestream_add_chars(tc, ds, flushed, pending);
}
/* Handles a codepoint we treat as a "normalization terminator". Such
 * codepoints never decompose and, for all practical purposes, never carry a
 * combiner. They get special treatment so that I/O does not block waiting
 * for the codepoint that follows them, which would be a problem for things
 * like REPLs that must react to input right after a \n.
 *
 * Stores the first normalized codepoint in *out and returns the total number
 * of codepoints ready, counting the one placed in *out. */
MVMint32 MVM_unicode_normalizer_process_codepoint_norm_terminator(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint in, MVMCodepoint *out) {
    MVMint32 still_buffered;

    /* Buffer the terminator, then normalize everything held so far by
     * treating this point as an "eof". */
    add_codepoint_to_buffer(tc, n, in);
    MVM_unicode_normalizer_eof(tc, n);

    /* Hand the first normalized codepoint back through *out. */
    *out = MVM_unicode_normalizer_get_codepoint(tc, n);

    /* The count reported to the caller includes the codepoint just taken. */
    still_buffered = MVM_unicode_normalizer_available(tc, n);
    return still_buffered + 1;
}
/* Converts the NFG string s into codepoints normalized to the requested
 * form and installs them as the contents of out, which must be a native
 * array of 32-bit integers. The array's slot storage is replaced by a newly
 * allocated buffer, its start is reset to 0 and its element count is set to
 * the number of codepoints produced. */
void MVM_unicode_string_to_codepoints(MVMThreadContext *tc, MVMString *s, MVMNormalization form, MVMObject *out) {
    MVMCodepointIter  iter;
    MVMCodepoint     *cps;
    MVMint64          cps_alloc;
    MVMint64          cps_used = 0;
    MVMArray         *target;

    /* Check the output array before doing any work. */
    assert_codepoint_array(tc, out, "Normalization output must be native array of 32-bit integers");

    /* Initial size guess: one codepoint per grapheme. */
    cps_alloc = s->body.num_graphs;
    cps       = MVM_malloc(cps_alloc * sizeof(MVMCodepoint));

    MVM_string_ci_init(tc, &iter, s);

    if (form != MVM_NORMALIZE_NFC) {
        /* Any form other than NFC goes through a normalizer. */
        MVMNormalizer norm;
        MVMint32      avail, i;

        MVM_unicode_normalizer_init(tc, &norm, form);
        while (MVM_string_ci_has_more(tc, &iter)) {
            MVMCodepoint first;
            avail = MVM_unicode_normalizer_process_codepoint(tc, &norm, MVM_string_ci_get_codepoint(tc, &iter), &first);
            if (!avail)
                continue;
            maybe_grow_result(&cps, &cps_alloc, cps_used + avail);
            /* The first ready codepoint arrives via `first`; the others
             * are fetched from the normalizer. */
            cps[cps_used++] = first;
            for (avail--; avail > 0; avail--)
                cps[cps_used++] = MVM_unicode_normalizer_get_codepoint(tc, &norm);
        }

        /* Flush what the normalizer was holding back. */
        MVM_unicode_normalizer_eof(tc, &norm);
        avail = MVM_unicode_normalizer_available(tc, &norm);
        maybe_grow_result(&cps, &cps_alloc, cps_used + avail);
        for (i = 0; i < avail; i++)
            cps[cps_used++] = MVM_unicode_normalizer_get_codepoint(tc, &norm);
        MVM_unicode_normalizer_cleanup(tc, &norm);
    }
    else {
        /* NFG is built from NFC, so NFC only needs plain iteration. */
        while (MVM_string_ci_has_more(tc, &iter)) {
            maybe_grow_result(&cps, &cps_alloc, cps_used + 1);
            cps[cps_used++] = MVM_string_ci_get_codepoint(tc, &iter);
        }
    }

    /* Install the buffer as the array's contents. */
    target = (MVMArray *)out;
    target->body.slots.u32 = cps;
    target->body.start     = 0;
    target->body.elems     = cps_used;
}
/* Takes an object, which must be of VMArray representation and holding
 * 32-bit integers. Treats them as Unicode codepoints, normalizes them at
 * grapheme level, and returns the resulting NFG string. An empty input array
 * yields the instance's shared empty string constant. */
MVMString * MVM_unicode_codepoints_to_nfg_string(MVMThreadContext *tc, MVMObject *codes) {
    MVMNormalizer  norm;
    MVMCodepoint  *input;
    MVMGrapheme32 *result;
    MVMint64       input_pos, input_codes, result_pos, result_alloc;
    MVMint32       ready;
    MVMString     *str;

    /* Get input array; if it's empty, we're done already. */
    assert_codepoint_array(tc, codes, "Code points to string input must be native array of 32-bit integers");
    input       = (MVMCodepoint *)((MVMArray *)codes)->body.slots.u32 + ((MVMArray *)codes)->body.start;
    input_codes = ((MVMArray *)codes)->body.elems;
    if (input_codes == 0)
        return tc->instance->str_consts.empty;

    /* Guess output size based on input size. The buffer holds graphemes, so
     * size it by the grapheme type rather than the codepoint type. */
    result_alloc = input_codes;
    result       = MVM_malloc(result_alloc * sizeof(MVMGrapheme32));

    /* Perform normalization at grapheme level. */
    MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);
    result_pos = 0;
    for (input_pos = 0; input_pos < input_codes; input_pos++) {
        MVMGrapheme32 g;
        ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &norm, input[input_pos], &g);
        if (ready) {
            maybe_grow_result(&result, &result_alloc, result_pos + ready);
            /* The first ready grapheme comes back in g; any others are
             * fetched from the normalizer. */
            result[result_pos++] = g;
            while (--ready > 0)
                result[result_pos++] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
        }
    }

    /* Flush whatever the normalizer was still holding back. */
    MVM_unicode_normalizer_eof(tc, &norm);
    ready = MVM_unicode_normalizer_available(tc, &norm);
    maybe_grow_result(&result, &result_alloc, result_pos + ready);
    while (ready--)
        result[result_pos++] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
    MVM_unicode_normalizer_cleanup(tc, &norm);

    /* Produce an MVMString of the result; the string takes over the buffer. */
    str = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
    str->body.storage.blob_32 = result;
    str->body.storage_type    = MVM_STRING_GRAPHEME_32;
    str->body.num_graphs      = result_pos;
    return str;
}
/* Normalizes the codepoints held in the array `in` to the requested form and
 * installs the result as the contents of the array `out`. Both must be
 * native arrays of 32-bit integers. When `in` has no elements the function
 * returns without touching `out`. Otherwise out's slot storage is replaced
 * by a newly allocated buffer, its start is reset to 0 and its element count
 * is set to the number of codepoints produced. */
void MVM_unicode_normalize_codepoints(MVMThreadContext *tc, MVMObject *in, MVMObject *out, MVMNormalization form) {
    MVMNormalizer  norm;
    MVMArray      *src_arr, *dst_arr;
    MVMCodepoint  *src, *dst;
    MVMint64       src_idx, src_count;
    MVMint64       dst_used = 0, dst_alloc;
    MVMint32       avail, i;

    /* Both arrays must be of the expected kind. */
    assert_codepoint_array(tc, in, "Normalization input must be native array of 32-bit integers");
    assert_codepoint_array(tc, out, "Normalization output must be native array of 32-bit integers");

    /* Locate the input codepoints; nothing to do for an empty array. */
    src_arr   = (MVMArray *)in;
    src       = (MVMCodepoint *)src_arr->body.slots.u32 + src_arr->body.start;
    src_count = src_arr->body.elems;
    if (src_count == 0)
        return;

    /* Start with as much room as there are input codepoints. */
    dst_alloc = src_count;
    dst       = MVM_malloc(dst_alloc * sizeof(MVMCodepoint));

    /* Feed every input codepoint through the normalizer. */
    MVM_unicode_normalizer_init(tc, &norm, form);
    for (src_idx = 0; src_idx < src_count; src_idx++) {
        MVMCodepoint first;
        avail = MVM_unicode_normalizer_process_codepoint(tc, &norm, src[src_idx], &first);
        if (!avail)
            continue;
        maybe_grow_result(&dst, &dst_alloc, dst_used + avail);
        /* The first ready codepoint arrives via `first`; the others are
         * fetched from the normalizer. */
        dst[dst_used++] = first;
        for (avail--; avail > 0; avail--)
            dst[dst_used++] = MVM_unicode_normalizer_get_codepoint(tc, &norm);
    }

    /* Flush what the normalizer was holding back. */
    MVM_unicode_normalizer_eof(tc, &norm);
    avail = MVM_unicode_normalizer_available(tc, &norm);
    maybe_grow_result(&dst, &dst_alloc, dst_used + avail);
    for (i = 0; i < avail; i++)
        dst[dst_used++] = MVM_unicode_normalizer_get_codepoint(tc, &norm);
    MVM_unicode_normalizer_cleanup(tc, &norm);

    /* Install the buffer as the output array's contents. */
    dst_arr = (MVMArray *)out;
    dst_arr->body.slots.u32 = dst;
    dst_arr->body.start     = 0;
    dst_arr->body.elems     = dst_used;
}
/* Reports whether the decode stream holds nothing at all: no undecoded
 * bytes, no decoded characters, and nothing available in the normalizer.
 * Returns non-zero when empty, zero otherwise. */
MVMint32 MVM_string_decodestream_is_empty(MVMThreadContext *tc, MVMDecodeStream *ds) {
    return !ds->bytes_head
        && !ds->chars_head
        && !MVM_unicode_normalizer_available(tc, &(ds->norm));
}
/* Decodes all the buffers, producing a string containing all the decoded
 * characters. Any graphemes still held by the stream's normalizer are
 * flushed first. On return the stream's decoded-character list is empty:
 * its storage has either been handed to the returned string or copied into
 * it and released. */
MVMString * MVM_string_decodestream_get_all(MVMThreadContext *tc, MVMDecodeStream *ds) {
    MVMString *result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
    result->body.storage_type = MVM_STRING_GRAPHEME_32;

    /* Decode all the things. */
    run_decode(tc, ds, NULL, NULL);

    /* If there's some things left in the normalization buffer, take them. */
    MVM_unicode_normalizer_eof(tc, &(ds->norm));
    if (MVM_unicode_normalizer_available(tc, &(ds->norm))) {
        MVMint32 ready = MVM_unicode_normalizer_available(tc, &(ds->norm));
        MVMGrapheme32 *buffer = MVM_malloc(ready * sizeof(MVMGrapheme32));
        MVMint32 count = 0;
        while (ready--)
            buffer[count++] = MVM_unicode_normalizer_get_grapheme(tc, &(ds->norm));
        MVM_string_decodestream_add_chars(tc, ds, buffer, count);
    }

    /* If there's no codepoint buffer, then return the empty string. */
    if (!ds->chars_head) {
        result->body.storage.blob_32 = NULL;
        result->body.num_graphs      = 0;
    }

    /* If there's exactly one resulting codepoint buffer and we swallowed none
     * of it, just use it. */
    else if (ds->chars_head == ds->chars_tail && ds->chars_head_pos == 0) {
        /* Set up result string. */
        result->body.storage.blob_32 = ds->chars_head->chars;
        result->body.num_graphs      = ds->chars_head->length;

        /* Don't free the buffer's memory itself, just the holder, as we
         * stole that for the buffer into the string above. */
        MVM_free(ds->chars_head);
        ds->chars_head = ds->chars_tail = NULL;
    }

    /* Otherwise, need to assemble all the things. */
    else {
        MVMint32 length = 0, pos = 0;
        /* Only the head buffer can be partially consumed; `skip` is the
         * number of its graphemes that were already taken. */
        MVMint32 skip = ds->chars_head_pos;
        MVMDecodeStreamChars *cur_chars;

        /* Calculate length. */
        for (cur_chars = ds->chars_head; cur_chars; cur_chars = cur_chars->next)
            length += cur_chars->length;
        length -= skip;

        /* Allocate a result buffer of the right size. */
        result->body.storage.blob_32 = MVM_malloc(length * sizeof(MVMGrapheme32));
        result->body.num_graphs      = length;

        /* Copy all the things into the target, freeing as we go. */
        cur_chars = ds->chars_head;
        while (cur_chars) {
            /* Grab the link first, since the holder is released below. */
            MVMDecodeStreamChars *next_chars = cur_chars->next;
            /* Copy only what remains in this buffer. Copying the full
             * length from an offset start would read past the source and
             * could write past the end of the target. */
            MVMint32 to_copy = cur_chars->length - skip;
            memcpy(result->body.storage.blob_32 + pos, cur_chars->chars + skip,
                to_copy * sizeof(MVMGrapheme32));
            pos += to_copy;
            skip = 0;

            /* The graphemes were copied, not stolen, so both the buffer
             * and its holder can go. */
            MVM_free(cur_chars->chars);
            MVM_free(cur_chars);
            cur_chars = next_chars;
        }
        ds->chars_head = ds->chars_tail = NULL;
        /* The offset referred to the head buffer that was just released. */
        ds->chars_head_pos = 0;
    }

    return result;
}
/* Computes the result of applying the case change `case_` to a synthetic
 * grapheme and caches it in the matching case_* / case_*_graphs pair of
 * synth_info. A grapheme count of zero together with the CASE_UNCHANGED
 * sentinel records that the case change leaves the grapheme as it is.
 * Panics on an unrecognised case change type. synth_g is not used here. */
static void compute_case_change(MVMThreadContext *tc, MVMGrapheme32 synth_g, MVMNFGSynthetic *synth_info, MVMint32 case_) {
    const MVMCodepoint  base_cp     = synth_info->codes[synth_info->base_index];
    const MVMCodepoint *changed_cps = NULL;
    MVMGrapheme32      *graphs      = NULL;
    MVMint32            num_graphs;

    /* Case change the base character only. */
    MVMuint32 num_changed = MVM_unicode_get_case_change(tc, base_cp, case_, &changed_cps);
    MVMint32  base_same   = num_changed == 0
                         || (num_changed == 1 && changed_cps[0] == base_cp);

    if (!base_same) {
        /* The case change may yield several graphemes, or may let the
         * whole thing normalize to a non-synthetic, so everything goes
         * through the normalizer in this order:
         *   1. codepoints that precede the base in the synthetic (only
         *      happens with Prepend codepoints);
         *   2. the first codepoint of the case-changed base;
         *   3. codepoints that follow the base (generally Extend
         *      codepoints), so they attach to the first codepoint of the
         *      case change rather than to a later one;
         *   4. anything else the case change produced.
         * This should do about the right thing both when the case change
         * gives a base plus a combiner and when it gives two bases, since
         * the normalizer applies canonical combining class sorting. */
        MVMNormalizer norm;
        MVMint32      idx;

        MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);
        if (synth_info->base_index > 0)
            MVM_unicode_normalizer_push_codepoints(tc, &norm, synth_info->codes, synth_info->base_index);
        MVM_unicode_normalizer_push_codepoints(tc, &norm, changed_cps, 1);
        MVM_unicode_normalizer_push_codepoints(tc, &norm,
            synth_info->codes + synth_info->base_index + 1,
            synth_info->num_codes - synth_info->base_index - 1);
        if (num_changed > 1)
            MVM_unicode_normalizer_push_codepoints(tc, &norm, changed_cps + 1, num_changed - 1);
        MVM_unicode_normalizer_eof(tc, &norm);

        /* Collect every grapheme the normalizer produced. */
        num_graphs = MVM_unicode_normalizer_available(tc, &norm);
        graphs     = MVM_malloc(num_graphs * sizeof(MVMGrapheme32));
        for (idx = 0; idx < num_graphs; idx++)
            graphs[idx] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
        MVM_unicode_normalizer_cleanup(tc, &norm);
    }
    else {
        /* The base does not change, so neither does the grapheme. Record
         * that with a non-null sentinel and a grapheme count of zero. */
        graphs     = CASE_UNCHANGED;
        num_graphs = 0;
    }

    /* Store the outcome in the slot belonging to this case change. */
    if (case_ == MVM_unicode_case_change_type_upper) {
        synth_info->case_uc        = graphs;
        synth_info->case_uc_graphs = num_graphs;
    }
    else if (case_ == MVM_unicode_case_change_type_lower) {
        synth_info->case_lc        = graphs;
        synth_info->case_lc_graphs = num_graphs;
    }
    else if (case_ == MVM_unicode_case_change_type_title) {
        synth_info->case_tc        = graphs;
        synth_info->case_tc_graphs = num_graphs;
    }
    else if (case_ == MVM_unicode_case_change_type_fold) {
        synth_info->case_fc        = graphs;
        synth_info->case_fc_graphs = num_graphs;
    }
    else {
        MVM_panic(1, "NFG: invalid case change %d", case_);
    }
}
/* Computes the result of applying the case change `case_` to a synthetic
 * grapheme and caches it in the matching case_* / case_*_graphs pair of
 * synth_info. A grapheme count of zero together with the CASE_UNCHANGED
 * sentinel records that the case change leaves the grapheme as it is.
 * Panics on an unrecognised case change type. synth is not used here. */
static void compute_case_change(MVMThreadContext *tc, MVMGrapheme32 synth, MVMNFGSynthetic *synth_info, MVMint32 case_) {
    MVMGrapheme32 *result;
    MVMint32       num_result_graphs;

    /* Transform the base character. */
    const MVMCodepoint *result_cps = NULL;
    MVMuint32 num_result_cps = MVM_unicode_get_case_change(tc, synth_info->base, case_, &result_cps);

    /* The base is unchanged only when the case change produced nothing, or
     * produced exactly the base by itself. A multi-codepoint result whose
     * first codepoint happens to equal the base is still a change and must
     * go through the normalizer. */
    if (num_result_cps == 0 || (num_result_cps == 1 && result_cps[0] == synth_info->base)) {
        /* Base character does not change, so grapheme stays the same. We
         * install a non-null sentinel for this case, and set the result
         * grapheme count to zero, which indicates no change. */
        result            = CASE_UNCHANGED;
        num_result_graphs = 0;
    }
    else {
        /* We can potentially get multiple graphemes back. We may also get
         * into situations where we case change the base and suddenly we
         * can normalize the whole thing to a non-synthetic. So, we take
         * a trip through the normalizer. Note we push the first thing
         * we get back from the case change, then our combiners, and
         * finally anything else the case change produced. This should
         * do about the right thing for both case changes that produce a
         * base and a combiner, and those that produce a base and a base,
         * since the normalizer applies Unicode canonical sorting. */
        MVMNormalizer norm;
        MVMint32      i;

        MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);
        MVM_unicode_normalizer_push_codepoints(tc, &norm, result_cps, 1);
        MVM_unicode_normalizer_push_codepoints(tc, &norm, synth_info->combs, synth_info->num_combs);
        if (num_result_cps > 1)
            MVM_unicode_normalizer_push_codepoints(tc, &norm, result_cps + 1, num_result_cps - 1);
        MVM_unicode_normalizer_eof(tc, &norm);

        /* Collect every grapheme the normalizer produced. */
        num_result_graphs = MVM_unicode_normalizer_available(tc, &norm);
        result = MVM_malloc(num_result_graphs * sizeof(MVMGrapheme32));
        for (i = 0; i < num_result_graphs; i++)
            result[i] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
        MVM_unicode_normalizer_cleanup(tc, &norm);
    }

    /* Store the outcome in the slot belonging to this case change. */
    switch (case_) {
        case MVM_unicode_case_change_type_upper:
            synth_info->case_uc        = result;
            synth_info->case_uc_graphs = num_result_graphs;
            break;
        case MVM_unicode_case_change_type_lower:
            synth_info->case_lc        = result;
            synth_info->case_lc_graphs = num_result_graphs;
            break;
        case MVM_unicode_case_change_type_title:
            synth_info->case_tc        = result;
            synth_info->case_tc_graphs = num_result_graphs;
            break;
        case MVM_unicode_case_change_type_fold:
            synth_info->case_fc        = result;
            synth_info->case_fc_graphs = num_result_graphs;
            break;
        default:
            MVM_panic(1, "NFG: invalid case change %d", case_);
    }
}