/* Feeds the single codepoint `cp` through an NFD normalizer, collects the
 * codepoints it produces and, if that output differs from the input (the
 * first codepoint changed, or more than one codepoint came out), passes the
 * collected codepoints to collation_push_cp together with `stack`, `ci` and
 * `name`.
 *
 * Returns the value handed back by collation_push_cp, or 0 when nothing was
 * pushed. */
static MVMint32 NFD_and_push_collation_values (MVMThreadContext *tc, MVMCodepoint cp, collation_stack *stack, MVMCodepointIter *ci, char *name) {
    MVMNormalizer norm;
    MVMCodepoint  cp_out;
    MVMint32      ready;
    MVMint32      result_pos  = 0;
    MVMint32      result_size = initial_collation_norm_buf_size;
    MVMCodepoint *result      = MVM_malloc(sizeof(MVMCodepoint) * initial_collation_norm_buf_size);
    MVMint64      rtrn        = 0;

    MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFD);

    /* Codepoints that become available as soon as `cp` is fed in. */
    ready = MVM_unicode_normalizer_process_codepoint(tc, &norm, cp, &cp_out);
    if (ready) {
        /* Size the buffer from what is actually needed instead of assuming a
         * single fixed increment is always enough. */
        if (result_size <= result_pos + ready) {
            result_size = result_pos + ready + initial_collation_norm_buf_size;
            /* NOTE(review): like the original code, this assumes MVM_realloc
             * never hands back NULL - confirm against its definition. */
            result = MVM_realloc(result, sizeof(MVMCodepoint) * result_size);
        }
        /* The first codepoint is delivered through cp_out, the remaining
         * ones have to be fetched from the normalizer. */
        result[result_pos++] = cp_out;
        while (0 < --ready)
            result[result_pos++] = MVM_unicode_normalizer_get_codepoint(tc, &norm);
    }

    /* Signal end of input and drain whatever the normalizer still holds. */
    MVM_unicode_normalizer_eof(tc, &norm);
    ready = MVM_unicode_normalizer_available(tc, &norm);
    if (result_size <= result_pos + ready) {
        /* Grow once for the whole remainder rather than once per codepoint. */
        result_size = result_pos + ready + initial_collation_norm_buf_size;
        result = MVM_realloc(result, sizeof(MVMCodepoint) * result_size);
    }
    while (ready--)
        result[result_pos++] = MVM_unicode_normalizer_get_codepoint(tc, &norm);

    /* Release the normalizer's internal state; every init needs a matching
     * cleanup, and doing it before the push keeps it from being skipped. */
    MVM_unicode_normalizer_cleanup(tc, &norm);

    /* Push only if the codepoint changed or we now have more than before.
     * result[0] is only meaningful when at least one codepoint was stored. */
    if (0 < result_pos && (result[0] != cp || 1 < result_pos))
        rtrn = collation_push_cp(tc, stack, ci, result, result_pos, name);

    MVM_free(result);
    return (MVMint32)rtrn;
}
/* Converts the NFG string `s` into a flat sequence of codepoints in the
 * normalization form `form` and installs that sequence as the storage of
 * `out`. `out` has to be a native array of 32-bit integers; this is checked
 * via assert_codepoint_array before anything else happens. */
void MVM_unicode_string_to_codepoints(MVMThreadContext *tc, MVMString *s, MVMNormalization form, MVMObject *out) {
    MVMCodepoint     *buf;
    MVMint64          buf_used, buf_cap;
    MVMCodepointIter  ci;
    MVMArray         *dest;

    /* Check the destination, then allocate one slot per grapheme as the
     * starting capacity; maybe_grow_result is asked for more room below. */
    assert_codepoint_array(tc, out, "Normalization output must be native array of 32-bit integers");
    buf_cap  = s->body.num_graphs;
    buf      = MVM_malloc(buf_cap * sizeof(MVMCodepoint));
    buf_used = 0;

    /* Walk the string codepoint by codepoint. */
    MVM_string_ci_init(tc, &ci, s);

    if (form != MVM_NORMALIZE_NFC) {
        /* Any form other than NFC is produced by a normalizer. */
        MVMNormalizer norm;
        MVMint32      ready;

        MVM_unicode_normalizer_init(tc, &norm, form);
        while (MVM_string_ci_has_more(tc, &ci)) {
            MVMCodepoint first;
            MVMCodepoint next_in = MVM_string_ci_get_codepoint(tc, &ci);

            ready = MVM_unicode_normalizer_process_codepoint(tc, &norm, next_in, &first);
            if (ready) {
                MVMint32 i;

                maybe_grow_result(&buf, &buf_cap, buf_used + ready);
                /* One codepoint arrives through `first`; the other
                 * ready - 1 are pulled from the normalizer. */
                buf[buf_used++] = first;
                for (i = 1; i < ready; i++)
                    buf[buf_used++] = MVM_unicode_normalizer_get_codepoint(tc, &norm);
            }
        }

        /* End of input: flush what the normalizer is still holding. */
        MVM_unicode_normalizer_eof(tc, &norm);
        ready = MVM_unicode_normalizer_available(tc, &norm);
        maybe_grow_result(&buf, &buf_cap, buf_used + ready);
        while (ready != 0) {
            buf[buf_used++] = MVM_unicode_normalizer_get_codepoint(tc, &norm);
            ready--;
        }
        MVM_unicode_normalizer_cleanup(tc, &norm);
    }
    else {
        /* NFG is built on top of NFC, so for NFC the iterator's output can
         * be copied straight through. */
        while (MVM_string_ci_has_more(tc, &ci)) {
            maybe_grow_result(&buf, &buf_cap, buf_used + 1);
            buf[buf_used] = MVM_string_ci_get_codepoint(tc, &ci);
            buf_used++;
        }
    }

    /* Hand the buffer over to the output array. */
    dest = (MVMArray *)out;
    dest->body.slots.u32 = buf;
    dest->body.start     = 0;
    dest->body.elems     = buf_used;
}
/* Normalizes the codepoints stored in `in` to the form `form` and installs
 * the result as the storage of `out`. Both `in` and `out` have to be native
 * arrays of 32-bit integers. If `in` holds no elements the function returns
 * right away and leaves `out` untouched. */
void MVM_unicode_normalize_codepoints(MVMThreadContext *tc, MVMObject *in, MVMObject *out, MVMNormalization form) {
    MVMNormalizer  norm;
    MVMArray      *src;
    MVMArray      *dest;
    MVMCodepoint  *input;
    MVMCodepoint  *buf;
    MVMint64       idx, input_codes, buf_used, buf_cap;
    MVMint32       ready;

    /* Both arrays must pass the codepoint-array check. */
    assert_codepoint_array(tc, in, "Normalization input must be native array of 32-bit integers");
    assert_codepoint_array(tc, out, "Normalization output must be native array of 32-bit integers");

    /* Locate the first live element of the input (slots offset by start)
     * and bail out early when there is nothing to normalize. */
    src         = (MVMArray *)in;
    input       = (MVMCodepoint *)src->body.slots.u32 + src->body.start;
    input_codes = src->body.elems;
    if (input_codes == 0)
        return;

    /* Initial guess: output is as long as the input. */
    buf_cap = input_codes;
    buf     = MVM_malloc(buf_cap * sizeof(MVMCodepoint));

    /* Feed every input codepoint through the normalizer, collecting
     * whatever becomes ready along the way. */
    MVM_unicode_normalizer_init(tc, &norm, form);
    buf_used = 0;
    for (idx = 0; idx < input_codes; idx++) {
        MVMCodepoint first;

        ready = MVM_unicode_normalizer_process_codepoint(tc, &norm, input[idx], &first);
        if (ready) {
            MVMint32 i;

            maybe_grow_result(&buf, &buf_cap, buf_used + ready);
            /* `first` carries one codepoint; fetch the other ready - 1. */
            buf[buf_used++] = first;
            for (i = 1; i < ready; i++)
                buf[buf_used++] = MVM_unicode_normalizer_get_codepoint(tc, &norm);
        }
    }

    /* Input exhausted: flush the remainder and release the normalizer. */
    MVM_unicode_normalizer_eof(tc, &norm);
    ready = MVM_unicode_normalizer_available(tc, &norm);
    maybe_grow_result(&buf, &buf_cap, buf_used + ready);
    while (ready != 0) {
        buf[buf_used++] = MVM_unicode_normalizer_get_codepoint(tc, &norm);
        ready--;
    }
    MVM_unicode_normalizer_cleanup(tc, &norm);

    /* Hand the buffer over to the output array. */
    dest = (MVMArray *)out;
    dest->body.slots.u32 = buf;
    dest->body.start     = 0;
    dest->body.elems     = buf_used;
}
/* Handles a "normalization terminator" codepoint. Such codepoints never
 * decompose and, for all practical purposes, never carry a combiner. They
 * get their own path so that I/O does not stall waiting for the codepoint
 * that follows them - a problem for things like REPLs, which need to see
 * the input immediately after a \n.
 *
 * Stores the first normalized codepoint in *out and returns the total number
 * of codepoints ready, counting the one placed in *out. */
MVMint32 MVM_unicode_normalizer_process_codepoint_norm_terminator(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint in, MVMCodepoint *out) {
    /* Buffer the terminator itself. */
    add_codepoint_to_buffer(tc, n, in);

    /* Behave as at end of input, i.e. normalize everything buffered so far. */
    MVM_unicode_normalizer_eof(tc, n);

    /* The caller receives the first codepoint directly through *out ... */
    *out = MVM_unicode_normalizer_get_codepoint(tc, n);

    /* ... so the count is what is still available plus that one. */
    return MVM_unicode_normalizer_available(tc, n) + 1;
}