/* Decomposes a single codepoint with an NFD normalizer and, if the result
 * differs from the input (a different first codepoint, or more than one
 * codepoint), passes the decomposed sequence to collation_push_cp.
 *
 * tc, stack, ci and name are forwarded unchanged to collation_push_cp.
 *
 * Returns the value produced by collation_push_cp, or 0 when it was not
 * called.
 * NOTE(review): the value is accumulated as MVMint64 but returned as
 * MVMint32; confirm the range of collation_push_cp's result. */
static MVMint32 NFD_and_push_collation_values (MVMThreadContext *tc, MVMCodepoint cp, collation_stack *stack, MVMCodepointIter *ci, char *name) {
    MVMNormalizer norm;
    MVMCodepoint cp_out;
    MVMint32 ready, needed, result_pos = 0;
    MVMint32 result_size = initial_collation_norm_buf_size;
    MVMCodepoint *result = MVM_malloc(sizeof(MVMCodepoint) * result_size);
    MVMint64 rtrn = 0;

    MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFD);

    /* Feed the codepoint; anything already available is copied out. */
    ready = MVM_unicode_normalizer_process_codepoint(tc, &norm, cp, &cp_out);
    if (ready) {
        /* Size the buffer from what is actually needed: growing by a single
         * fixed chunk may not be enough for everything that is pending. */
        needed = result_pos + ready;
        if (result_size <= needed) {
            result_size = needed + initial_collation_norm_buf_size;
            result = MVM_realloc(result, sizeof(MVMCodepoint) * result_size);
        }
        result[result_pos++] = cp_out;
        while (0 < --ready)
            result[result_pos++] = MVM_unicode_normalizer_get_codepoint(tc, &norm);
    }

    /* Signal end of input and drain whatever the normalizer still holds. */
    MVM_unicode_normalizer_eof(tc, &norm);
    ready = MVM_unicode_normalizer_available(tc, &norm);
    needed = result_pos + ready;
    if (result_size <= needed) {
        result_size = needed + initial_collation_norm_buf_size;
        result = MVM_realloc(result, sizeof(MVMCodepoint) * result_size);
    }
    while (ready--)
        result[result_pos++] = MVM_unicode_normalizer_get_codepoint(tc, &norm);

    /* Release the normalizer, as the other users of
     * MVM_unicode_normalizer_init do once they are finished with it. */
    MVM_unicode_normalizer_cleanup(tc, &norm);

    /* If the codepoint changed or we now have more than before. The
     * result_pos check keeps us from reading an unwritten result[0]. */
    if (0 < result_pos && (result[0] != cp || 1 < result_pos))
        rtrn = collation_push_cp(tc, stack, ci, result, result_pos, name);

    MVM_free(result);
    return rtrn;
}
/* Builds a fresh decode stream: zero-allocated, tagged with the given
 * encoding and absolute byte position, with its normalizer set up for NFG. */
MVMDecodeStream * MVM_string_decodestream_create(MVMThreadContext *tc, MVMint32 encoding, MVMint64 abs_byte_pos) {
    MVMDecodeStream *stream = MVM_calloc(1, sizeof(MVMDecodeStream));

    stream->abs_byte_pos = abs_byte_pos;
    stream->encoding     = encoding;

    /* Decoded text is normalized to NFG. */
    MVM_unicode_normalizer_init(tc, &stream->norm, MVM_NORMALIZE_NFG);

    return stream;
}
/* Decides whether string b can be appended to string a with the result
 * already being in NFG. Returns non-zero when no further work is needed and
 * 0 when the join point has to be re-normalized. */
MVMint32 MVM_nfg_is_concat_stable(MVMThreadContext *tc, MVMString *a, MVMString *b) {
    MVMGrapheme32 tail;
    MVMGrapheme32 head;
    MVMGrapheme32 crlf_synth;
    MVMNormalizer norm;
    int would_break;

    /* An empty operand cannot disturb normalization. */
    if (a->body.num_graphs == 0 || b->body.num_graphs == 0)
        return 1;

    /* Only the graphemes that meet at the join point matter. */
    tail = MVM_string_get_grapheme_at_nocheck(tc, a, a->body.num_graphs - 1);
    head = MVM_string_get_grapheme_at_nocheck(tc, b, 0);

    /* Appending a line ending is common, so handle it first: "\r" followed
     * by "\n" must be renormalized, anything else followed by "\n" is fine. */
    if (head == '\n')
        return tail != '\r';

    /* The "\r\n" grapheme is a control code, so we always break around it.
     * head is tested first to speed up the line-ending case. */
    crlf_synth = MVM_nfg_crlf_grapheme(tc);
    if (head == crlf_synth || tail == crlf_synth)
        return 0;

    /* Any other synthetic: assume re-normalization is required (most likely
     * an over-estimate). This also guards the code below, which treats both
     * values as plain codepoints.
     * TODO get the last codepoint of the tail and the first codepoint of the
     * head and call MVM_unicode_normalize_should_break */
    if (tail < 0 || head < 0)
        return 0;

    /* Both below the first codepoint that is significant for NFC: stable. */
    if (tail < MVM_NORMALIZE_FIRST_SIG_NFC && head < MVM_NORMALIZE_FIRST_SIG_NFC)
        return 1;

    /* Ask the normalizer whether the two codepoints would break (safe for
     * concatenation) or be joined. We only see two codepoints and do not
     * know what preceded them, so because of the special Regional Indicator
     * rules we pretend the previous codepoint was a regional indicator. That
     * makes MVM_unicode_normalize_should_break return its special value 2
     * when both codepoints are regional indicators, which triggers re_nfg,
     * so NFG is never broken whatever came before the tail. */
    MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);
    norm.regional_indicator = 1;
    would_break = MVM_unicode_normalize_should_break(tc, tail, head, &norm);
    MVM_unicode_normalizer_cleanup(tc, &norm);

    /* Two non-zero canonical combining classes may need reordering. For now
     * report that as unstable; this can be optimized. */
    if (MVM_unicode_relative_ccc(tc, tail) != 0 && MVM_unicode_relative_ccc(tc, head) != 0)
        return 0;

    return would_break;
}
/* Builds a fresh decode stream: zero-allocated, tagged with the given
 * encoding and absolute byte position, with an NFG normalizer that
 * optionally translates newlines, and an initial result size guess of 64. */
MVMDecodeStream * MVM_string_decodestream_create(MVMThreadContext *tc, MVMint32 encoding, MVMint64 abs_byte_pos, MVMint32 translate_newlines) {
    MVMDecodeStream *stream = MVM_calloc(1, sizeof(MVMDecodeStream));

    stream->result_size_guess = 64;
    stream->abs_byte_pos      = abs_byte_pos;
    stream->encoding          = encoding;

    /* Decoded text is normalized to NFG; newline translation is opt-in. */
    MVM_unicode_normalizer_init(tc, &stream->norm, MVM_NORMALIZE_NFG);
    if (translate_newlines != 0)
        MVM_unicode_normalizer_translate_newlines(tc, &stream->norm);

    return stream;
}
/* Fills out, which must be a native array of 32-bit integers, with the
 * codepoints of the NFG string s normalized to the requested form. The
 * array body's storage pointer, start and element count are overwritten
 * with the newly built buffer. */
void MVM_unicode_string_to_codepoints(MVMThreadContext *tc, MVMString *s, MVMNormalization form, MVMObject *out) {
    MVMCodepointIter iter;
    MVMCodepoint *cps;
    MVMint64 count = 0;
    MVMint64 capacity;
    MVMArray *target;

    /* Check the destination before allocating anything. */
    assert_codepoint_array(tc, out, "Normalization output must be native array of 32-bit integers");

    /* Start with one slot per grapheme; the buffer grows on demand. */
    capacity = s->body.num_graphs;
    cps = MVM_malloc(capacity * sizeof(MVMCodepoint));

    MVM_string_ci_init(tc, &iter, s);

    if (form != MVM_NORMALIZE_NFC) {
        /* Any form other than NFC goes through a normalizer. */
        MVMNormalizer norm;
        MVMint32 pending;

        MVM_unicode_normalizer_init(tc, &norm, form);

        while (MVM_string_ci_has_more(tc, &iter)) {
            MVMCodepoint first;
            MVMint32 k;

            pending = MVM_unicode_normalizer_process_codepoint(tc, &norm,
                MVM_string_ci_get_codepoint(tc, &iter), &first);
            if (!pending)
                continue;

            /* The first ready codepoint comes back through the out
             * parameter; the remainder are fetched from the normalizer. */
            maybe_grow_result(&cps, &capacity, count + pending);
            cps[count++] = first;
            for (k = 1; k < pending; k++)
                cps[count++] = MVM_unicode_normalizer_get_codepoint(tc, &norm);
        }

        /* End of input: drain whatever is still buffered. */
        MVM_unicode_normalizer_eof(tc, &norm);
        pending = MVM_unicode_normalizer_available(tc, &norm);
        maybe_grow_result(&cps, &capacity, count + pending);
        for (; pending != 0; pending--)
            cps[count++] = MVM_unicode_normalizer_get_codepoint(tc, &norm);

        MVM_unicode_normalizer_cleanup(tc, &norm);
    }
    else {
        /* NFG is constructed out of NFC, so plain iteration yields NFC. */
        while (MVM_string_ci_has_more(tc, &iter)) {
            maybe_grow_result(&cps, &capacity, count + 1);
            cps[count] = MVM_string_ci_get_codepoint(tc, &iter);
            count++;
        }
    }

    /* Hand the buffer over to the array body. */
    target = (MVMArray *)out;
    target->body.slots.u32 = cps;
    target->body.start     = 0;
    target->body.elems     = count;
}
/* Takes an object, which must be of VMArray representation and holding
 * 32-bit integers. Treats them as Unicode codepoints, normalizes them at
 * Grapheme level, and returns the resulting NFG string.
 *
 * An empty input array yields the shared empty string constant. Otherwise a
 * new MVMString is allocated and given the freshly built grapheme buffer as
 * its 32-bit grapheme storage. */
MVMString * MVM_unicode_codepoints_to_nfg_string(MVMThreadContext *tc, MVMObject *codes) {
    MVMNormalizer  norm;
    MVMCodepoint  *input;
    MVMGrapheme32 *result;
    MVMint64       input_pos, input_codes, result_pos, result_alloc;
    MVMint32       ready;
    MVMString     *str;

    /* Get input array; if it's empty, we're done already. */
    assert_codepoint_array(tc, codes, "Code points to string input must be native array of 32-bit integers");
    input       = (MVMCodepoint *)((MVMArray *)codes)->body.slots.u32 + ((MVMArray *)codes)->body.start;
    input_codes = ((MVMArray *)codes)->body.elems;
    if (input_codes == 0)
        return tc->instance->str_consts.empty;

    /* Guess output size based on input size. The buffer holds graphemes, so
     * size it from its own element type rather than from MVMCodepoint. */
    result_alloc = input_codes;
    result       = MVM_malloc(result_alloc * sizeof *result);

    /* Perform normalization at grapheme level. */
    MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);
    input_pos  = 0;
    result_pos = 0;
    while (input_pos < input_codes) {
        MVMGrapheme32 g;
        ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &norm, input[input_pos], &g);
        if (ready) {
            /* The first ready grapheme arrives in g; the rest are fetched
             * from the normalizer. */
            maybe_grow_result(&result, &result_alloc, result_pos + ready);
            result[result_pos++] = g;
            while (--ready > 0)
                result[result_pos++] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
        }
        input_pos++;
    }

    /* Signal end of input and drain what the normalizer still holds. */
    MVM_unicode_normalizer_eof(tc, &norm);
    ready = MVM_unicode_normalizer_available(tc, &norm);
    maybe_grow_result(&result, &result_alloc, result_pos + ready);
    while (ready--)
        result[result_pos++] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
    MVM_unicode_normalizer_cleanup(tc, &norm);

    /* Produce an MVMString of the result. */
    str = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
    str->body.storage.blob_32 = result;
    str->body.storage_type    = MVM_STRING_GRAPHEME_32;
    str->body.num_graphs      = result_pos;
    return str;
}
/* Normalizes the codepoints held in the array in to the requested form and
 * stores the outcome in the array out. Both must be native arrays of 32-bit
 * integers. When in is empty the function returns without touching out;
 * otherwise out's storage pointer, start and element count are overwritten
 * with the newly built buffer. */
void MVM_unicode_normalize_codepoints(MVMThreadContext *tc, MVMObject *in, MVMObject *out, MVMNormalization form) {
    MVMNormalizer norm;
    MVMArray *source;
    MVMArray *target;
    MVMCodepoint *src_cps;
    MVMCodepoint *dst_cps;
    MVMint64 src_count, idx;
    MVMint64 dst_count = 0;
    MVMint64 dst_capacity;
    MVMint32 pending;

    /* Both arrays must hold 32-bit integers. */
    assert_codepoint_array(tc, in, "Normalization input must be native array of 32-bit integers");
    assert_codepoint_array(tc, out, "Normalization output must be native array of 32-bit integers");

    /* Locate the input codepoints; nothing to do for an empty array. */
    source    = (MVMArray *)in;
    src_cps   = (MVMCodepoint *)source->body.slots.u32 + source->body.start;
    src_count = source->body.elems;
    if (src_count == 0)
        return;

    /* Start with as many slots as there are inputs; grows on demand. */
    dst_capacity = src_count;
    dst_cps      = MVM_malloc(dst_capacity * sizeof(MVMCodepoint));

    MVM_unicode_normalizer_init(tc, &norm, form);

    for (idx = 0; idx < src_count; idx++) {
        MVMCodepoint first;
        MVMint32 k;

        pending = MVM_unicode_normalizer_process_codepoint(tc, &norm, src_cps[idx], &first);
        if (!pending)
            continue;

        /* The first ready codepoint comes back through the out parameter;
         * the remainder are fetched from the normalizer. */
        maybe_grow_result(&dst_cps, &dst_capacity, dst_count + pending);
        dst_cps[dst_count++] = first;
        for (k = 1; k < pending; k++)
            dst_cps[dst_count++] = MVM_unicode_normalizer_get_codepoint(tc, &norm);
    }

    /* End of input: drain whatever is still buffered. */
    MVM_unicode_normalizer_eof(tc, &norm);
    pending = MVM_unicode_normalizer_available(tc, &norm);
    maybe_grow_result(&dst_cps, &dst_capacity, dst_count + pending);
    for (; pending != 0; pending--)
        dst_cps[dst_count++] = MVM_unicode_normalizer_get_codepoint(tc, &norm);
    MVM_unicode_normalizer_cleanup(tc, &norm);

    /* Hand the buffer over to the output array body. */
    target = (MVMArray *)out;
    target->body.slots.u32 = dst_cps;
    target->body.start     = 0;
    target->body.elems     = dst_count;
}
/* Computes one case transform (upper, lower, title or fold, selected by
 * case_) of the synthetic described by synth_info and caches it there: the
 * matching case_* pointer receives the resulting graphemes and the matching
 * case_*_graphs field their count. A count of zero together with the
 * CASE_UNCHANGED sentinel means the transform leaves the grapheme as it is.
 * Panics on an unrecognised case_ value. */
static void compute_case_change(MVMThreadContext *tc, MVMGrapheme32 synth_g, MVMNFGSynthetic *synth_info, MVMint32 case_) {
    const MVMCodepoint *changed_cps = NULL;
    MVMGrapheme32 *graphs = NULL;
    MVMint32 graph_count;

    /* Case-map the base codepoint only. */
    MVMuint32 changed_count = MVM_unicode_get_case_change(tc,
        synth_info->codes[synth_info->base_index], case_, &changed_cps);

    int base_unchanged = changed_count == 0
        || (changed_count == 1
            && changed_cps[0] == synth_info->codes[synth_info->base_index]);

    if (!base_unchanged) {
        /* The mapping may yield several graphemes, or something that now
         * normalizes to a non-synthetic, so everything goes through the
         * normalizer in this order:
         *   1. codepoints before the base (only present for Prepend),
         *   2. the first codepoint of the case mapping,
         *   3. codepoints after the base (generally Extend), so that they
         *      attach to the first mapped codepoint rather than a later one,
         *   4. the remaining codepoints of the case mapping.
         * Because the normalizer applies canonical combining class sorting,
         * this does about the right thing both for mappings that produce a
         * base plus a combiner and for those that produce two bases. */
        MVMNormalizer norm;
        MVMint32 g;

        MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);

        if (0 < synth_info->base_index)
            MVM_unicode_normalizer_push_codepoints(tc, &norm,
                synth_info->codes, synth_info->base_index);

        MVM_unicode_normalizer_push_codepoints(tc, &norm, changed_cps, 1);

        MVM_unicode_normalizer_push_codepoints(tc, &norm,
            synth_info->codes + synth_info->base_index + 1,
            synth_info->num_codes - synth_info->base_index - 1);

        if (1 < changed_count)
            MVM_unicode_normalizer_push_codepoints(tc, &norm,
                changed_cps + 1, changed_count - 1);

        MVM_unicode_normalizer_eof(tc, &norm);

        /* Copy out every grapheme the normalizer produced. */
        graph_count = MVM_unicode_normalizer_available(tc, &norm);
        graphs      = MVM_malloc(graph_count * sizeof(MVMGrapheme32));
        for (g = 0; g < graph_count; g++)
            graphs[g] = MVM_unicode_normalizer_get_grapheme(tc, &norm);

        MVM_unicode_normalizer_cleanup(tc, &norm);
    }
    else {
        /* The base does not change, so neither does the grapheme. A non-null
         * sentinel with a grapheme count of zero records "no change". */
        graphs      = CASE_UNCHANGED;
        graph_count = 0;
    }

    /* Store the result in the cache slot that belongs to this transform. */
    switch (case_) {
        case MVM_unicode_case_change_type_upper:
            synth_info->case_uc_graphs = graph_count;
            synth_info->case_uc        = graphs;
            break;
        case MVM_unicode_case_change_type_lower:
            synth_info->case_lc_graphs = graph_count;
            synth_info->case_lc        = graphs;
            break;
        case MVM_unicode_case_change_type_title:
            synth_info->case_tc_graphs = graph_count;
            synth_info->case_tc        = graphs;
            break;
        case MVM_unicode_case_change_type_fold:
            synth_info->case_fc_graphs = graph_count;
            synth_info->case_fc        = graphs;
            break;
        default:
            MVM_panic(1, "NFG: invalid case change %d", case_);
    }
}
/* Computes one case transform (upper, lower, title or fold, selected by
 * case_) of the synthetic described by synth_info and caches it there: the
 * matching case_* pointer receives the resulting graphemes and the matching
 * case_*_graphs field their count. A count of zero together with the
 * CASE_UNCHANGED sentinel means the transform leaves the grapheme as it is.
 * Panics on an unrecognised case_ value. */
static void compute_case_change(MVMThreadContext *tc, MVMGrapheme32 synth, MVMNFGSynthetic *synth_info, MVMint32 case_) {
    MVMGrapheme32 *result;
    MVMint32       num_result_graphs;

    /* Transform the base character. */
    const MVMCodepoint *result_cps = NULL;
    MVMuint32 num_result_cps = MVM_unicode_get_case_change(tc,
        synth_info->base, case_, &result_cps);

    /* The base is unchanged only when the mapping is empty or is exactly the
     * base itself. A mapping of several codepoints that merely starts with
     * the base is a real change and must not be treated as "unchanged". */
    if (num_result_cps == 0 ||
            (num_result_cps == 1 && result_cps[0] == synth_info->base)) {
        /* Base character does not change, so grapheme stays the same. We
         * install a non-null sentinel for this case, and set the result
         * grapheme count to zero, which indicates no change. */
        result            = CASE_UNCHANGED;
        num_result_graphs = 0;
    }
    else {
        /* We can potentially get multiple graphemes back. We may also get
         * into situations where we case change the base and suddenly we
         * can normalize the whole thing to a non-synthetic. So, we take
         * a trip through the normalizer. Note we push the first thing
         * we get back from the case change, then our combiners, and
         * finally anything else the case change produced. This should
         * do about the right thing for both case changes that produce a
         * base and a combiner, and those that produce a base and a base,
         * since the normalizer applies Unicode canonical sorting. */
        MVMNormalizer norm;
        MVMint32 i;

        MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);
        MVM_unicode_normalizer_push_codepoints(tc, &norm, result_cps, 1);
        MVM_unicode_normalizer_push_codepoints(tc, &norm,
            synth_info->combs, synth_info->num_combs);
        if (num_result_cps > 1)
            MVM_unicode_normalizer_push_codepoints(tc, &norm,
                result_cps + 1, num_result_cps - 1);
        MVM_unicode_normalizer_eof(tc, &norm);

        /* Copy out every grapheme the normalizer produced. */
        num_result_graphs = MVM_unicode_normalizer_available(tc, &norm);
        result = MVM_malloc(num_result_graphs * sizeof(MVMGrapheme32));
        for (i = 0; i < num_result_graphs; i++)
            result[i] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
        MVM_unicode_normalizer_cleanup(tc, &norm);
    }

    /* Store the result in the cache slot that belongs to this transform. */
    switch (case_) {
        case MVM_unicode_case_change_type_upper:
            synth_info->case_uc        = result;
            synth_info->case_uc_graphs = num_result_graphs;
            break;
        case MVM_unicode_case_change_type_lower:
            synth_info->case_lc        = result;
            synth_info->case_lc_graphs = num_result_graphs;
            break;
        case MVM_unicode_case_change_type_title:
            synth_info->case_tc        = result;
            synth_info->case_tc_graphs = num_result_graphs;
            break;
        case MVM_unicode_case_change_type_fold:
            synth_info->case_fc        = result;
            synth_info->case_fc_graphs = num_result_graphs;
            break;
        default:
            MVM_panic(1, "NFG: invalid case change %d", case_);
    }
}