/* Decompose the codepoint and add it into the buffer.
 *
 * tc - current thread context, passed through to the property lookups.
 * n  - normalizer whose buffer receives the result; n->form selects whether
 *      compatibility decompositions are applied or only canonical ones.
 * cp - the codepoint to decompose.
 *
 * A codepoint that does not need decomposing is added to the buffer as-is.
 * Otherwise each codepoint listed in its decomposition spec is decomposed
 * recursively; a codepoint that needs decomposing but has no spec is handed
 * to the Hangul decomposition routine. */
static void decomp_codepoint_to_buffer(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint cp) {
    /* See if we actually need to decompose (can skip if the decomposition
     * type is None, or we're only doing Canonical decomposition and it is
     * anything except Canonical). */
    const char *type = MVM_unicode_codepoint_get_property_cstr(tc, cp, MVM_UNICODE_PROPERTY_DECOMPOSITION_TYPE);
    MVMint64 decompose = 1;
    if (!type)
        decompose = 0;
    else if (strcmp(type, "None") == 0)
        decompose = 0;
    else if (!MVM_NORMALIZE_COMPAT_DECOMP(n->form) && strcmp(type, "Canonical") != 0)
        decompose = 0;

    if (decompose) {
        /* We need to decompose. Get the decomp spec and go over the things in
         * it; things without a decomp spec are presumably Hangul and need the
         * algorithmic treatment. The spec string is only read, so it is kept
         * const rather than casting the qualifier away. */
        const char *spec = MVM_unicode_codepoint_get_property_cstr(tc, cp, MVM_UNICODE_PROPERTY_DECOMP_SPEC);
        if (spec && spec[0]) {
            const char *end = spec + strlen(spec);
            while (spec < end) {
                /* Parse hex character code, and then recurse to do any further
                 * decomposition on it; this recursion terminates when we find a
                 * non-decomposable thing and add it to the buffer. */
                char *next = NULL;
                MVMCodepoint decomp_char = (MVMCodepoint)strtol(spec, &next, 16);

                /* strtol leaves the end pointer at the start when it could not
                 * convert anything (trailing whitespace, stray characters).
                 * Stop here: continuing would never advance and would loop
                 * forever, pushing codepoint 0 into the buffer each time. */
                if (next == spec)
                    break;

                decomp_codepoint_to_buffer(tc, n, decomp_char);
                spec = next;
            }
        }
        else {
            decomp_hangul_to_buffer(tc, n, cp);
        }
    }
    else {
        /* Don't need to decompose; add it right into the buffer. */
        add_codepoint_to_buffer(tc, n, cp);
    }
}
/* Slow-path entry point, used once the very fast case of normalization no
 * longer applies (that is, when two codepoints in a row include at least one
 * that is greater than the first significant codepoint identified by a quick
 * check for the target form). The quick check alone may turn out to be
 * enough; when it is not, we have to do the real work to compute the
 * normalization.
 *
 * tc  - current thread context.
 * n   - normalizer state (target form plus the codepoint buffer).
 * in  - the incoming codepoint.
 * out - receives a normalized codepoint whenever the result is non-zero.
 *
 * Returns 0 when more input is needed before anything can be handed back.
 * Otherwise *out has been set and the result is the number of normalized
 * codepoints that were ready, counting the one stored in *out. */
MVMint32 MVM_unicode_normalizer_process_codepoint_full(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint in, MVMCodepoint *out) {
    /* Quick check result and canonical combining class of the input. */
    MVMint64 in_passes_qc = passes_quickcheck(tc, n, in);
    MVMint64 in_ccc       = ccc(tc, in);
    MVMint32 ready;

    /* The input fails the quick check. */
    if (!in_passes_qc) {
        /* When composing, the most recently buffered codepoint (if there is
         * one) may itself have passed the quick check and so been stored
         * without decomposition. Now that a codepoint which does not pass
         * has turned up, that earlier one has to be decomposed as well, so
         * take it off the end of the buffer and put it back decomposed. */
        if (MVM_NORMALIZE_COMPOSE(n->form) && n->buffer_end != n->buffer_start) {
            MVMCodepoint last = n->buffer[--n->buffer_end];
            decomp_codepoint_to_buffer(tc, n, last);
        }

        /* Decompose the input into the buffer; more input is required
         * before anything can be returned. */
        decomp_codepoint_to_buffer(tc, n, in);
        return 0;
    }

    /* The input passes the quick check. If it also has a CCC of zero there
     * are shortcuts that avoid the full sort/compose work. */
    if (in_ccc == 0) {
        if (MVM_NORMALIZE_COMPOSE(n->form)) {
            /* Composing: when the buffer holds exactly one codepoint and
             * that one also passes the quick check with a CCC of zero, hand
             * it back and let the input take its slot in the buffer. */
            if (n->buffer_end - n->buffer_start == 1) {
                MVMCodepoint held = n->buffer[n->buffer_start];
                if (passes_quickcheck(tc, n, held) && ccc(tc, held) == 0) {
                    *out = held;
                    n->buffer[n->buffer_start] = in;
                    return 1;
                }
            }
        }
        else {
            /* Decomposing only: with an empty buffer the input can simply
             * be returned as-is. */
            if (n->buffer_start == n->buffer_end) {
                *out = in;
                return 1;
            }
        }
    }

    /* No shortcut applied; the input passes the quick check, so it goes
     * into the buffer without being decomposed. */
    add_codepoint_to_buffer(tc, n, in);

    /* A non-zero CCC means the input is not a starter, and fewer than two
     * buffered codepoints means there is nothing in front of the input to
     * finalize. Either way, wait for more input. */
    if (in_ccc > 0 || n->buffer_end - n->buffer_start <= 1)
        return 0;

    /* Canonically sort everything in the buffer except the codepoint that
     * was just appended. */
    canonical_sort(tc, n, n->buffer_start, n->buffer_end - 1);

    /* Canonical composition, then grapheme composition, when the target
     * form asks for them. The end bound is recomputed from the normalizer
     * state for each call. */
    if (MVM_NORMALIZE_COMPOSE(n->form)) {
        canonical_composition(tc, n, n->buffer_start, n->buffer_end - 1);
        if (MVM_NORMALIZE_GRAPHEME(n->form))
            grapheme_composition(tc, n, n->buffer_start, n->buffer_end - 1);
    }

    /* Everything except the newest, quick-check-passing codepoint is now
     * normalized. */
    n->buffer_norm_end = n->buffer_end - 1;

    /* Hand back the first normalized codepoint. The count is taken before
     * advancing past it, so it includes the codepoint being returned. */
    *out  = n->buffer[n->buffer_start];
    ready = n->buffer_norm_end - n->buffer_start;
    n->buffer_start++;
    return ready;
}
/* Feed num_codepoints codepoints from the array in, in order, into the
 * normalizer's "to normalize" buffer. Each one is passed through
 * decomp_codepoint_to_buffer rather than being stored directly. A count of
 * zero or less does nothing. */
void MVM_unicode_normalizer_push_codepoints(MVMThreadContext *tc, MVMNormalizer *n, const MVMCodepoint *in, MVMint32 num_codepoints) {
    MVMint32 idx = 0;
    while (idx < num_codepoints) {
        decomp_codepoint_to_buffer(tc, n, in[idx]);
        idx++;
    }
}