Ejemplo n.º 1
0
static MVMint32 NFD_and_push_collation_values (MVMThreadContext *tc, MVMCodepoint cp, collation_stack *stack, MVMCodepointIter *ci, char *name) {
    MVMNormalizer norm;
    MVMCodepoint cp_out;
    MVMint32 ready,
             result_pos  = 0;
    MVMCodepoint *result = MVM_malloc(sizeof(MVMCodepoint) * initial_collation_norm_buf_size);
    MVMint32 result_size = initial_collation_norm_buf_size;
    MVMint64 rtrn        = 0;
    MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFD);
    ready = MVM_unicode_normalizer_process_codepoint(tc, &norm, cp, &cp_out);
    if (ready) {
        if (result_size <= result_pos + ready)
            result = MVM_realloc(result, sizeof(MVMCodepoint) * (result_size += initial_collation_norm_buf_size));
        result[result_pos++] = cp_out;
        while (0 < --ready)
            result[result_pos++] = MVM_unicode_normalizer_get_codepoint(tc, &norm);
    }
    MVM_unicode_normalizer_eof(tc, &norm);
    ready = MVM_unicode_normalizer_available(tc, &norm);
    while (ready--) {
        if (result_size <= result_pos + ready + 1)
            result = MVM_realloc(result, sizeof(MVMCodepoint) * (result_size += initial_collation_norm_buf_size));
        result[result_pos++] = MVM_unicode_normalizer_get_codepoint(tc, &norm);
    }
    /* If the codepoint changed or we now have more than before */
    if (result[0] != cp || 1 < result_pos)
        rtrn = collation_push_cp(tc, stack, ci, result, result_pos, name);
    if (result)
        MVM_free(result);
    return rtrn;
}
Ejemplo n.º 2
0
/* Processes a codepoint that we regard as a "normalization terminator". These
 * never have a decomposition, and for all practical purposes will not have a
 * combiner on them. We treat them specially so we don't, during I/O, block on
 * seeing a codepoint after them, which for things like REPLs that need to see
 * input right after a \n makes for problems. */
MVMint32 MVM_unicode_normalizer_process_codepoint_norm_terminator(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint in, MVMCodepoint *out) {
    /* Add the codepoint into the buffer. */
    add_codepoint_to_buffer(tc, n, in);

    /* Treat this as an "eof", which really means "normalize what ya got". */
    MVM_unicode_normalizer_eof(tc, n);

    /* Hand back a normalized codepoint, and the number available (have to
     * compensate for the one we steal for *out). */
    *out = MVM_unicode_normalizer_get_codepoint(tc, n);
    return 1 + MVM_unicode_normalizer_available(tc, n);
}
Ejemplo n.º 3
0
/* Takes an NFG string and populates the array out, which must be a 32-bit
 * integer array, with codepoints normalized according to the specified
 * normalization form. */
void MVM_unicode_string_to_codepoints(MVMThreadContext *tc, MVMString *s, MVMNormalization form, MVMObject *out) {
    MVMCodepoint     *result;
    MVMint64          result_pos, result_alloc;
    MVMCodepointIter  ci;

    /* Validate output array and set up result storage. */
    assert_codepoint_array(tc, out, "Normalization output must be native array of 32-bit integers");
    result_alloc = s->body.num_graphs;
    result       = MVM_malloc(result_alloc * sizeof(MVMCodepoint));
    result_pos   = 0;

    /* Create codepoint iterator. */
    MVM_string_ci_init(tc, &ci, s);

    /* If we want NFC, just iterate, since NFG is constructed out of NFC. */
    if (form == MVM_NORMALIZE_NFC) {
        while (MVM_string_ci_has_more(tc, &ci)) {
            maybe_grow_result(&result, &result_alloc, result_pos + 1);
            result[result_pos++] = MVM_string_ci_get_codepoint(tc, &ci);
        }
    }

    /* Otherwise, need to feed it through a normalizer. */
    else {
        MVMNormalizer norm;
        MVMint32      ready;
        MVM_unicode_normalizer_init(tc, &norm, form);
        while (MVM_string_ci_has_more(tc, &ci)) {
            MVMCodepoint cp;
            ready = MVM_unicode_normalizer_process_codepoint(tc, &norm, MVM_string_ci_get_codepoint(tc, &ci), &cp);
            if (ready) {
                maybe_grow_result(&result, &result_alloc, result_pos + ready);
                result[result_pos++] = cp;
                while (--ready > 0)
                    result[result_pos++] = MVM_unicode_normalizer_get_codepoint(tc, &norm);
            }
        }
        MVM_unicode_normalizer_eof(tc, &norm);
        ready = MVM_unicode_normalizer_available(tc, &norm);
        maybe_grow_result(&result, &result_alloc, result_pos + ready);
        while (ready--)
            result[result_pos++] = MVM_unicode_normalizer_get_codepoint(tc, &norm);
        MVM_unicode_normalizer_cleanup(tc, &norm);
    }

    /* Put result into array body. */
    ((MVMArray *)out)->body.slots.u32 = result;
    ((MVMArray *)out)->body.start     = 0;
    ((MVMArray *)out)->body.elems     = result_pos;
}
Ejemplo n.º 4
0
/* Takes an object, which must be of VMArray representation and holding
 * 32-bit integers. Treats them as Unicode codepoints, normalizes them at
 * Grapheme level, and returns the resulting NFG string. */
MVMString * MVM_unicode_codepoints_to_nfg_string(MVMThreadContext *tc, MVMObject *codes) {
    MVMNormalizer  norm;
    MVMCodepoint  *input;
    MVMGrapheme32 *result;
    MVMint64       input_pos, input_codes, result_pos, result_alloc;
    MVMint32       ready;
    MVMString     *str;

    /* Get input array; if it's empty, we're done already. */
    assert_codepoint_array(tc, codes, "Code points to string input must be native array of 32-bit integers");
    input       = (MVMCodepoint *)((MVMArray *)codes)->body.slots.u32 + ((MVMArray *)codes)->body.start;
    input_codes = ((MVMArray *)codes)->body.elems;
    if (input_codes == 0)
        return tc->instance->str_consts.empty;

    /* Guess output size based on input size. */
    result_alloc = input_codes;
    result       = MVM_malloc(result_alloc * sizeof(MVMCodepoint));

    /* Perform normalization at grapheme level. */
    MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);
    input_pos  = 0;
    result_pos = 0;
    while (input_pos < input_codes) {
        MVMGrapheme32 g;
        ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &norm, input[input_pos], &g);
        if (ready) {
            maybe_grow_result(&result, &result_alloc, result_pos + ready);
            result[result_pos++] = g;
            while (--ready > 0)
                result[result_pos++] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
        }
        input_pos++;
    }
    MVM_unicode_normalizer_eof(tc, &norm);
    ready = MVM_unicode_normalizer_available(tc, &norm);
    maybe_grow_result(&result, &result_alloc, result_pos + ready);
    while (ready--)
        result[result_pos++] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
    MVM_unicode_normalizer_cleanup(tc, &norm);

    /* Produce an MVMString of the result. */
    str = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
    str->body.storage.blob_32 = result;
    str->body.storage_type    = MVM_STRING_GRAPHEME_32;
    str->body.num_graphs      = result_pos;
    return str;
}
Ejemplo n.º 5
0
void MVM_unicode_normalize_codepoints(MVMThreadContext *tc, MVMObject *in, MVMObject *out, MVMNormalization form) {
    MVMNormalizer  norm;
    MVMCodepoint  *input;
    MVMCodepoint  *result;
    MVMint64       input_pos, input_codes, result_pos, result_alloc;
    MVMint32       ready;

    /* Validate input/output array. */
    assert_codepoint_array(tc, in, "Normalization input must be native array of 32-bit integers");
    assert_codepoint_array(tc, out, "Normalization output must be native array of 32-bit integers");

    /* Get input array; if it's empty, we're done already. */
    input       = (MVMCodepoint *)((MVMArray *)in)->body.slots.u32 + ((MVMArray *)in)->body.start;
    input_codes = ((MVMArray *)in)->body.elems;
    if (input_codes == 0)
        return;

    /* Guess output size based on input size. */
    result_alloc = input_codes;
    result       = MVM_malloc(result_alloc * sizeof(MVMCodepoint));

    /* Perform normalization. */
    MVM_unicode_normalizer_init(tc, &norm, form);
    input_pos  = 0;
    result_pos = 0;
    while (input_pos < input_codes) {
        MVMCodepoint cp;
        ready = MVM_unicode_normalizer_process_codepoint(tc, &norm, input[input_pos], &cp);
        if (ready) {
            maybe_grow_result(&result, &result_alloc, result_pos + ready);
            result[result_pos++] = cp;
            while (--ready > 0)
                result[result_pos++] = MVM_unicode_normalizer_get_codepoint(tc, &norm);
        }
        input_pos++;
    }
    MVM_unicode_normalizer_eof(tc, &norm);
    ready = MVM_unicode_normalizer_available(tc, &norm);
    maybe_grow_result(&result, &result_alloc, result_pos + ready);
    while (ready--)
        result[result_pos++] = MVM_unicode_normalizer_get_codepoint(tc, &norm);
    MVM_unicode_normalizer_cleanup(tc, &norm);

    /* Put result into array body. */
    ((MVMArray *)out)->body.slots.u32 = result;
    ((MVMArray *)out)->body.start     = 0;
    ((MVMArray *)out)->body.elems     = result_pos;
}
Ejemplo n.º 6
0
/* In situations where we have hit EOF, we need to decode what's left and flush
 * the normalization buffer also. */
static void reached_eof(MVMThreadContext *tc, MVMDecodeStream *ds) {
    /* Decode all the things. */
    if (ds->bytes_head)
        run_decode(tc, ds, NULL, NULL);

    /* If there's some things left in the normalization buffer, take them. */
    MVM_unicode_normalizer_eof(tc, &(ds->norm));
    if (MVM_unicode_normalizer_available(tc, &(ds->norm))) {
        MVMint32 ready = MVM_unicode_normalizer_available(tc, &(ds->norm));
        MVMGrapheme32 *buffer = MVM_malloc(ready * sizeof(MVMGrapheme32));
        MVMint32 count = 0;
        while (ready--)
            buffer[count++] = MVM_unicode_normalizer_get_grapheme(tc, &(ds->norm));
        MVM_string_decodestream_add_chars(tc, ds, buffer, count);
    }
}
Ejemplo n.º 7
0
/* Decodes all the buffers, producing a string containing all the decoded
 * characters. */
MVMString * MVM_string_decodestream_get_all(MVMThreadContext *tc, MVMDecodeStream *ds) {
    MVMString *result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
    result->body.storage_type = MVM_STRING_GRAPHEME_32;

    /* Decode all the things. */
    run_decode(tc, ds, NULL, NULL);

    /* If there's some things left in the normalization buffer, take them. */
    MVM_unicode_normalizer_eof(tc, &(ds->norm));
    if (MVM_unicode_normalizer_available(tc, &(ds->norm))) {
        MVMint32 ready = MVM_unicode_normalizer_available(tc, &(ds->norm));
        MVMGrapheme32 *buffer = MVM_malloc(ready * sizeof(MVMGrapheme32));
        MVMint32 count = 0;
        while (ready--)
            buffer[count++] = MVM_unicode_normalizer_get_grapheme(tc, &(ds->norm));
        MVM_string_decodestream_add_chars(tc, ds, buffer, count);
    }

    /* If there's no codepoint buffer, then return the empty string. */
    if (!ds->chars_head) {
        result->body.storage.blob_32 = NULL;
        result->body.num_graphs      = 0;
    }

    /* If there's exactly one resulting codepoint buffer and we swallowed none
     * of it, just use it. */
    else if (ds->chars_head == ds->chars_tail && ds->chars_head_pos == 0) {
        /* Set up result string. */
        result->body.storage.blob_32 = ds->chars_head->chars;
        result->body.num_graphs      = ds->chars_head->length;

        /* Don't free the buffer's memory itself, just the holder, as we
         * stole that for the buffer into the string above. */
        MVM_free(ds->chars_head);
        ds->chars_head = ds->chars_tail = NULL;
    }

    /* Otherwise, need to assemble all the things. */
    else {
        /* Calculate length. */
        MVMint32 length = 0, pos = 0;
        MVMDecodeStreamChars *cur_chars = ds->chars_head;
        while (cur_chars) {
            if (cur_chars == ds->chars_head)
                length += cur_chars->length - ds->chars_head_pos;
            else
                length += cur_chars->length;
            cur_chars = cur_chars->next;
        }

        /* Allocate a result buffer of the right size. */
        result->body.storage.blob_32 = MVM_malloc(length * sizeof(MVMGrapheme32));
        result->body.num_graphs      = length;

        /* Copy all the things into the target, freeing as we go. */
        cur_chars = ds->chars_head;
        while (cur_chars) {
            if (cur_chars == ds->chars_head) {
                MVMint32 to_copy = ds->chars_head->length - ds->chars_head_pos;
                memcpy(result->body.storage.blob_32 + pos, cur_chars->chars + ds->chars_head_pos,
                    cur_chars->length * sizeof(MVMGrapheme32));
                pos += to_copy;
            }
            else {
                memcpy(result->body.storage.blob_32 + pos, cur_chars->chars,
                    cur_chars->length * sizeof(MVMGrapheme32));
                pos += cur_chars->length;
            }
            cur_chars = cur_chars->next;
        }
        ds->chars_head = ds->chars_tail = NULL;
    }

    return result;
}
Ejemplo n.º 8
0
static void compute_case_change(MVMThreadContext *tc, MVMGrapheme32 synth_g, MVMNFGSynthetic *synth_info, MVMint32 case_) {
    MVMint32 num_result_graphs;
    MVMGrapheme32          *result = NULL;
    const MVMCodepoint *result_cps = NULL;
    /* Transform the base character. */
    MVMuint32 num_result_cps = MVM_unicode_get_case_change(tc,
        synth_info->codes[synth_info->base_index], case_, &result_cps);
    if (num_result_cps == 0 || (num_result_cps == 1 && result_cps[0] == synth_info->codes[synth_info->base_index])) {
        /* Base character does not change, so grapheme stays the same. We
         * install a non-null sentinel for this case, and set the result
         * grapheme count to zero, which indicates no change. */
        result = CASE_UNCHANGED;
        num_result_graphs = 0;
    }
    else {
        /* We can potentially get multiple graphemes back. We may also get
         * into situations where we case change the base and suddenly we
         * can normalize the whole thing to a non-synthetic. So, we take
         * a trip through the normalizer. We push any codepoints before the
         * base in the synthetic (only happens with Prepend codepoints).
          * We then push the first codepoint we get back from the case change
         * then the codeponits after the base characters (generally Extend
         * codepoints).
         * Finally we push anything else the case change produced. This should
         * do about the right thing for both case changes that produce a
         * base and a combiner, and those that produce a base and a base,
         * since the normalizer applies canonical combining class sorting. */
        MVMNormalizer norm;
        MVMint32 i;
        MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);
        if (0 < synth_info->base_index)
            MVM_unicode_normalizer_push_codepoints(tc, &norm,
                synth_info->codes,
                synth_info->base_index);
        /* Push the first result on */
        MVM_unicode_normalizer_push_codepoints(tc, &norm, result_cps, 1);
        /* Push any combiners after that codepoint so the combiners attach to the
         * first codepoint of the casechange not the second or more */
        MVM_unicode_normalizer_push_codepoints(tc, &norm,
            synth_info->codes     + synth_info->base_index + 1,
            synth_info->num_codes - synth_info->base_index - 1);
        if (1 < num_result_cps)
            MVM_unicode_normalizer_push_codepoints(tc, &norm,
                result_cps     + 1,
                num_result_cps - 1);
        MVM_unicode_normalizer_eof(tc, &norm);

        num_result_graphs = MVM_unicode_normalizer_available(tc, &norm);
        result = MVM_malloc(num_result_graphs * sizeof(MVMGrapheme32));
        for (i = 0; i < num_result_graphs; i++)
            result[i] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
        MVM_unicode_normalizer_cleanup(tc, &norm);
    }

    switch (case_) {
        case MVM_unicode_case_change_type_upper:
            synth_info->case_uc        = result;
            synth_info->case_uc_graphs = num_result_graphs;
            break;
        case MVM_unicode_case_change_type_lower:
            synth_info->case_lc        = result;
            synth_info->case_lc_graphs = num_result_graphs;
            break;
        case MVM_unicode_case_change_type_title:
            synth_info->case_tc        = result;
            synth_info->case_tc_graphs = num_result_graphs;
            break;
        case MVM_unicode_case_change_type_fold:
            synth_info->case_fc        = result;
            synth_info->case_fc_graphs = num_result_graphs;
            break;
        default:
            MVM_panic(1, "NFG: invalid case change %d", case_);
    }
}
Ejemplo n.º 9
0
static void compute_case_change(MVMThreadContext *tc, MVMGrapheme32 synth, MVMNFGSynthetic *synth_info, MVMint32 case_) {
    MVMGrapheme32 *result;
    MVMint32 num_result_graphs;

    /* Transform the base character. */
    const MVMCodepoint *result_cps;
    MVMuint32     num_result_cps = MVM_unicode_get_case_change(tc, synth_info->base,
        case_, &result_cps);
    if (num_result_cps == 0 || *result_cps == synth_info->base) {
        /* Base character does not change, so grapheme stays the same. We
         * install a non-null sentinel for this case, and set the result
         * grapheme count to zero, which indicates no change. */
        result = CASE_UNCHANGED;
        num_result_graphs = 0;
    }
    else {
        /* We can potentially get multiple graphemes back. We may also get
         * into situations where we case change the base and suddenly we
         * can normalize the whole thing to a non-synthetic. So, we take
         * a trip through the normalizer. Note we push the first thing
         * we get back from the case change, then our combiners, and
         * finally anything else the case change produced. This should
         * do about the right thing for both case changes that produce a
         * base and a combiner, and those that produce a base and a base,
         * since the normalizer applies Unicode canonical sorting. */
        MVMNormalizer norm;
        MVMint32 i;
        MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);
        MVM_unicode_normalizer_push_codepoints(tc, &norm, result_cps, 1);
        MVM_unicode_normalizer_push_codepoints(tc, &norm, synth_info->combs,
            synth_info->num_combs);
        if (num_result_cps > 1)
            MVM_unicode_normalizer_push_codepoints(tc, &norm, result_cps + 1,
                num_result_cps - 1);
        MVM_unicode_normalizer_eof(tc, &norm);

        num_result_graphs = MVM_unicode_normalizer_available(tc, &norm);
        result = MVM_malloc(num_result_graphs * sizeof(MVMGrapheme32));
        for (i = 0; i < num_result_graphs; i++)
            result[i] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
        MVM_unicode_normalizer_cleanup(tc, &norm);
    }

    switch (case_) {
    case MVM_unicode_case_change_type_upper:
        synth_info->case_uc = result;
        synth_info->case_uc_graphs = num_result_graphs;
        break;
    case MVM_unicode_case_change_type_lower:
        synth_info->case_lc = result;
        synth_info->case_lc_graphs = num_result_graphs;
        break;
    case MVM_unicode_case_change_type_title:
        synth_info->case_tc = result;
        synth_info->case_tc_graphs = num_result_graphs;
        break;
    case MVM_unicode_case_change_type_fold:
        synth_info->case_fc = result;
        synth_info->case_fc_graphs = num_result_graphs;
        break;
    default:
        MVM_panic(1, "NFG: invalid case change %d", case_);
    }
}