Esempio n. 1
0
/* Takes an object, which must be of VMArray representation and holding
 * 32-bit integers. Treats them as Unicode codepoints, normalizes them at
 * Grapheme level, and returns the resulting NFG string.
 *
 * An empty input array yields the instance's shared empty string constant.
 * Otherwise a fresh VMString is allocated and the grapheme buffer built here
 * is installed as its 32-bit grapheme storage (it is not freed here). */
MVMString * MVM_unicode_codepoints_to_nfg_string(MVMThreadContext *tc, MVMObject *codes) {
    MVMNormalizer  norm;
    MVMCodepoint  *input;
    MVMGrapheme32 *result;
    MVMint64       input_pos, input_codes, result_pos, result_alloc;
    MVMint32       ready;
    MVMString     *str;

    /* Get input array; if it's empty, we're done already. The slot pointer
     * is advanced by the array's start offset to reach the first element. */
    assert_codepoint_array(tc, codes, "Code points to string input must be native array of 32-bit integers");
    input       = (MVMCodepoint *)((MVMArray *)codes)->body.slots.u32 + ((MVMArray *)codes)->body.start;
    input_codes = ((MVMArray *)codes)->body.elems;
    if (input_codes == 0)
        return tc->instance->str_consts.empty;

    /* Guess output size based on input size. The buffer holds graphemes, so
     * size it by the result's own element type rather than the input's. */
    result_alloc = input_codes;
    result       = MVM_malloc(result_alloc * sizeof *result);

    /* Perform normalization at grapheme level. */
    MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);
    input_pos  = 0;
    result_pos = 0;
    while (input_pos < input_codes) {
        MVMGrapheme32 g;
        ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &norm, input[input_pos], &g);
        if (ready) {
            /* The first ready grapheme arrives in g; any further ones are
             * pulled from the normalizer one at a time. */
            maybe_grow_result(&result, &result_alloc, result_pos + ready);
            result[result_pos++] = g;
            while (--ready > 0)
                result[result_pos++] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
        }
        input_pos++;
    }

    /* Signal end of input and drain whatever the normalizer still holds. */
    MVM_unicode_normalizer_eof(tc, &norm);
    ready = MVM_unicode_normalizer_available(tc, &norm);
    maybe_grow_result(&result, &result_alloc, result_pos + ready);
    while (ready--)
        result[result_pos++] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
    MVM_unicode_normalizer_cleanup(tc, &norm);

    /* Produce an MVMString of the result. */
    str = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
    str->body.storage.blob_32 = result;
    str->body.storage_type    = MVM_STRING_GRAPHEME_32;
    str->body.num_graphs      = result_pos;
    return str;
}
Esempio n. 2
0
/* Called once the stream has hit EOF: decodes whatever bytes are still
 * queued, then tells the normalizer that input has ended and moves any
 * graphemes it releases into the stream's character buffers. */
static void reached_eof(MVMThreadContext *tc, MVMDecodeStream *ds) {
    MVMGrapheme32 *flushed;
    MVMint32       pending, left, taken;

    /* Bytes still waiting get decoded first. */
    if (ds->bytes_head)
        run_decode(tc, ds, NULL, NULL);

    /* Signal end of input; if that released nothing, we're finished. */
    MVM_unicode_normalizer_eof(tc, &ds->norm);
    if (!MVM_unicode_normalizer_available(tc, &ds->norm))
        return;

    /* Collect the released graphemes into a new buffer, which is passed on
     * to the stream (it is not freed here). */
    pending = MVM_unicode_normalizer_available(tc, &ds->norm);
    flushed = MVM_malloc(pending * sizeof(MVMGrapheme32));
    taken   = 0;
    for (left = pending; left != 0; left--)
        flushed[taken++] = MVM_unicode_normalizer_get_grapheme(tc, &ds->norm);
    MVM_string_decodestream_add_chars(tc, ds, flushed, taken);
}
Esempio n. 3
0
/* Decodes all the buffers, producing a string containing all the decoded
 * characters.
 *
 * The returned string always uses 32-bit grapheme storage. Afterwards the
 * stream's list of decoded character buffers is empty: a single untouched
 * buffer is handed to the string directly, otherwise the remaining
 * characters are copied into a new buffer and the old chunks are released. */
MVMString * MVM_string_decodestream_get_all(MVMThreadContext *tc, MVMDecodeStream *ds) {
    MVMString *result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
    result->body.storage_type = MVM_STRING_GRAPHEME_32;

    /* Decode all the things. */
    run_decode(tc, ds, NULL, NULL);

    /* If there's some things left in the normalization buffer, take them. */
    MVM_unicode_normalizer_eof(tc, &(ds->norm));
    if (MVM_unicode_normalizer_available(tc, &(ds->norm))) {
        MVMint32 ready = MVM_unicode_normalizer_available(tc, &(ds->norm));
        MVMGrapheme32 *buffer = MVM_malloc(ready * sizeof(MVMGrapheme32));
        MVMint32 count = 0;
        while (ready--)
            buffer[count++] = MVM_unicode_normalizer_get_grapheme(tc, &(ds->norm));
        MVM_string_decodestream_add_chars(tc, ds, buffer, count);
    }

    /* If there's no codepoint buffer, then return the empty string. */
    if (!ds->chars_head) {
        result->body.storage.blob_32 = NULL;
        result->body.num_graphs      = 0;
    }

    /* If there's exactly one resulting codepoint buffer and we swallowed none
     * of it, just use it. */
    else if (ds->chars_head == ds->chars_tail && ds->chars_head_pos == 0) {
        /* Set up result string. */
        result->body.storage.blob_32 = ds->chars_head->chars;
        result->body.num_graphs      = ds->chars_head->length;

        /* Don't free the buffer's memory itself, just the holder, as we
         * stole that for the buffer into the string above. */
        MVM_free(ds->chars_head);
        ds->chars_head = ds->chars_tail = NULL;
    }

    /* Otherwise, need to assemble all the things. */
    else {
        MVMint32 length = 0, pos = 0;
        /* Characters at the front of the head chunk that were already
         * consumed; applies to the first chunk only. */
        MVMint32 skip = ds->chars_head_pos;
        MVMDecodeStreamChars *cur_chars;

        /* Calculate length: everything in every chunk, minus the part of
         * the head chunk that has been consumed. */
        for (cur_chars = ds->chars_head; cur_chars; cur_chars = cur_chars->next)
            length += cur_chars->length;
        length -= skip;

        /* Allocate a result buffer of the right size. */
        result->body.storage.blob_32 = MVM_malloc(length * sizeof(MVMGrapheme32));
        result->body.num_graphs      = length;

        /* Copy all the things into the target, freeing as we go. */
        cur_chars = ds->chars_head;
        while (cur_chars) {
            /* Grab the link before the chunk is released. */
            MVMDecodeStreamChars *next_chars = cur_chars->next;
            /* Copy only what remains after the skipped prefix; copying the
             * chunk's full length from an offset would run past both the
             * source chunk and the destination buffer. */
            MVMint32 to_copy = cur_chars->length - skip;
            memcpy(result->body.storage.blob_32 + pos, cur_chars->chars + skip,
                to_copy * sizeof(MVMGrapheme32));
            pos += to_copy;
            skip = 0;

            /* The contents were copied, so both the character data and its
             * holder are no longer needed. */
            MVM_free(cur_chars->chars);
            MVM_free(cur_chars);
            cur_chars = next_chars;
        }
        ds->chars_head = ds->chars_tail = NULL;
        /* The list is empty now; clear the consumed offset so it cannot be
         * applied to a chunk that gets added later. */
        ds->chars_head_pos = 0;
    }

    return result;
}
Esempio n. 4
0
/* Works out the result of applying the case change `case_` to a synthetic
 * grapheme and stores it in the matching case_* / case_*_graphs pair of
 * `synth_info`. A grapheme count of zero together with the CASE_UNCHANGED
 * sentinel means the case change leaves the grapheme as it is. The `synth_g`
 * argument is not used by this function. Panics on an unknown `case_`. */
static void compute_case_change(MVMThreadContext *tc, MVMGrapheme32 synth_g, MVMNFGSynthetic *synth_info, MVMint32 case_) {
    const MVMCodepoint *cased_cps = NULL;
    MVMGrapheme32      *graphs    = NULL;
    MVMint32            num_graphs;
    MVMuint32           num_cased;

    /* Case change the base codepoint of the synthetic. */
    num_cased = MVM_unicode_get_case_change(tc,
        synth_info->codes[synth_info->base_index], case_, &cased_cps);

    if (num_cased != 0
            && (num_cased != 1 || cased_cps[0] != synth_info->codes[synth_info->base_index])) {
        /* The base changed. The outcome may be several graphemes, or may
         * now compose into a non-synthetic, so rebuild the sequence and run
         * it through the normalizer. Order of pushes:
         *   1. codepoints that precede the base in the synthetic (only
         *      present with Prepend codepoints),
         *   2. the first codepoint of the case change result,
         *   3. the codepoints that follow the base (generally Extend
         *      codepoints), so they attach to that first codepoint,
         *   4. the remainder of the case change result.
         * The normalizer applies canonical combining class sorting, so this
         * behaves sensibly whether the case change gave base + combiner or
         * base + base. */
        MVMNormalizer norm;
        MVMint32      taken = 0;

        MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);
        if (synth_info->base_index > 0)
            MVM_unicode_normalizer_push_codepoints(tc, &norm,
                synth_info->codes, synth_info->base_index);
        MVM_unicode_normalizer_push_codepoints(tc, &norm, cased_cps, 1);
        MVM_unicode_normalizer_push_codepoints(tc, &norm,
            synth_info->codes     + synth_info->base_index + 1,
            synth_info->num_codes - synth_info->base_index - 1);
        if (num_cased > 1)
            MVM_unicode_normalizer_push_codepoints(tc, &norm,
                cased_cps + 1, num_cased - 1);
        MVM_unicode_normalizer_eof(tc, &norm);

        /* Drain every grapheme the normalizer produced. */
        num_graphs = MVM_unicode_normalizer_available(tc, &norm);
        graphs     = MVM_malloc(num_graphs * sizeof(MVMGrapheme32));
        while (taken < num_graphs) {
            graphs[taken] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
            taken++;
        }
        MVM_unicode_normalizer_cleanup(tc, &norm);
    }
    else {
        /* Nothing came back, or the single result is the base itself: mark
         * the entry with the non-null sentinel and a zero count. */
        graphs     = CASE_UNCHANGED;
        num_graphs = 0;
    }

    /* Store the outcome in the slot for the requested case change. */
    if (case_ == MVM_unicode_case_change_type_upper) {
        synth_info->case_uc        = graphs;
        synth_info->case_uc_graphs = num_graphs;
    }
    else if (case_ == MVM_unicode_case_change_type_lower) {
        synth_info->case_lc        = graphs;
        synth_info->case_lc_graphs = num_graphs;
    }
    else if (case_ == MVM_unicode_case_change_type_title) {
        synth_info->case_tc        = graphs;
        synth_info->case_tc_graphs = num_graphs;
    }
    else if (case_ == MVM_unicode_case_change_type_fold) {
        synth_info->case_fc        = graphs;
        synth_info->case_fc_graphs = num_graphs;
    }
    else {
        MVM_panic(1, "NFG: invalid case change %d", case_);
    }
}
Esempio n. 5
0
/* Works out the result of applying the case change `case_` to a synthetic
 * grapheme and stores it in the matching case_* / case_*_graphs pair of
 * `synth_info`. A grapheme count of zero together with the CASE_UNCHANGED
 * sentinel means the case change leaves the grapheme as it is. The `synth`
 * argument is not used by this function. Panics on an unknown `case_`. */
static void compute_case_change(MVMThreadContext *tc, MVMGrapheme32 synth, MVMNFGSynthetic *synth_info, MVMint32 case_) {
    MVMGrapheme32 *result;
    MVMint32 num_result_graphs;

    /* Transform the base character. */
    const MVMCodepoint *result_cps;
    MVMuint32     num_result_cps = MVM_unicode_get_case_change(tc, synth_info->base,
        case_, &result_cps);
    /* Only a single-codepoint result equal to the base counts as "no
     * change"; a longer result that merely starts with the base still
     * alters the grapheme and has to go through the normalizer below. */
    if (num_result_cps == 0 || (num_result_cps == 1 && *result_cps == synth_info->base)) {
        /* Base character does not change, so grapheme stays the same. We
         * install a non-null sentinel for this case, and set the result
         * grapheme count to zero, which indicates no change. */
        result = CASE_UNCHANGED;
        num_result_graphs = 0;
    }
    else {
        /* We can potentially get multiple graphemes back. We may also get
         * into situations where we case change the base and suddenly we
         * can normalize the whole thing to a non-synthetic. So, we take
         * a trip through the normalizer. Note we push the first thing
         * we get back from the case change, then our combiners, and
         * finally anything else the case change produced. This should
         * do about the right thing for both case changes that produce a
         * base and a combiner, and those that produce a base and a base,
         * since the normalizer applies Unicode canonical sorting. */
        MVMNormalizer norm;
        MVMint32 i;
        MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);
        MVM_unicode_normalizer_push_codepoints(tc, &norm, result_cps, 1);
        MVM_unicode_normalizer_push_codepoints(tc, &norm, synth_info->combs,
            synth_info->num_combs);
        if (num_result_cps > 1)
            MVM_unicode_normalizer_push_codepoints(tc, &norm, result_cps + 1,
                num_result_cps - 1);
        MVM_unicode_normalizer_eof(tc, &norm);

        /* Drain every grapheme the normalizer produced into a new buffer,
         * which is then stored on synth_info below. */
        num_result_graphs = MVM_unicode_normalizer_available(tc, &norm);
        result = MVM_malloc(num_result_graphs * sizeof(MVMGrapheme32));
        for (i = 0; i < num_result_graphs; i++)
            result[i] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
        MVM_unicode_normalizer_cleanup(tc, &norm);
    }

    /* Store the outcome in the slot for the requested case change. */
    switch (case_) {
    case MVM_unicode_case_change_type_upper:
        synth_info->case_uc = result;
        synth_info->case_uc_graphs = num_result_graphs;
        break;
    case MVM_unicode_case_change_type_lower:
        synth_info->case_lc = result;
        synth_info->case_lc_graphs = num_result_graphs;
        break;
    case MVM_unicode_case_change_type_title:
        synth_info->case_tc = result;
        synth_info->case_tc_graphs = num_result_graphs;
        break;
    case MVM_unicode_case_change_type_fold:
        synth_info->case_fc = result;
        synth_info->case_fc_graphs = num_result_graphs;
        break;
    default:
        MVM_panic(1, "NFG: invalid case change %d", case_);
    }
}