Beispiel #1
0
/* Runs a single codepoint through an NFD normalizer and, when the normalized
 * output differs from the input (a different codepoint, or more than one
 * codepoint), passes the normalized codepoints to collation_push_cp.
 *
 * Returns the value collation_push_cp produced, or 0 when nothing was
 * pushed. The temporary codepoint buffer and the normalizer are both
 * released before returning. */
static MVMint32 NFD_and_push_collation_values (MVMThreadContext *tc, MVMCodepoint cp, collation_stack *stack, MVMCodepointIter *ci, char *name) {
    MVMNormalizer norm;
    MVMCodepoint cp_out;
    MVMint32 ready,
             result_pos  = 0;
    MVMCodepoint *result = MVM_malloc(sizeof(MVMCodepoint) * initial_collation_norm_buf_size);
    MVMint32 result_size = initial_collation_norm_buf_size;
    MVMint64 rtrn        = 0;
    MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFD);
    ready = MVM_unicode_normalizer_process_codepoint(tc, &norm, cp, &cp_out);
    if (ready) {
        /* Size the buffer from the number of codepoints that are actually
         * ready, rather than assuming one fixed increment is enough. */
        if (result_size <= result_pos + ready) {
            result_size = result_pos + ready + initial_collation_norm_buf_size;
            result      = MVM_realloc(result, sizeof(MVMCodepoint) * result_size);
        }
        /* The first codepoint is handed back directly; the rest have to be
         * fetched from the normalizer. */
        result[result_pos++] = cp_out;
        while (0 < --ready)
            result[result_pos++] = MVM_unicode_normalizer_get_codepoint(tc, &norm);
    }
    /* Signal end of input and drain whatever the normalizer still holds. */
    MVM_unicode_normalizer_eof(tc, &norm);
    ready = MVM_unicode_normalizer_available(tc, &norm);
    if (result_size <= result_pos + ready) {
        result_size = result_pos + ready + initial_collation_norm_buf_size;
        result      = MVM_realloc(result, sizeof(MVMCodepoint) * result_size);
    }
    while (ready--)
        result[result_pos++] = MVM_unicode_normalizer_get_codepoint(tc, &norm);
    /* The normalizer is not needed past this point; release it so its
     * internal storage is not leaked. */
    MVM_unicode_normalizer_cleanup(tc, &norm);
    /* If the codepoint changed or we now have more than before. The
     * result_pos check keeps us from reading result[0] when nothing was
     * written to it. */
    if (0 < result_pos && (result[0] != cp || 1 < result_pos))
        rtrn = collation_push_cp(tc, stack, ci, result, result_pos, name);
    MVM_free(result);
    return rtrn;
}
Beispiel #2
0
/* Builds a fresh decode stream: the structure is obtained from MVM_calloc,
 * the requested encoding and absolute byte position are recorded, and the
 * embedded normalizer is set up for NFG. The new stream is returned. */
MVMDecodeStream * MVM_string_decodestream_create(MVMThreadContext *tc, MVMint32 encoding, MVMint64 abs_byte_pos) {
    MVMDecodeStream *stream = MVM_calloc(1, sizeof(*stream));

    stream->abs_byte_pos = abs_byte_pos;
    stream->encoding     = encoding;
    MVM_unicode_normalizer_init(tc, &stream->norm, MVM_NORMALIZE_NFG);

    return stream;
}
Beispiel #3
0
/* Decides whether a and b can be concatenated without any NFG
 * re-normalization at the join. A non-zero result means the join is already
 * stable; zero means the caller has to renormalize. */
MVMint32 MVM_nfg_is_concat_stable(MVMThreadContext *tc, MVMString *a, MVMString *b) {
    MVMGrapheme32 tail;
    MVMGrapheme32 head;
    MVMGrapheme32 crlf_synth;
    MVMNormalizer norm;
    int           should_break;

    /* Nothing can interact across the join when one side is empty. */
    if (!a->body.num_graphs || !b->body.num_graphs)
        return 1;

    /* Only the grapheme on each side of the join matters. */
    tail = MVM_string_get_grapheme_at_nocheck(tc, a, a->body.num_graphs - 1);
    head = MVM_string_get_grapheme_at_nocheck(tc, b, 0);

    /* Fast path for appending a line ending: a leading "\n" is only a
     * problem when it lands directly after "\r". */
    if (head == '\n')
        return tail != '\r';

    /* The "\r\n" grapheme on either side of the join always means we
     * report the join as unstable. */
    crlf_synth = MVM_nfg_crlf_grapheme(tc);
    if (tail == crlf_synth || head == crlf_synth)
        return 0;

    /* Any other synthetic (negative grapheme value): assume renormalization
     * is needed. This is likely an over-estimate, but it also guards the
     * codepoint-level checks below, which must not see synthetics.
     * TODO: take the last codepoint of the tail and the first codepoint of
     * the head and ask MVM_unicode_normalize_should_break instead. */
    if (tail < 0 || head < 0)
        return 0;

    /* Both below the first codepoint that is significant for NFC. */
    if (tail < MVM_NORMALIZE_FIRST_SIG_NFC && head < MVM_NORMALIZE_FIRST_SIG_NFC)
        return 1;

    /* Ask the normalizer whether a grapheme break falls between the two
     * codepoints. We cannot see what precedes the tail, so because of the
     * special rules for Regional Indicators we pretend the previous
     * codepoint was one; should_break then yields its special value 2 when
     * both tail and head are regional indicators, so the caller
     * renormalizes regardless of what actually came before. */
    MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);
    norm.regional_indicator = 1;
    should_break = MVM_unicode_normalize_should_break(tc, tail, head, &norm);
    MVM_unicode_normalizer_cleanup(tc, &norm);

    /* When at least one side has a zero relative CCC, no reordering can
     * happen and the break decision stands. */
    if (MVM_unicode_relative_ccc(tc, tail) == 0 || MVM_unicode_relative_ccc(tc, head) == 0)
        return should_break;

    /* Both CCC values are non-zero, so the codepoints may need reordering.
     * Report unstable for now; this could be refined. */
    return 0;
}
Beispiel #4
0
/* Builds a fresh decode stream. The structure is obtained from MVM_calloc,
 * the encoding and absolute byte position are recorded, the initial result
 * size guess is set to 64, and the embedded normalizer is set up for NFG.
 * A non-zero translate_newlines additionally switches on newline
 * translation in that normalizer. The new stream is returned. */
MVMDecodeStream * MVM_string_decodestream_create(MVMThreadContext *tc, MVMint32 encoding,
        MVMint64 abs_byte_pos, MVMint32 translate_newlines) {
    MVMDecodeStream *stream = MVM_calloc(1, sizeof(*stream));

    stream->result_size_guess = 64;
    stream->abs_byte_pos      = abs_byte_pos;
    stream->encoding          = encoding;

    MVM_unicode_normalizer_init(tc, &stream->norm, MVM_NORMALIZE_NFG);
    if (translate_newlines != 0)
        MVM_unicode_normalizer_translate_newlines(tc, &stream->norm);

    return stream;
}
Beispiel #5
0
/* Writes the codepoints of the NFG string s, normalized to the requested
 * form, into out. out has to pass assert_codepoint_array (a native array of
 * 32-bit integers, per the error message); its slots, start and elems are
 * overwritten with a newly allocated buffer holding the result. */
void MVM_unicode_string_to_codepoints(MVMThreadContext *tc, MVMString *s, MVMNormalization form, MVMObject *out) {
    MVMCodepointIter  iter;
    MVMCodepoint     *buf;
    MVMint64          used, capacity;
    MVMArray         *dest;

    /* Check the destination, then start with room for one codepoint per
     * grapheme of the input. */
    assert_codepoint_array(tc, out, "Normalization output must be native array of 32-bit integers");
    capacity = s->body.num_graphs;
    buf      = MVM_malloc(capacity * sizeof(MVMCodepoint));
    used     = 0;

    /* Walk the string codepoint by codepoint. */
    MVM_string_ci_init(tc, &iter, s);

    if (form != MVM_NORMALIZE_NFC) {
        /* Any form other than NFC has to go through a normalizer. */
        MVMNormalizer norm;
        MVMint32      pending;

        MVM_unicode_normalizer_init(tc, &norm, form);
        while (MVM_string_ci_has_more(tc, &iter)) {
            MVMCodepoint first;
            pending = MVM_unicode_normalizer_process_codepoint(tc, &norm, MVM_string_ci_get_codepoint(tc, &iter), &first);
            if (pending == 0)
                continue;
            /* The first ready codepoint comes back directly; the others
             * are fetched from the normalizer. */
            maybe_grow_result(&buf, &capacity, used + pending);
            buf[used++] = first;
            for (pending--; pending > 0; pending--)
                buf[used++] = MVM_unicode_normalizer_get_codepoint(tc, &norm);
        }

        /* End of input: drain what the normalizer is still holding. */
        MVM_unicode_normalizer_eof(tc, &norm);
        pending = MVM_unicode_normalizer_available(tc, &norm);
        maybe_grow_result(&buf, &capacity, used + pending);
        for (; pending != 0; pending--)
            buf[used++] = MVM_unicode_normalizer_get_codepoint(tc, &norm);
        MVM_unicode_normalizer_cleanup(tc, &norm);
    }
    else {
        /* NFG is built on top of NFC, so plain iteration already yields
         * NFC codepoints. */
        for (; MVM_string_ci_has_more(tc, &iter); used++) {
            maybe_grow_result(&buf, &capacity, used + 1);
            buf[used] = MVM_string_ci_get_codepoint(tc, &iter);
        }
    }

    /* Install the buffer as the body of the output array. */
    dest = (MVMArray *)out;
    dest->body.slots.u32 = buf;
    dest->body.start     = 0;
    dest->body.elems     = used;
}
Beispiel #6
0
/* Builds an NFG string from an array of codepoints.
 *
 * codes has to pass assert_codepoint_array (a native array of 32-bit
 * integers, per the error message). Its elements are fed one at a time
 * through an NFG normalizer, and the graphemes that come out are collected
 * in a newly allocated buffer which becomes the 32-bit grapheme storage of
 * the returned string. An empty input yields the shared empty string
 * constant without allocating anything. */
MVMString * MVM_unicode_codepoints_to_nfg_string(MVMThreadContext *tc, MVMObject *codes) {
    MVMNormalizer  norm;
    MVMCodepoint  *input;
    MVMGrapheme32 *result;
    MVMint64       input_pos, input_codes, result_pos, result_alloc;
    MVMint32       ready;
    MVMString     *str;

    /* Get input array; if it's empty, we're done already. */
    assert_codepoint_array(tc, codes, "Code points to string input must be native array of 32-bit integers");
    input       = (MVMCodepoint *)((MVMArray *)codes)->body.slots.u32 + ((MVMArray *)codes)->body.start;
    input_codes = ((MVMArray *)codes)->body.elems;
    if (input_codes == 0)
        return tc->instance->str_consts.empty;

    /* Guess output size based on input size. The buffer stores graphemes,
     * so it is sized by the grapheme type rather than the codepoint type. */
    result_alloc = input_codes;
    result       = MVM_malloc(result_alloc * sizeof(MVMGrapheme32));

    /* Perform normalization at grapheme level. */
    MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);
    input_pos  = 0;
    result_pos = 0;
    while (input_pos < input_codes) {
        MVMGrapheme32 g;
        ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &norm, input[input_pos], &g);
        if (ready) {
            /* The first ready grapheme is handed back in g; any further
             * ones are fetched from the normalizer. */
            maybe_grow_result(&result, &result_alloc, result_pos + ready);
            result[result_pos++] = g;
            while (--ready > 0)
                result[result_pos++] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
        }
        input_pos++;
    }

    /* Signal end of input and drain whatever the normalizer still holds. */
    MVM_unicode_normalizer_eof(tc, &norm);
    ready = MVM_unicode_normalizer_available(tc, &norm);
    maybe_grow_result(&result, &result_alloc, result_pos + ready);
    while (ready--)
        result[result_pos++] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
    MVM_unicode_normalizer_cleanup(tc, &norm);

    /* Produce an MVMString of the result; the buffer is handed over to the
     * string as its storage. */
    str = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
    str->body.storage.blob_32 = result;
    str->body.storage_type    = MVM_STRING_GRAPHEME_32;
    str->body.num_graphs      = result_pos;
    return str;
}
Beispiel #7
0
/* Normalizes the codepoints held in the array in to the requested form and
 * stores them in the array out. Both objects have to pass
 * assert_codepoint_array (native arrays of 32-bit integers, per the error
 * messages). When in has no elements the function returns without touching
 * out; otherwise out's slots, start and elems are overwritten with a newly
 * allocated buffer holding the result. */
void MVM_unicode_normalize_codepoints(MVMThreadContext *tc, MVMObject *in, MVMObject *out, MVMNormalization form) {
    MVMNormalizer  norm;
    MVMArray      *src;
    MVMArray      *dst;
    MVMCodepoint  *codes;
    MVMCodepoint  *buf;
    MVMint64       idx, n_codes, used, capacity;
    MVMint32       pending;

    /* Both arrays are checked before anything is read. */
    assert_codepoint_array(tc, in, "Normalization input must be native array of 32-bit integers");
    assert_codepoint_array(tc, out, "Normalization output must be native array of 32-bit integers");

    /* Locate the live part of the input; nothing to do when it is empty. */
    src     = (MVMArray *)in;
    codes   = (MVMCodepoint *)src->body.slots.u32 + src->body.start;
    n_codes = src->body.elems;
    if (n_codes == 0)
        return;

    /* Start with as much room as there are input codepoints. */
    capacity = n_codes;
    buf      = MVM_malloc(capacity * sizeof(MVMCodepoint));
    used     = 0;

    /* Feed every input codepoint through the normalizer. */
    MVM_unicode_normalizer_init(tc, &norm, form);
    for (idx = 0; idx < n_codes; idx++) {
        MVMCodepoint first;
        pending = MVM_unicode_normalizer_process_codepoint(tc, &norm, codes[idx], &first);
        if (pending == 0)
            continue;
        /* The first ready codepoint comes back directly; the others are
         * fetched from the normalizer. */
        maybe_grow_result(&buf, &capacity, used + pending);
        buf[used++] = first;
        for (pending--; pending > 0; pending--)
            buf[used++] = MVM_unicode_normalizer_get_codepoint(tc, &norm);
    }

    /* End of input: drain what the normalizer is still holding. */
    MVM_unicode_normalizer_eof(tc, &norm);
    pending = MVM_unicode_normalizer_available(tc, &norm);
    maybe_grow_result(&buf, &capacity, used + pending);
    for (; pending != 0; pending--)
        buf[used++] = MVM_unicode_normalizer_get_codepoint(tc, &norm);
    MVM_unicode_normalizer_cleanup(tc, &norm);

    /* Install the buffer as the body of the output array. */
    dst = (MVMArray *)out;
    dst->body.slots.u32 = buf;
    dst->body.start     = 0;
    dst->body.elems     = used;
}
Beispiel #8
0
/* Works out the result of applying the case change case_ to a synthetic
 * and caches it in the matching case_* / case_*_graphs pair of synth_info.
 * A grapheme count of zero together with the CASE_UNCHANGED sentinel marks
 * "this case change leaves the synthetic as it is". An unrecognised case_
 * value ends in MVM_panic. */
static void compute_case_change(MVMThreadContext *tc, MVMGrapheme32 synth_g, MVMNFGSynthetic *synth_info, MVMint32 case_) {
    const MVMCodepoint *changed_cps = NULL;
    MVMGrapheme32      *graphs      = NULL;
    MVMint32            n_graphs;
    MVMint32            base_at     = synth_info->base_index;
    MVMCodepoint        base_cp     = synth_info->codes[base_at];
    int                 unchanged;

    /* Case change the base character only. */
    MVMuint32 n_changed = MVM_unicode_get_case_change(tc, base_cp, case_, &changed_cps);

    unchanged = n_changed == 0 || (n_changed == 1 && changed_cps[0] == base_cp);
    if (!unchanged) {
        /* The case change may yield several graphemes, or may make the
         * whole thing normalize to a non-synthetic, so the pieces are run
         * through a normalizer in this order:
         *   1. whatever sits before the base in the synthetic (only
         *      happens with Prepend codepoints),
         *   2. the first codepoint of the case change,
         *   3. the codepoints after the base (generally Extend
         *      codepoints), so they attach to that first codepoint,
         *   4. the rest of the case change, if any.
         * Since the normalizer applies canonical combining class sorting,
         * this should do about the right thing both for a base plus
         * combiner and for a base plus base result. */
        MVMNormalizer norm;
        MVMint32      k;

        MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);
        if (base_at > 0)
            MVM_unicode_normalizer_push_codepoints(tc, &norm, synth_info->codes, base_at);
        MVM_unicode_normalizer_push_codepoints(tc, &norm, changed_cps, 1);
        MVM_unicode_normalizer_push_codepoints(tc, &norm,
            synth_info->codes + base_at + 1,
            synth_info->num_codes - base_at - 1);
        if (n_changed > 1)
            MVM_unicode_normalizer_push_codepoints(tc, &norm, changed_cps + 1, n_changed - 1);
        MVM_unicode_normalizer_eof(tc, &norm);

        /* Collect every grapheme the normalizer produced. */
        n_graphs = MVM_unicode_normalizer_available(tc, &norm);
        graphs   = MVM_malloc(n_graphs * sizeof(MVMGrapheme32));
        k = 0;
        while (k < n_graphs) {
            graphs[k] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
            k++;
        }
        MVM_unicode_normalizer_cleanup(tc, &norm);
    }
    else {
        /* The base is untouched, hence so is the grapheme: store the
         * non-null sentinel with a count of zero. */
        graphs   = CASE_UNCHANGED;
        n_graphs = 0;
    }

    /* Store the outcome in the slot belonging to this kind of change. */
    switch (case_) {
        case MVM_unicode_case_change_type_fold:
            synth_info->case_fc_graphs = n_graphs;
            synth_info->case_fc        = graphs;
            break;
        case MVM_unicode_case_change_type_title:
            synth_info->case_tc_graphs = n_graphs;
            synth_info->case_tc        = graphs;
            break;
        case MVM_unicode_case_change_type_lower:
            synth_info->case_lc_graphs = n_graphs;
            synth_info->case_lc        = graphs;
            break;
        case MVM_unicode_case_change_type_upper:
            synth_info->case_uc_graphs = n_graphs;
            synth_info->case_uc        = graphs;
            break;
        default:
            MVM_panic(1, "NFG: invalid case change %d", case_);
    }
}
Beispiel #9
0
/* Works out the result of applying the case change case_ to a synthetic
 * and caches it in the matching case_* / case_*_graphs pair of synth_info.
 *
 * Only the base character is case changed. When that yields nothing, or
 * exactly one codepoint equal to the base, the CASE_UNCHANGED sentinel is
 * stored with a grapheme count of zero. Otherwise the changed base and the
 * synthetic's combiners are run through an NFG normalizer and the resulting
 * graphemes are stored in a newly allocated array. An unrecognised case_
 * value ends in MVM_panic. */
static void compute_case_change(MVMThreadContext *tc, MVMGrapheme32 synth, MVMNFGSynthetic *synth_info, MVMint32 case_) {
    MVMGrapheme32 *result = NULL;
    MVMint32 num_result_graphs;

    /* Transform the base character. */
    const MVMCodepoint *result_cps = NULL;
    MVMuint32     num_result_cps = MVM_unicode_get_case_change(tc, synth_info->base,
        case_, &result_cps);
    /* A result of several codepoints is always a change, even when its
     * first codepoint happens to equal the base, so the comparison only
     * counts for single-codepoint results. */
    if (num_result_cps == 0 || (num_result_cps == 1 && *result_cps == synth_info->base)) {
        /* Base character does not change, so grapheme stays the same. We
         * install a non-null sentinel for this case, and set the result
         * grapheme count to zero, which indicates no change. */
        result = CASE_UNCHANGED;
        num_result_graphs = 0;
    }
    else {
        /* We can potentially get multiple graphemes back. We may also get
         * into situations where we case change the base and suddenly we
         * can normalize the whole thing to a non-synthetic. So, we take
         * a trip through the normalizer. Note we push the first thing
         * we get back from the case change, then our combiners, and
         * finally anything else the case change produced. This should
         * do about the right thing for both case changes that produce a
         * base and a combiner, and those that produce a base and a base,
         * since the normalizer applies Unicode canonical sorting. */
        MVMNormalizer norm;
        MVMint32 i;
        MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);
        MVM_unicode_normalizer_push_codepoints(tc, &norm, result_cps, 1);
        MVM_unicode_normalizer_push_codepoints(tc, &norm, synth_info->combs,
            synth_info->num_combs);
        if (num_result_cps > 1)
            MVM_unicode_normalizer_push_codepoints(tc, &norm, result_cps + 1,
                num_result_cps - 1);
        MVM_unicode_normalizer_eof(tc, &norm);

        /* Collect every grapheme the normalizer produced. */
        num_result_graphs = MVM_unicode_normalizer_available(tc, &norm);
        result = MVM_malloc(num_result_graphs * sizeof(MVMGrapheme32));
        for (i = 0; i < num_result_graphs; i++)
            result[i] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
        MVM_unicode_normalizer_cleanup(tc, &norm);
    }

    /* Store the outcome in the slot belonging to this kind of change. */
    switch (case_) {
    case MVM_unicode_case_change_type_upper:
        synth_info->case_uc = result;
        synth_info->case_uc_graphs = num_result_graphs;
        break;
    case MVM_unicode_case_change_type_lower:
        synth_info->case_lc = result;
        synth_info->case_lc_graphs = num_result_graphs;
        break;
    case MVM_unicode_case_change_type_title:
        synth_info->case_tc = result;
        synth_info->case_tc_graphs = num_result_graphs;
        break;
    case MVM_unicode_case_change_type_fold:
        synth_info->case_fc = result;
        synth_info->case_fc_graphs = num_result_graphs;
        break;
    default:
        MVM_panic(1, "NFG: invalid case change %d", case_);
    }
}