Esempio n. 1
0
/* Coerces the string s to a floating point number.
 *
 * Returns 0 when get_cp reports that no codepoint could be fetched, or when
 * nothing but whitespace is found. Otherwise the value produced by parse_real
 * is returned; anything other than whitespace following the number is
 * reported through parse_error with the message "trailing characters". */
MVMnum64 MVM_coerce_s_n(MVMThreadContext *tc, MVMString *s) {
    MVMCodepointIter iter;
    MVMCodepoint     cur;
    MVMnum64         value;
    int              leftover;

    MVM_string_ci_init(tc, &iter, s, 0, 0);

    /* A non-zero result from get_cp means there is nothing to parse. */
    if (get_cp(tc, &iter, &cur))
        return 0;

    skip_whitespace(tc, &iter, &cur);

    /* Iterator exhausted and current codepoint is the end marker: the input
     * consisted of whitespace only. */
    if (!(MVM_string_ci_has_more(tc, &iter) || cur != END_OF_NUM))
        return 0;

    value = parse_real(tc, &iter, &cur, s);

    /* Whitespace after the number is tolerated; anything else is not. */
    skip_whitespace(tc, &iter, &cur);
    leftover = MVM_string_ci_has_more(tc, &iter) || cur != END_OF_NUM;
    if (leftover)
        parse_error(tc, s, "trailing characters");

    return value;
}
Esempio n. 2
0
/* Takes an NFG string and fills the array out, which must be a native array
 * of 32-bit integers, with the string's codepoints normalized according to
 * the requested normalization form.
 *
 * The codepoints are collected in a freshly allocated buffer that is then
 * installed as the array's slot storage.
 * NOTE(review): the slots pointer previously held by out is overwritten here
 * without being released - confirm callers always pass a fresh array. */
void MVM_unicode_string_to_codepoints(MVMThreadContext *tc, MVMString *s, MVMNormalization form, MVMObject *out) {
    MVMCodepointIter  iter;
    MVMCodepoint     *buf;
    MVMint64          buf_cap;
    MVMint64          used;
    MVMArray         *dest = (MVMArray *)out;

    /* Check the output array, then size the buffer from the grapheme count;
     * it is grown on demand below. */
    assert_codepoint_array(tc, out, "Normalization output must be native array of 32-bit integers");
    buf_cap = s->body.num_graphs;
    buf     = MVM_malloc(buf_cap * sizeof(MVMCodepoint));
    used    = 0;

    MVM_string_ci_init(tc, &iter, s);

    if (form != MVM_NORMALIZE_NFC) {
        /* Any other form has to go through a normalizer. */
        MVMNormalizer norm;
        MVMint32      pending;

        MVM_unicode_normalizer_init(tc, &norm, form);
        while (MVM_string_ci_has_more(tc, &iter)) {
            MVMCodepoint first;
            pending = MVM_unicode_normalizer_process_codepoint(tc, &norm,
                MVM_string_ci_get_codepoint(tc, &iter), &first);
            if (!pending)
                continue;

            /* The first ready codepoint comes back through the out
             * parameter; the remaining ones are fetched one at a time. */
            maybe_grow_result(&buf, &buf_cap, used + pending);
            buf[used++] = first;
            for (pending--; pending > 0; pending--)
                buf[used++] = MVM_unicode_normalizer_get_codepoint(tc, &norm);
        }

        /* Signal end of input and drain whatever is still available. */
        MVM_unicode_normalizer_eof(tc, &norm);
        pending = MVM_unicode_normalizer_available(tc, &norm);
        maybe_grow_result(&buf, &buf_cap, used + pending);
        for (; pending; pending--)
            buf[used++] = MVM_unicode_normalizer_get_codepoint(tc, &norm);
        MVM_unicode_normalizer_cleanup(tc, &norm);
    }
    else {
        /* NFG is constructed out of NFC, so for NFC a plain walk over the
         * codepoints is enough. */
        for (; MVM_string_ci_has_more(tc, &iter); used++) {
            maybe_grow_result(&buf, &buf_cap, used + 1);
            buf[used] = MVM_string_ci_get_codepoint(tc, &iter);
        }
    }

    /* Hand the buffer over to the array body. */
    dest->body.slots.u32 = buf;
    dest->body.start     = 0;
    dest->body.elems     = used;
}
Esempio n. 3
0
/* Encodes the specified substring to latin-1. Anything outside of latin-1 range
 * will become a ?. The result string is NUL terminated, but the size reported
 * through output_size (when it is non-NULL) excludes the terminator.
 *
 * start is the offset of the first grapheme to encode and length the number of
 * graphemes, with -1 meaning "up to the end of the string". An adhoc exception
 * is thrown when start or length fall outside the string. The returned buffer
 * is allocated with MVM_malloc and is owned by the caller. */
char * MVM_string_latin1_encode_substr(MVMThreadContext *tc, MVMString *str, MVMuint64 *output_size, MVMint64 start, MVMint64 length) {
    MVMStringIndex strgraphs = MVM_string_graphs(tc, str);
    MVMuint32 startu;
    MVMuint32 lengthu;
    MVMuint8 *result;

    /* must check start first since it's used in the length check */
    if (start < 0 || start > strgraphs)
        MVM_exception_throw_adhoc(tc, "start out of range");
    /* Validate the full 64-bit length: narrowing it to 32 bits first would
     * let an oversized value wrap around and slip through the check. */
    if (length < -1 || (length != -1 && length > (MVMint64)strgraphs - start))
        MVM_exception_throw_adhoc(tc, "length out of range");

    startu  = (MVMuint32)start;
    lengthu = length == -1 ? (MVMuint32)(strgraphs - startu) : (MVMuint32)length;

    /* Widen before adding the terminator byte so the sum cannot wrap. */
    result = MVM_malloc((size_t)lengthu + 1);
    if (str->body.storage_type == MVM_STRING_GRAPHEME_ASCII) {
        /* No encoding needed; directly copy, beginning at the requested
         * offset rather than at the start of the blob. */
        memcpy(result, str->body.storage.blob_ascii + startu, lengthu);
        result[lengthu] = 0;
    }
    else {
        MVMuint32 skipped = 0;
        MVMuint32 written = 0;
        MVMCodepointIter ci;
        MVM_string_ci_init(tc, &ci, str);

        /* Step over everything before the requested start.
         * NOTE(review): the iterator yields codepoints, so a grapheme made of
         * several codepoints counts once per codepoint here - confirm whether
         * a grapheme based walk is wanted instead. */
        while (skipped < startu && MVM_string_ci_has_more(tc, &ci)) {
            MVM_string_ci_get_codepoint(tc, &ci);
            skipped++;
        }

        /* Never write more than the lengthu bytes that were allocated (plus
         * the terminator), however many codepoints the iterator has left. */
        while (written < lengthu && MVM_string_ci_has_more(tc, &ci)) {
            MVMCodepoint ord = MVM_string_ci_get_codepoint(tc, &ci);
            if (ord >= 0 && ord <= 255)
                result[written] = (MVMuint8)ord;
            else
                result[written] = '?';
            written++;
        }
        result[written] = 0;

        /* Report what was actually produced. */
        lengthu = written;
    }
    if (output_size)
        *output_size = lengthu;
    return (char *)result;
}
Esempio n. 4
0
/* Encodes the specified substring to latin-1. The result string is NUL
 * terminated, but the size reported through output_size (when it is non-NULL)
 * excludes the terminator.
 *
 * start is the offset of the first grapheme to encode and length the number of
 * graphemes, with -1 meaning "up to the end of the string". A codepoint
 * outside of the latin-1 range is substituted with the latin-1 encoding of
 * replacement when one is given; without a replacement an adhoc exception is
 * thrown. translate_newlines is handed on to the codepoint iterator. An adhoc
 * exception is also thrown when start or length fall outside the string. The
 * returned buffer is heap allocated and owned by the caller. */
char * MVM_string_latin1_encode_substr(MVMThreadContext *tc, MVMString *str, MVMuint64 *output_size, MVMint64 start, MVMint64 length,
        MVMString *replacement, MVMint32 translate_newlines) {
    /* Latin-1 is a single byte encoding, but \r\n is a 2-byte grapheme, so we
     * may have to resize as we go. */
    MVMStringIndex strgraphs = MVM_string_graphs(tc, str);
    MVMuint32 startu;
    MVMuint32 lengthu;
    MVMuint8 *result;
    size_t result_alloc;
    MVMuint8 *repl_bytes = NULL;
    MVMuint64 repl_length = 0;

    /* must check start first since it's used in the length check */
    if (start < 0 || start > strgraphs)
        MVM_exception_throw_adhoc(tc, "start out of range");
    /* Validate the full 64-bit length: narrowing it to 32 bits first would
     * let an oversized value wrap around and slip through the check. */
    if (length < -1 || (length != -1 && length > (MVMint64)strgraphs - start))
        MVM_exception_throw_adhoc(tc, "length out of range");

    startu  = (MVMuint32)start;
    lengthu = length == -1 ? (MVMuint32)(strgraphs - startu) : (MVMuint32)length;

    /* Encode the replacement once up front; it is spliced in byte-wise for
     * every codepoint that cannot be represented. */
    if (replacement)
        repl_bytes = (MVMuint8 *) MVM_string_latin1_encode_substr(tc,
            replacement, &repl_length, 0, -1, NULL, translate_newlines);

    result_alloc = lengthu;
    result = MVM_malloc(result_alloc + 1);
    if (str->body.storage_type == MVM_STRING_GRAPHEME_ASCII) {
        /* No encoding needed; directly copy, beginning at the requested
         * offset rather than at the start of the blob. */
        memcpy(result, str->body.storage.blob_ascii + startu, lengthu);
        result[lengthu] = 0;
        if (output_size)
            *output_size = lengthu;
    }
    else {
        /* NOTE(review): this path walks the codepoints of the whole string
         * and does not apply start/length; the buffer grows as required, so
         * it is memory safe, but a true substring request would need a
         * grapheme based walk - confirm against callers. */
        MVMuint32 i = 0;
        MVMCodepointIter ci;
        MVM_string_ci_init(tc, &ci, str, translate_newlines);
        while (MVM_string_ci_has_more(tc, &ci)) {
            MVMCodepoint ord = MVM_string_ci_get_codepoint(tc, &ci);
            /* Keep room for at least one more byte plus the terminator. */
            if (i == result_alloc) {
                result_alloc += 8;
                result = MVM_realloc(result, result_alloc + 1);
            }
            if (ord >= 0 && ord <= 255) {
                result[i] = (MVMuint8)ord;
                i++;
            }
            else if (replacement) {
                /* Make sure the whole replacement fits before copying it. */
                if (repl_length >= result_alloc || i >= result_alloc - repl_length) {
                    result_alloc += repl_length;
                    result = MVM_realloc(result, result_alloc + 1);
                }
                memcpy(result + i, repl_bytes, repl_length);
                i += repl_length;
            }
            else {
                /* Release the buffers first so nothing is left allocated when
                 * the exception is thrown. */
                MVM_free(result);
                MVM_free(repl_bytes);
                MVM_exception_throw_adhoc(tc,
                    "Error encoding Latin-1 string: could not encode codepoint %d",
                    ord);
            }
        }
        result[i] = 0;
        if (output_size)
            *output_size = i;
    }
    MVM_free(repl_bytes);
    return (char *)result;
}
Esempio n. 5
0
/* MVM_unicode_string_compare implements the Unicode Collation Algorthm */
MVMint64 MVM_unicode_string_compare(MVMThreadContext *tc, MVMString *a, MVMString *b,
         MVMint64 collation_mode, MVMint64 lang_mode, MVMint64 country_mode) {
    MVMStringIndex alen, blen;
    /* Iteration variables */
    MVMCodepointIter a_ci, b_ci;
    MVMGrapheme32 ai, bi;
    /* Set it all to 0 to start with. We alter this based on the collation_mode later on */
    level_eval level_eval_settings = {
        { {0,0,0}, {0,0,0}, {0,0,0}, {0,0,0} }
    };
    /* The default level_eval settings, used between two non-equal levels */
    union level_eval_u2 level_eval_default = {
        {-1, 0, 1}
    };
    /* Collation stacks */
    collation_stack stack_a;
    collation_stack stack_b;

    ring_buffer buf_a, buf_b;
    /* This value stores what the return value would be if the strings were compared
     * by codepoint. This is used to break collation value ties */
    MVMint64 compare_by_cp_rtrn = 0;
    MVMint64 pos_a = 0, pos_b = 0, i = 0, rtrn = 0;
    MVMint16 grab_a_done = 0, grab_b_done = 0;
    /* From 0 to 2 for primary, secondary, tertiary levels */
    MVMint16   level_a = 0,   level_b = 0;
    MVMint64 skipped_a = 0, skipped_b = 0;
    /* This code sets up level_eval_settings based on the collation_mode */
    #define setmodeup(mode, level, Less, Same, More) {\
        if (collation_mode & mode) {\
            level_eval_settings.a[level].a2[0] +=  Less;\
            level_eval_settings.a[level].a2[1] +=  Same;\
            level_eval_settings.a[level].a2[2] +=  More;\
        }\
    }
    /* Primary */
    setmodeup(MVM_COLLATION_PRIMARY_POSITIVE,    0, -1, 0,  1);
    setmodeup(MVM_COLLATION_PRIMARY_NEGATIVE,    0,  1, 0, -1);
    /* Secondary */
    setmodeup(MVM_COLLATION_SECONDARY_POSITIVE,  1, -1, 0,  1);
    setmodeup(MVM_COLLATION_SECONDARY_NEGATIVE,  1,  1, 0, -1);
    /* Tertiary */
    setmodeup(MVM_COLLATION_TERTIARY_POSITIVE,   2, -1, 0,  1);
    setmodeup(MVM_COLLATION_TERTIARY_NEGATIVE,   2,  1, 0, -1);
    /* Quaternary */
    setmodeup(MVM_COLLATION_QUATERNARY_POSITIVE, 3, -1, 0,  1);
    setmodeup(MVM_COLLATION_QUATERNARY_NEGATIVE, 3,  1, 0, -1);
    DEBUG_COLLATION_MODE_PRINT(level_eval_settings);

    init_stack(tc, &stack_a);
    init_stack(tc, &stack_b);
    MVM_string_check_arg(tc, a, "compare");
    MVM_string_check_arg(tc, b, "compare");
    /* Simple cases when one or both are zero length. */
    alen = MVM_string_graphs_nocheck(tc, a);
    blen = MVM_string_graphs_nocheck(tc, b);
    if (alen == 0 || blen == 0)
        return collation_return_by_quaternary(tc, &level_eval_settings, alen, blen, 0);

    /* Initialize a codepoint iterator
     * For now we decompose utf8-c8 synthetics. Eventually we may want to pass
     * them back and choose some way to generate sorting info for them, similar
     * to how Unassigned codepoints are dealt with */
    MVMROOT(tc, a_ci, {
        MVM_string_ci_init(tc, &a_ci, a, 0, 0);
        MVMROOT(tc, b_ci, {
            MVM_string_ci_init(tc, &b_ci, b, 0, 0);
        });
    });