Exemplo n.º 1
0
/* Decides whether a codepoint should be treated as a spacing mark. A
 * codepoint whose General_Category string starts with "Mc" qualifies unless
 * its Grapheme_Cluster_Break property is "Extend"; any other codepoint only
 * qualifies if it is one of the two special cases listed below. Returns
 * non-zero for a spacing mark and zero otherwise. */
static MVMint32 is_spacing_mark(MVMThreadContext *tc, MVMCodepoint cp) {
    const char *category = MVM_unicode_codepoint_get_property_cstr(tc, cp,
        MVM_UNICODE_PROPERTY_GENERAL_CATEGORY);
    const char *break_class;

    if (category[0] != 'M' || category[1] != 'c') {
        /* Not in Mc, so only the explicit exceptions count:
         * U+0E33 THAI CHARACTER SARA AM
         * U+0EB3 LAO VOWEL SIGN AM */
        return cp == 0x0E33 || cp == 0x0EB3;
    }

    /* In Mc: a spacing mark as long as it is not flagged as an extender. */
    break_class = MVM_unicode_codepoint_get_property_cstr(tc, cp,
        MVM_UNICODE_PROPERTY_GRAPHEME_CLUSTER_BREAK);
    return strcmp(break_class, "Extend") != 0;
}
Exemplo n.º 2
0
/* Reports whether a codepoint counts as a control character in the sense of
 * Unicode Standard Annex #29. Controls in the Latin-1 range are deliberately
 * not considered here, because a fast path has already dealt with them by
 * the time this is called. Returns non-zero for a control, zero otherwise. */
static MVMint32 is_control_beyond_latin1(MVMThreadContext *tc, MVMCodepoint in) {
    const char *category;

    /* U+200C ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER are never
     * treated as controls, whatever their general category says. */
    if (in == 0x200C || in == 0x200D)
        return 0;

    /* Everything else is decided from the two-letter general category. */
    category = MVM_unicode_codepoint_get_property_cstr(tc, in,
        MVM_UNICODE_PROPERTY_GENERAL_CATEGORY);
    switch (category[0]) {
        case 'Z':
            /* Of the separators, only Line_Separator (Zl) and
             * Paragraph_Separator (Zp) are controls. */
            return category[1] == 'l' || category[1] == 'p';
        case 'C':
            switch (category[1]) {
                case 'c':
                case 's':
                case 'f':
                    /* Control, Surrogate, and Format are controls. */
                    return 1;
                case 'n':
                    /* Unassigned counts only when the codepoint is a
                     * Default_Ignorable_Code_Point. */
                    return MVM_unicode_codepoint_get_property_int(tc, in,
                        MVM_UNICODE_PROPERTY_DEFAULT_IGNORABLE_CODE_POINT) != 0;
                default:
                    break;
            }
            break;
        default:
            break;
    }
    return 0;
}
Exemplo n.º 3
0
/* Looks up the canonical combining class of a codepoint as an integer.
 * Codepoints below MVM_NORMALIZE_FIRST_NONZERO_CCC are answered with 0
 * without consulting the property tables. Otherwise the property string is
 * parsed as a decimal number; a missing string, or one longer than three
 * characters, is reported as class 0. */
static MVMint64 ccc(MVMThreadContext *tc, MVMCodepoint cp) {
    const char *class_str;

    if (cp < MVM_NORMALIZE_FIRST_NONZERO_CCC)
        return 0;

    class_str = MVM_unicode_codepoint_get_property_cstr(tc, cp,
        MVM_UNICODE_PROPERTY_CANONICAL_COMBINING_CLASS);
    if (class_str == NULL || strlen(class_str) > 3)
        return 0;
    return atoi(class_str);
}
Exemplo n.º 4
0
/* Decides whether there is a break between two adjacent codepoints, a
 * followed by b. Returns 1 if they should be separated and 0 if they belong
 * together. The rules are tried in order: CR/LF, Hangul syllable
 * composition, regional indicator pairs, extenders / ZERO WIDTH JOINER, and
 * finally spacing marks. */
static MVMint32 should_break(MVMThreadContext *tc, MVMCodepoint a, MVMCodepoint b) {
    /* \r\n stays together; any other neighbour of \r is split from it. */
    if (a == 0x0D)
        return b != 0x0A;
    if (b == 0x0D)
        return 1;

    /* Hangul. The cheap range checks keep us away from the property lookup
     * for everything that cannot possibly be Hangul. */
    if (maybe_hangul(a) && maybe_hangul(b)) {
        const char *left  = MVM_unicode_codepoint_get_property_cstr(tc, a,
            MVM_UNICODE_PROPERTY_HANGUL_SYLLABLE_TYPE);
        const char *right = MVM_unicode_codepoint_get_property_cstr(tc, b,
            MVM_UNICODE_PROPERTY_HANGUL_SYLLABLE_TYPE);
        const int right_is_v = strcmp(right, "V") == 0;
        const int right_is_t = strcmp(right, "T") == 0;

        if (strcmp(left, "L") == 0) {
            /* L may be followed by L, V, LV or LVT without a break. */
            if (right_is_v || strcmp(right, "L") == 0 ||
                    strcmp(right, "LV") == 0 || strcmp(right, "LVT") == 0)
                return 0;
            return 1;
        }
        if (strcmp(left, "LV") == 0 || strcmp(left, "V") == 0) {
            /* LV or V may be followed by V or T without a break. */
            return !(right_is_v || right_is_t);
        }
        if (strcmp(left, "LVT") == 0 || strcmp(left, "T") == 0) {
            /* LVT or T may only be followed by T without a break. */
            return !right_is_t;
        }
        /* Any other syllable type on the left: fall through to the
         * remaining rules. */
    }

    /* A pair of regional indicators is kept together. */
    if (is_regional_indicator(a) && is_regional_indicator(b))
        return 0;

    /* Nothing breaks in front of ZERO WIDTH JOINER or an extender. */
    if (b == 0x200D || is_grapheme_extend(tc, b))
        return 0;

    /* Nothing breaks in front of a spacing mark either; everything that got
     * this far and is not one does break. (The Unicode version current when
     * this was implemented had no Prepend characters, so that rule is not
     * handled.) */
    return !is_spacing_mark(tc, b);
}
Exemplo n.º 5
0
/* Decomposes a codepoint and adds the result to the normalizer's buffer.
 *
 * The codepoint is added unchanged when no decomposition applies: its
 * decomposition type is unavailable or "None", or the normalization form
 * does not ask for compatibility decomposition and the type is anything
 * other than "Canonical". Otherwise its decomposition spec (a string of
 * hexadecimal codepoints) is walked and every entry is decomposed
 * recursively; a codepoint that needs decomposing but has no spec is handed
 * to the Hangul decomposition routine. */
static void decomp_codepoint_to_buffer(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint cp) {
    const char *type = MVM_unicode_codepoint_get_property_cstr(tc, cp, MVM_UNICODE_PROPERTY_DECOMPOSITION_TYPE);
    const char *spec;

    /* See if we actually need to decompose (can skip if the decomposition
     * type is None, or we're only doing Canonical decomposition and it is
     * anything except Canonical). */
    if (!type || strcmp(type, "None") == 0 ||
            (!MVM_NORMALIZE_COMPAT_DECOMP(n->form) && strcmp(type, "Canonical") != 0)) {
        /* Don't need to decompose; add it right into the buffer. */
        add_codepoint_to_buffer(tc, n, cp);
        return;
    }

    /* We need to decompose. Get the decomp spec and go over the things in
     * it; things without a decomp spec are presumably Hangul and need the
     * algorithmic treatment. */
    spec = MVM_unicode_codepoint_get_property_cstr(tc, cp, MVM_UNICODE_PROPERTY_DECOMP_SPEC);
    if (spec && spec[0]) {
        const char *end = spec + strlen(spec);
        while (spec < end) {
            /* Parse hex character code, and then recurse to do any further
             * decomposition on it; this recursion terminates when we find a
             * non-decomposable thing and add it to the buffer. */
            char *next;
            MVMCodepoint decomp_char = (MVMCodepoint)strtol(spec, &next, 16);

            /* strtol leaves the end pointer at the start when it could not
             * parse anything (for example trailing whitespace in the spec).
             * Stop then, rather than looping forever on the same position
             * while adding bogus zero codepoints. */
            if (next == spec)
                break;
            spec = next;
            decomp_codepoint_to_buffer(tc, n, decomp_char);
        }
    }
    else {
        decomp_hangul_to_buffer(tc, n, cp);
    }
}
Exemplo n.º 6
0
/* Maps a codepoint to the numeric value it has as a digit: 0..9 for decimal
 * digits and 10..35 for the letters a..z / A..Z (ASCII or fullwidth).
 * Returns -1 for anything that is not such a digit. */
static int cp_value(MVMThreadContext *tc, MVMCodepoint cp) {
    /* Fast paths that need no property lookup: ASCII digits and letters. */
    if (cp >= '0' && cp <= '9')
        return cp - '0';
    if (cp >= 'a' && cp <= 'z')
        return cp - 'a' + 10;
    if (cp >= 'A' && cp <= 'Z')
        return cp - 'A' + 10;

    /* Fullwidth Latin letters, uppercase and then lowercase. */
    if (cp >= 0xFF21 && cp <= 0xFF3A)
        return cp - 0xFF21 + 10;
    if (cp >= 0xFF41 && cp <= 0xFF5A)
        return cp - 0xFF41 + 10;

    /* Anything left has to be a positive codepoint whose Numeric_Type is
     * Decimal to count as a digit. */
    if (cp <= 0)
        return -1;
    if (MVM_unicode_codepoint_get_property_int(tc, cp, MVM_UNICODE_PROPERTY_NUMERIC_TYPE)
            != MVM_UNICODE_PVALUE_Numeric_Type_DECIMAL)
        return -1;

    /* As of Unicode 9.0.0, characters with the 'de' Numeric Type (which are
     * therefore also of General Category Nd, since 4.0.0) form contiguous
     * runs of 10 characters whose Numeric Values ascend from 0 through 9.
     *
     * The NUMERIC_VALUE_NUMERATOR string holds an integer, and using the
     * numerator alone is enough because all of these have a value in 0-9
     * with a denominator of 1. */
    return fast_atoi(MVM_unicode_codepoint_get_property_cstr(tc, cp, MVM_UNICODE_PROPERTY_NUMERIC_VALUE_NUMERATOR));
}
Exemplo n.º 7
0
/* Parses a run of digits in the given radix out of str, starting at grapheme
 * index offset, and collects the outcome in a freshly allocated array of the
 * current HLL's slurpy array type.
 *
 * Bits of flag, as used below:
 *   0x01 - negate the resulting value
 *   0x02 - accept a single leading '+' or '-' sign
 *   0x04 - a zero digit does not update the reported value and base on its
 *          own, so trailing zeroes are left out of both
 *
 * Three boxed integers are pushed onto the array, in this order:
 *   value - the number accumulated from the digits (negated if a '-' sign
 *           was consumed or flag bit 0x01 is set)
 *   base  - radix multiplied by itself once per digit counted in value
 *   pos   - the offset just past the last digit consumed, or -1 if no digit
 *           was consumed
 *
 * Throws an ad-hoc exception if radix is greater than 36. */
MVMObject * MVM_radix(MVMThreadContext *tc, MVMint64 radix, MVMString *str, MVMint64 offset, MVMint64 flag) {
    MVMObject *result;
    /* zvalue / zbase always track every digit consumed so far; value / base
     * are the reported figures, which may lag behind when flag bit 0x04 is
     * set and the most recent digits were zeroes. */
    MVMint64 zvalue = 0;
    MVMint64 zbase  = 1;
    MVMint64 chars  = MVM_string_graphs(tc, str);
    MVMint64 value  = zvalue;
    MVMint64 base   = zbase;
    MVMint64   pos  = -1;
    MVMuint16  neg  = 0;
    MVMint64   ch;

    if (radix > 36) {
        MVM_exception_throw_adhoc(tc, "Cannot convert radix of %"PRId64" (max 36)", radix);
    }

    /* Fetch the first grapheme, using 0 as a stand-in when offset is already
     * at or past the end of the string. */
    ch = (offset < chars) ? MVM_string_get_grapheme_at_nocheck(tc, str, offset) : 0;
    /* Consume an optional sign, but only when flag bit 0x02 allows it. */
    if ((flag & 0x02) && (ch == '+' || ch == '-')) {
        neg = (ch == '-');
        offset++;
        ch = (offset < chars) ? MVM_string_get_grapheme_at_nocheck(tc, str, offset) : 0;
    }

    while (offset < chars) {
        /* Turn the current grapheme into its digit value, or stop at the
         * first one that is not a digit at all. */
        if (ch >= '0' && ch <= '9') ch = ch - '0'; /* fast-path for ASCII 0..9 */
        else if (ch >= 'a' && ch <= 'z') ch = ch - 'a' + 10;
        else if (ch >= 'A' && ch <= 'Z') ch = ch - 'A' + 10;
        else if (ch >= 0xFF21 && ch <= 0xFF3A) ch = ch - 0xFF21 + 10; /* uppercase fullwidth */
        else if (ch >= 0xFF41 && ch <= 0xFF5A) ch = ch - 0xFF41 + 10; /* lowercase fullwidth */
        else if (ch > 0 && MVM_unicode_codepoint_has_property_value(tc, ch, MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, 
                MVM_unicode_cname_to_property_value_code(tc, MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Nd")))) {
            /* As of Unicode 6.0.0, we know that Nd category numerals are within
             * the range 0..9
             */

            /* the string returned for NUMERIC_VALUE contains a floating point
             * value, so atoi will stop on the . in the string. This is fine
             * though, since we'd have to truncate the float regardless.
             */
            ch = atoi(MVM_unicode_codepoint_get_property_cstr(tc, ch, MVM_UNICODE_PROPERTY_NUMERIC_VALUE));
        }
        else break;
        /* A digit that is too large for this radix also ends the number. */
        if (ch >= radix) break;
        zvalue = zvalue * radix + ch;
        zbase = zbase * radix;
        offset++; pos = offset;
        /* With flag bit 0x04 set, a zero digit is held back: value and base
         * only catch up once a non-zero digit follows. */
        if (ch != 0 || !(flag & 0x04)) { value=zvalue; base=zbase; }
        if (offset >= chars) break;
        ch = MVM_string_get_grapheme_at_nocheck(tc, str, offset);
        /* A single '_' between digits is skipped. pos is not advanced here,
         * so an underscore that is not followed by a digit is not counted as
         * consumed. */
        if (ch != '_') continue;
        offset++;
        if (offset >= chars) break;
        ch = MVM_string_get_grapheme_at_nocheck(tc, str, offset);
    }

    if (neg || flag & 0x01) { value = -value; }

    /* initialize the object */
    result = MVM_repr_alloc_init(tc, MVM_hll_current(tc)->slurpy_array_type);
    /* result and box_type are rooted with MVMROOT around the boxing calls
     * below. */
    MVMROOT(tc, result, {
        MVMObject *box_type = MVM_hll_current(tc)->int_box_type;
        MVMROOT(tc, box_type, {
            MVMObject *boxed = MVM_repr_box_int(tc, box_type, value);
            MVM_repr_push_o(tc, result, boxed);
            boxed = MVM_repr_box_int(tc, box_type, base);
            MVM_repr_push_o(tc, result, boxed);
            boxed = MVM_repr_box_int(tc, box_type, pos);
            MVM_repr_push_o(tc, result, boxed);
        });
    });
Exemplo n.º 8
0
/* Looks up the canonical combining class of a codepoint as an integer. The
 * property string is parsed as a decimal number; a missing string, or one
 * longer than three characters, is reported as class 0. */
static MVMint64 ccc(MVMThreadContext *tc, MVMCodepoint cp) {
    const char *class_str = MVM_unicode_codepoint_get_property_cstr(tc, cp,
        MVM_UNICODE_PROPERTY_CANONICAL_COMBINING_CLASS);

    if (class_str && strlen(class_str) <= 3)
        return atoi(class_str);
    return 0;
}
Exemplo n.º 9
0
/* Tells whether a codepoint answers "yes" on the quick check property that
 * the normalizer is configured with (n->quick_check_property). Returns 1 if
 * the property string starts with 'Y', and 0 if it does not or if no
 * property string is available. */
static MVMint64 passes_quickcheck(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint cp) {
    const char *answer = MVM_unicode_codepoint_get_property_cstr(tc, cp, n->quick_check_property);

    if (answer == NULL)
        return 0;
    return answer[0] == 'Y';
}
Exemplo n.º 10
0
/* Returns non-zero if the codepoint both answers "Y" on the NFG quick check
 * and has a canonical combining class of zero, and zero otherwise.
 *
 * The combining class is read as a property string. It is taken to be zero
 * when the string is missing, when it is longer than three characters, or
 * when it is exactly "0". */
static MVMint32 passes_quickcheck_and_zero_ccc(MVMThreadContext *tc, MVMCodepoint cp) {
    const char *qc_str  = MVM_unicode_codepoint_get_property_cstr(tc, cp, MVM_UNICODE_PROPERTY_NFG_QC);
    const char *ccc_str = MVM_unicode_codepoint_get_property_cstr(tc, cp, MVM_UNICODE_PROPERTY_CANONICAL_COMBINING_CLASS);

    if (!qc_str || qc_str[0] != 'Y')
        return 0;
    if (!ccc_str || strlen(ccc_str) > 3)
        return 1;

    /* Compare against the digit character '0'. Comparing the first character
     * of a one-character string against NUL can never succeed, which would
     * leave an explicit class of "0" unrecognised. */
    return ccc_str[0] == '0' && ccc_str[1] == '\0';
}