Exemple #1
0
/* Decides whether a codepoint counts as a control character in the sense of
 * Unicode Standard Annex #29. Latin-1 controls are not considered here; the
 * caller is expected to have dealt with them on a fast path already.
 * Returns non-zero for a control, zero otherwise. */
static MVMint32 is_control_beyond_latin1(MVMThreadContext *tc, MVMCodepoint in) {
    const char *category;

    /* ZERO WIDTH NON-JOINER (U+200C) and ZERO WIDTH JOINER (U+200D) are
     * explicitly never controls. */
    if (in == 0x200C || in == 0x200D)
        return 0;

    /* Everything else is decided from the two-letter General_Category. */
    category = MVM_unicode_codepoint_get_property_cstr(tc, in,
        MVM_UNICODE_PROPERTY_GENERAL_CATEGORY);

    switch (category[0]) {
        case 'Z':
            /* Of the separators, only Zl (Line_Separator) and Zp
             * (Paragraph_Separator) qualify. */
            return category[1] == 'l' || category[1] == 'p';

        case 'C':
            switch (category[1]) {
                /* Cc (Control), Cs (Surrogate) and Cf (Format) always
                 * qualify. */
                case 'c':
                case 's':
                case 'f':
                    return 1;

                /* Cn (Unassigned) qualifies only when the codepoint is
                 * also Default_Ignorable_Code_Point. */
                case 'n':
                    return MVM_unicode_codepoint_get_property_int(tc, in,
                        MVM_UNICODE_PROPERTY_DEFAULT_IGNORABLE_CODE_POINT) != 0;

                default:
                    return 0;
            }

        default:
            return 0;
    }
}
Exemple #2
0
/* Maps a codepoint to its digit value: 0..9 for decimal digits, 10..35 for
 * the letters a..z / A..Z (ASCII or fullwidth). Returns -1 when the codepoint
 * has no digit value. */
static int cp_value(MVMThreadContext *tc, MVMCodepoint cp) {
    /* ASCII fast paths. */
    if ('0' <= cp && cp <= '9')
        return cp - '0';
    if ('a' <= cp && cp <= 'z')
        return cp - 'a' + 10;
    if ('A' <= cp && cp <= 'Z')
        return cp - 'A' + 10;

    /* Fullwidth Latin letters, uppercase then lowercase. */
    if (0xFF21 <= cp && cp <= 0xFF3A)
        return cp - 0xFF21 + 10;
    if (0xFF41 <= cp && cp <= 0xFF5A)
        return cp - 0xFF41 + 10;

    /* Only positive codepoints are looked up in the property tables. */
    if (cp <= 0)
        return -1;

    /* Anything that is not Numeric_Type=Decimal has no digit value. */
    if (MVM_unicode_codepoint_get_property_int(tc, cp, MVM_UNICODE_PROPERTY_NUMERIC_TYPE)
            != MVM_UNICODE_PVALUE_Numeric_Type_DECIMAL)
        return -1;

    /* As of Unicode 9.0.0, codepoints with the 'de' Numeric Type (which are
     * also General Category Nd since 4.0.0) come in contiguous runs of ten
     * whose Numeric Values ascend from 0 through 9. Their denominator is
     * always 1, so the numerator string alone holds the integer value. */
    return fast_atoi(MVM_unicode_codepoint_get_property_cstr(tc, cp,
        MVM_UNICODE_PROPERTY_NUMERIC_VALUE_NUMERATOR));
}
Exemple #3
0
/* Looks up the MVM_COLLATION_QC property value for the given codepoint and
 * returns it. */
MVMint32 MVM_unicode_collation_quickcheck(MVMThreadContext *tc, MVMint32 codepoint) {
    const MVMint32 collation_qc = MVM_unicode_codepoint_get_property_int(tc, codepoint,
        MVM_UNICODE_PROPERTY_MVM_COLLATION_QC);
    return collation_qc;
}
Exemple #4
0
/* Looks up the MVM_COLLATION_TERTIARY property value for the given codepoint
 * and returns it. */
MVMint32 MVM_unicode_collation_tertiary(MVMThreadContext *tc, MVMint32 codepoint) {
    const MVMint32 tertiary = MVM_unicode_codepoint_get_property_int(tc, codepoint,
        MVM_UNICODE_PROPERTY_MVM_COLLATION_TERTIARY);
    return tertiary;
}
Exemple #5
0
/* Tells whether the codepoint's Grapheme_Cluster_Break property value is
 * Control. Returns 1 if so, 0 otherwise. */
MVM_STATIC_INLINE MVMint32 codepoint_GCB_Control(MVMThreadContext *tc, MVMCodepoint codepoint) {
    const MVMint64 gcb = MVM_unicode_codepoint_get_property_int(tc, codepoint,
        MVM_UNICODE_PROPERTY_GRAPHEME_CLUSTER_BREAK);
    return gcb == MVM_UNICODE_PVALUE_GCB_CONTROL;
}
Exemple #6
0
/* Returns 1 when the codepoint has a non-zero NFG_QC property value and its
 * Canonical_Combining_Class is no greater than CCC_0; returns 0 otherwise. */
MVM_STATIC_INLINE MVMint32 passes_quickcheck_and_zero_ccc(MVMThreadContext *tc, MVMCodepoint cp) {
    /* Failing the quick check settles it; the combining class is then not
     * looked up at all. */
    if (!MVM_unicode_codepoint_get_property_int(tc, cp, MVM_UNICODE_PROPERTY_NFG_QC))
        return 0;

    return MVM_unicode_codepoint_get_property_int(tc, cp,
        MVM_UNICODE_PROPERTY_CANONICAL_COMBINING_CLASS) <= MVM_UNICODE_PVALUE_CCC_0;
}
Exemple #7
0
/* Registers a new synthetic for the given codepoint sequence and returns its
 * (negative) grapheme ID.
 *
 * Assumes that we are holding the lock that serializes updates, and already
 * checked that the synthetic does not exist. Adds it to the lookup trie and
 * synthetics table, making sure to do enough copy/free-at-safe-point work to
 * not upset other threads possibly doing concurrent reads.
 *
 * Parameters:
 *   tc        - the current thread context; the NFG state and fixed size
 *               allocator are reached through tc->instance.
 *   codes     - the codepoints making up the synthetic; they are copied, so
 *               the caller keeps ownership of this buffer.
 *   num_codes - number of entries in codes.
 *               NOTE(review): codes[0] is read whenever utf8_c8 is zero, so
 *               this presumably expects num_codes >= 1 - confirm against
 *               callers.
 *   utf8_c8   - stored in the synthetic's is_utf8_c8 field; when non-zero the
 *               search for a base codepoint is skipped and base_index is 0.
 *
 * Returns the negated post-increment synthetic count, i.e. the first
 * synthetic ever added gets -1, the next -2, and so on. */
static MVMGrapheme32 add_synthetic(MVMThreadContext *tc, MVMCodepoint *codes, MVMint32 num_codes, MVMint32 utf8_c8) {
    MVMNFGState     *nfg = tc->instance->nfg;
    MVMNFGSynthetic *synth;
    MVMGrapheme32    result;

    /* Grow the synthetics table if needed. The table is enlarged in steps of
     * MVM_SYNTHETIC_GROW_ELEMS, so it is full exactly when the current count
     * is a multiple of that step (which includes the empty, count == 0,
     * case). */
    if (nfg->num_synthetics % MVM_SYNTHETIC_GROW_ELEMS == 0) {
        size_t orig_size = nfg->num_synthetics * sizeof(MVMNFGSynthetic);
        size_t new_size  = (nfg->num_synthetics + MVM_SYNTHETIC_GROW_ELEMS) * sizeof(MVMNFGSynthetic);
        MVMNFGSynthetic *new_synthetics = MVM_fixed_size_alloc(tc, tc->instance->fsa, new_size);
        if (orig_size) {
            /* Carry the existing entries over, and hand the old table to the
             * free-at-safepoint routine rather than releasing it right away,
             * since other threads may still be reading from it. */
            memcpy(new_synthetics, nfg->synthetics, orig_size);
            MVM_fixed_size_free_at_safepoint(tc, tc->instance->fsa, orig_size, nfg->synthetics);
        }
        nfg->synthetics = new_synthetics;
    }

    /* Set up the new synthetic entry. It lives in the slot just past the
     * current count, which is not bumped until the entry is complete. */
    synth            = &(nfg->synthetics[nfg->num_synthetics]);
    synth->num_codes = num_codes;
    /* Find which codepoint is the base codepoint. It is always index 0 unless
     * there are Prepend codepoints */
    if (!utf8_c8 && MVM_unicode_codepoint_get_property_int(tc, codes[0], MVM_UNICODE_PROPERTY_GRAPHEME_CLUSTER_BREAK)
        == MVM_UNICODE_PVALUE_GCB_PREPEND) {
        /* codes[0] is already known to be Prepend, so start scanning at
         * index 1 with it as the cached codepoint. */
        MVMint64 i = 0;
        MVMCodepoint cached = codes[i++];
        MVMint64 cached_GCB = MVM_UNICODE_PVALUE_GCB_PREPEND;
        while (i < num_codes) {
            /* If it's the same codepoint as before, don't need to request
             * the property value again. Otherwise the condition itself
             * updates both cached and cached_GCB as a side effect of the
             * lookup. The body is intentionally empty: a Prepend codepoint
             * just means "keep scanning". */
            if (cached == codes[i] || MVM_UNICODE_PVALUE_GCB_PREPEND ==
                (cached_GCB = MVM_unicode_codepoint_get_property_int(tc, (cached = codes[i]),
                    MVM_UNICODE_PROPERTY_GRAPHEME_CLUSTER_BREAK))) {
            }
            else {
                /* First non-Prepend codepoint found; i is its index. */
                /* If we see an Extend then this is a degenerate without any
                 * base character, so set i to num_codes so base_index gets set
                 * to 0 */
                if (cached_GCB == MVM_UNICODE_PVALUE_GCB_EXTEND)
                    i = num_codes;
                break;
            }
            i++;
        }
        /* If all the codepoints were prepend then we need to set it to 0.
         * The same i == num_codes state is also produced above for the
         * Prepend-followed-by-Extend case. */
        synth->base_index = num_codes == i ? 0 : i;

    }
    else {
        synth->base_index = 0;
    }


    /* Take a private copy of the codepoints for the synthetic, and start
     * with all case-change fields zeroed. */
    synth->codes     = MVM_fixed_size_alloc(tc, tc->instance->fsa,
        num_codes * sizeof(MVMCodepoint));
    memcpy(synth->codes, codes, (synth->num_codes * sizeof(MVMCodepoint)));
    synth->case_uc    = 0;
    synth->case_lc    = 0;
    synth->case_tc    = 0;
    synth->case_fc    = 0;
    synth->is_utf8_c8 = utf8_c8;

    /* Memory barrier to make sure the synthetic is fully in place before we
     * bump the count. */
    MVM_barrier();
    nfg->num_synthetics++;

    /* Give the synthetic an ID by negating the new number of synthetics. */
    result = -(nfg->num_synthetics);

    /* Make an entry in the lookup trie for the new synthetic, so we can use
     * it in the future when seeing the same codepoint sequence. */
    add_synthetic_to_trie(tc, codes, num_codes, result);

    return result;
}
Exemple #8
0
/* Looks up the Grapheme_Extend property value for the given codepoint and
 * returns it. */
static MVMint32 is_grapheme_extend(MVMThreadContext *tc, MVMCodepoint cp) {
    const MVMint32 grapheme_extend = MVM_unicode_codepoint_get_property_int(tc, cp,
        MVM_UNICODE_PROPERTY_GRAPHEME_EXTEND);
    return grapheme_extend;
}