/* Decides whether a codepoint counts as a control character in the sense of
 * Unicode Standard Annex #29. Controls in the Latin-1 range are not
 * considered here; the caller is expected to have dealt with those in a
 * fast path already. Returns non-zero for a control, zero otherwise. */
static MVMint32 is_control_beyond_latin1(MVMThreadContext *tc, MVMCodepoint in) {
    const char *category;

    /* U+200C ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER are never
     * treated as controls, so bail out before any property lookup. */
    if (in == 0x200C || in == 0x200D)
        return 0;

    /* Everything else is decided by the two-letter General_Category. */
    category = MVM_unicode_codepoint_get_property_cstr(tc, in,
        MVM_UNICODE_PROPERTY_GENERAL_CATEGORY);

    switch (category[0]) {
        case 'Z':
            /* Of the separators, only Line_Separator (Zl) and
             * Paragraph_Separator (Zp) are controls. */
            return category[1] == 'l' || category[1] == 'p';
        case 'C':
            switch (category[1]) {
                case 'c': /* Control */
                case 's': /* Surrogate */
                case 'f': /* Format */
                    return 1;
                case 'n':
                    /* Unassigned counts only when the codepoint is also a
                     * Default_Ignorable_Code_Point. */
                    return MVM_unicode_codepoint_get_property_int(tc, in,
                        MVM_UNICODE_PROPERTY_DEFAULT_IGNORABLE_CODE_POINT) != 0;
                default:
                    break;
            }
            break;
        default:
            break;
    }

    return 0;
}
/* Maps a codepoint to its digit value. ASCII and fullwidth Latin letters
 * map to 10 and up ('a'/'A' is 10, 'b'/'B' is 11, ...); decimal digits of
 * any script map to 0..9. Returns -1 when the codepoint has no digit
 * value by these rules. */
static int cp_value(MVMThreadContext *tc, MVMCodepoint cp) {
    /* Fast path: ASCII 0..9. */
    if ('0' <= cp && cp <= '9')
        return cp - '0';

    /* ASCII letters, lowercase then uppercase. */
    if ('a' <= cp && cp <= 'z')
        return cp - 'a' + 10;
    if ('A' <= cp && cp <= 'Z')
        return cp - 'A' + 10;

    /* Fullwidth letters, uppercase then lowercase. */
    if (0xFF21 <= cp && cp <= 0xFF3A)
        return cp - 0xFF21 + 10;
    if (0xFF41 <= cp && cp <= 0xFF5A)
        return cp - 0xFF41 + 10;

    /* Zero and negative values are never looked up in the database. */
    if (cp <= 0)
        return -1;

    /* Anything that is not Numeric_Type=Decimal has no digit value. */
    if (MVM_unicode_codepoint_get_property_int(tc, cp, MVM_UNICODE_PROPERTY_NUMERIC_TYPE)
            != MVM_UNICODE_PVALUE_Numeric_Type_DECIMAL)
        return -1;

    /* As of Unicode 9.0.0, characters with the 'de' Numeric Type (which are
     * therefore also of General Category Nd, since 4.0.0) come in
     * contiguous runs of 10 whose Numeric Values ascend from 0 through 9.
     * The NUMERIC_VALUE_NUMERATOR string holds an integer, and since all
     * of these have a denominator of 1 the numerator alone is the value. */
    return fast_atoi(MVM_unicode_codepoint_get_property_cstr(tc, cp,
        MVM_UNICODE_PROPERTY_NUMERIC_VALUE_NUMERATOR));
}
/* Looks up the MVM_COLLATION_QC integer property of the given codepoint in
 * the Unicode database and hands it back unchanged. */
MVMint32 MVM_unicode_collation_quickcheck(MVMThreadContext *tc, MVMint32 codepoint) {
    const MVMint32 quickcheck = MVM_unicode_codepoint_get_property_int(tc, codepoint,
        MVM_UNICODE_PROPERTY_MVM_COLLATION_QC);
    return quickcheck;
}
/* Looks up the MVM_COLLATION_TERTIARY integer property of the given
 * codepoint in the Unicode database and hands it back unchanged. */
MVMint32 MVM_unicode_collation_tertiary(MVMThreadContext *tc, MVMint32 codepoint) {
    const MVMint32 tertiary = MVM_unicode_codepoint_get_property_int(tc, codepoint,
        MVM_UNICODE_PROPERTY_MVM_COLLATION_TERTIARY);
    return tertiary;
}
/* Tests whether a codepoint's Grapheme_Cluster_Break property is Control.
 * Yields 1 when it is, 0 for every other property value. */
MVM_STATIC_INLINE MVMint32 codepoint_GCB_Control(MVMThreadContext *tc, MVMCodepoint codepoint) {
    if (MVM_unicode_codepoint_get_property_int(tc, codepoint,
            MVM_UNICODE_PROPERTY_GRAPHEME_CLUSTER_BREAK) != MVM_UNICODE_PVALUE_GCB_CONTROL)
        return 0;
    return 1;
}
/* Yields 1 only when both conditions hold for the codepoint: its NFG_QC
 * property is non-zero, and its Canonical_Combining_Class does not exceed
 * the CCC_0 property value. Yields 0 otherwise. */
MVM_STATIC_INLINE MVMint32 passes_quickcheck_and_zero_ccc(MVMThreadContext *tc, MVMCodepoint cp) {
    /* A failed quick check settles it; the combining class is then never
     * looked up. */
    if (!MVM_unicode_codepoint_get_property_int(tc, cp, MVM_UNICODE_PROPERTY_NFG_QC))
        return 0;

    return MVM_unicode_codepoint_get_property_int(tc, cp,
        MVM_UNICODE_PROPERTY_CANONICAL_COMBINING_CLASS) <= MVM_UNICODE_PVALUE_CCC_0;
}
/* Assumes that we are holding the lock that serializes updates, and already
 * checked that the synthetic does not exist. Adds it to the lookup trie and
 * synthetics table, making sure to do enough copy/free-at-safe-point work to
 * not upset other threads possibly doing concurrent reads.
 *
 * tc        - current thread context; the NFG state is reached through
 *             tc->instance->nfg.
 * codes     - the codepoints making up the synthetic; they are copied into
 *             storage owned by the new table entry.
 * num_codes - number of entries in codes. codes[0] is read whenever utf8_c8
 *             is zero, so this presumably must be at least 1 (TODO confirm
 *             against callers).
 * utf8_c8   - stored in the entry's is_utf8_c8 field; when non-zero the
 *             Prepend scan is skipped and base_index is 0.
 *
 * Returns the ID of the new synthetic, which is the negated synthetic count
 * after the addition (so the first synthetic ever added gets -1). */
static MVMGrapheme32 add_synthetic(MVMThreadContext *tc, MVMCodepoint *codes, MVMint32 num_codes, MVMint32 utf8_c8) {
    MVMNFGState *nfg = tc->instance->nfg;
    MVMNFGSynthetic *synth;
    MVMGrapheme32 result;

    /* Grow the synthetics table if needed. The table is extended by
     * MVM_SYNTHETIC_GROW_ELEMS entries every time the count reaches a
     * multiple of that constant (including 0, i.e. the first allocation). */
    if (nfg->num_synthetics % MVM_SYNTHETIC_GROW_ELEMS == 0) {
        size_t orig_size = nfg->num_synthetics * sizeof(MVMNFGSynthetic);
        size_t new_size = (nfg->num_synthetics + MVM_SYNTHETIC_GROW_ELEMS) * sizeof(MVMNFGSynthetic);
        MVMNFGSynthetic *new_synthetics = MVM_fixed_size_alloc(tc, tc->instance->fsa, new_size);
        if (orig_size) {
            /* Copy the existing entries over and release the old table only
             * at a safepoint, rather than freeing it immediately. */
            memcpy(new_synthetics, nfg->synthetics, orig_size);
            MVM_fixed_size_free_at_safepoint(tc, tc->instance->fsa, orig_size, nfg->synthetics);
        }
        nfg->synthetics = new_synthetics;
    }

    /* Set up the new synthetic entry. It lives in the slot just past the
     * current count, which is not bumped until the entry is complete. */
    synth = &(nfg->synthetics[nfg->num_synthetics]);
    synth->num_codes = num_codes;

    /* Find which codepoint is the base codepoint. It is always index 0 unless
     * there are Prepend codepoints */
    if (!utf8_c8 && MVM_unicode_codepoint_get_property_int(tc, codes[0], MVM_UNICODE_PROPERTY_GRAPHEME_CLUSTER_BREAK) == MVM_UNICODE_PVALUE_GCB_PREPEND) {
        /* codes[0] is known to be Prepend, so scanning starts at index 1.
         * cached is the last codepoint whose property was looked up; while
         * the loop keeps running it is always a Prepend. */
        MVMint64 i = 0;
        MVMCodepoint cached = codes[i++];
        MVMint64 cached_GCB = MVM_UNICODE_PVALUE_GCB_PREPEND;
        while (i < num_codes) {
            /* If it's the same codepoint as before, don't need to request
             * the property value again. Otherwise the lookup inside the
             * condition updates both cached and cached_GCB. The body is
             * intentionally empty: a Prepend just means "keep scanning". */
            if (cached == codes[i] || MVM_UNICODE_PVALUE_GCB_PREPEND == (cached_GCB = MVM_unicode_codepoint_get_property_int(tc, (cached = codes[i]), MVM_UNICODE_PROPERTY_GRAPHEME_CLUSTER_BREAK))) {
            }
            else {
                /* First non-Prepend codepoint found; i is its index.
                 * If we see an Extend then this is a degenerate without any
                 * base character, so set i to num_codes so base_index gets set
                 * to 0 */
                if (cached_GCB == MVM_UNICODE_PVALUE_GCB_EXTEND)
                    i = num_codes;
                break;
            }
            i++;
        }
        /* If all the codepoints were prepend then we need to set it to 0 */
        synth->base_index = num_codes == i ? 0 : i;
    }
    else {
        synth->base_index = 0;
    }

    /* Give the entry its own copy of the codepoints and clear the four
     * case_* fields (uc, lc, tc, fc) to 0. */
    synth->codes = MVM_fixed_size_alloc(tc, tc->instance->fsa, num_codes * sizeof(MVMCodepoint));
    memcpy(synth->codes, codes, (synth->num_codes * sizeof(MVMCodepoint)));
    synth->case_uc = 0;
    synth->case_lc = 0;
    synth->case_tc = 0;
    synth->case_fc = 0;
    synth->is_utf8_c8 = utf8_c8;

    /* Memory barrier to make sure the synthetic is fully in place before we
     * bump the count. */
    MVM_barrier();
    nfg->num_synthetics++;

    /* Give the synthetic an ID by negating the new number of synthetics. */
    result = -(nfg->num_synthetics);

    /* Make an entry in the lookup trie for the new synthetic, so we can use
     * it in the future when seeing the same codepoint sequence. */
    add_synthetic_to_trie(tc, codes, num_codes, result);

    return result;
}
/* Looks up the Grapheme_Extend integer property of a codepoint in the
 * Unicode database and hands it back unchanged. */
static MVMint32 is_grapheme_extend(MVMThreadContext *tc, MVMCodepoint cp) {
    const MVMint32 extend = MVM_unicode_codepoint_get_property_int(tc, cp,
        MVM_UNICODE_PROPERTY_GRAPHEME_EXTEND);
    return extend;
}