/* Reports whether a codepoint is treated as a spacing mark: non-zero if it
 * is, zero otherwise. */
static MVMint32 is_spacing_mark(MVMThreadContext *tc, MVMCodepoint cp) {
    const char *category = MVM_unicode_codepoint_get_property_cstr(tc, cp,
        MVM_UNICODE_PROPERTY_GENERAL_CATEGORY);
    const char *break_class;

    if (category[0] != 'M' || category[1] != 'c') {
        /* Outside of Mc only two codepoints qualify:
         *   U+0E33 THAI CHARACTER SARA AM
         *   U+0EB3 LAO VOWEL SIGN AM */
        return cp == 0x0E33 || cp == 0x0EB3;
    }

    /* Inside Mc, everything qualifies except codepoints whose
     * Grapheme_Cluster_Break value is "Extend". */
    break_class = MVM_unicode_codepoint_get_property_cstr(tc, cp,
        MVM_UNICODE_PROPERTY_GRAPHEME_CLUSTER_BREAK);
    return strcmp(break_class, "Extend") != 0;
}
/* Decides whether a codepoint is a control character in the sense of Unicode
 * Standard Annex #29. Controls in the Latin-1 range are not considered here;
 * the caller is expected to have dealt with those on a fast path already. */
static MVMint32 is_control_beyond_latin1(MVMThreadContext *tc, MVMCodepoint in) {
    const char *category;

    /* U+200C ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER are never
     * treated as controls. */
    if (in == 0x200C || in == 0x200D)
        return 0;

    /* Everything else is decided by the general category. */
    category = MVM_unicode_codepoint_get_property_cstr(tc, in,
        MVM_UNICODE_PROPERTY_GENERAL_CATEGORY);

    switch (category[0]) {
        case 'Z':
            /* Of the separators, only Line_Separator (Zl) and
             * Paragraph_Separator (Zp) are controls. */
            return category[1] == 'l' || category[1] == 'p';
        case 'C':
            switch (category[1]) {
                case 'c': /* Control */
                case 's': /* Surrogate */
                case 'f': /* Format */
                    return 1;
                case 'n':
                    /* Unassigned counts only when it is also a
                     * Default_Ignorable_Code_Point. */
                    return MVM_unicode_codepoint_get_property_int(tc, in,
                        MVM_UNICODE_PROPERTY_DEFAULT_IGNORABLE_CODE_POINT) != 0;
                default:
                    break;
            }
            break;
        default:
            break;
    }

    return 0;
}
/* Looks up the canonical combining class of a codepoint and returns it as an
 * integer. */
static MVMint64 ccc(MVMThreadContext *tc, MVMCodepoint cp) {
    const char *class_str;

    /* Codepoints below MVM_NORMALIZE_FIRST_NONZERO_CCC are answered with 0
     * straight away, without a property lookup. */
    if (cp < MVM_NORMALIZE_FIRST_NONZERO_CCC)
        return 0;

    class_str = MVM_unicode_codepoint_get_property_cstr(tc, cp,
        MVM_UNICODE_PROPERTY_CANONICAL_COMBINING_CLASS);

    /* A missing value, or one longer than three characters, counts as 0;
     * anything else is parsed as a decimal number. */
    if (!class_str)
        return 0;
    if (strlen(class_str) > 3)
        return 0;
    return atoi(class_str);
}
/* Decides whether a grapheme boundary falls between two adjacent codepoints,
 * a followed by b. Returns 1 to break between them and 0 to keep them
 * together. */
static MVMint32 should_break(MVMThreadContext *tc, MVMCodepoint a, MVMCodepoint b) {
    /* \r\n stays together; \r next to anything else is always a boundary. */
    if (a == 0x0D)
        return b != 0x0A;
    if (b == 0x0D)
        return 1;

    /* Hangul. Cheap range checks first, so the property lookups only happen
     * when both sides could possibly be Hangul. */
    if (maybe_hangul(a) && maybe_hangul(b)) {
        const char *left  = MVM_unicode_codepoint_get_property_cstr(tc, a,
            MVM_UNICODE_PROPERTY_HANGUL_SYLLABLE_TYPE);
        const char *right = MVM_unicode_codepoint_get_property_cstr(tc, b,
            MVM_UNICODE_PROPERTY_HANGUL_SYLLABLE_TYPE);
        int right_is_v = strcmp(right, "V") == 0;
        int right_is_t = strcmp(right, "T") == 0;

        if (strcmp(left, "L") == 0) {
            /* L may be followed by L, V, LV or LVT. */
            if (right_is_v
                    || strcmp(right, "L") == 0
                    || strcmp(right, "LV") == 0
                    || strcmp(right, "LVT") == 0)
                return 0;
            return 1;
        }
        if (strcmp(left, "LV") == 0 || strcmp(left, "V") == 0) {
            /* LV and V may be followed by V or T. */
            return !(right_is_v || right_is_t);
        }
        if (strcmp(left, "LVT") == 0 || strcmp(left, "T") == 0) {
            /* LVT and T may only be followed by T. */
            return !right_is_t;
        }
        /* Any other syllable type on the left falls through to the general
         * rules below. */
    }

    /* A pair of regional indicators is kept together. */
    if (is_regional_indicator(a) && is_regional_indicator(b))
        return 0;

    /* No boundary in front of ZERO WIDTH JOINER or an extender. */
    if (b == 0x200D || is_grapheme_extend(tc, b))
        return 0;

    /* No boundary in front of a spacing mark; everything else breaks. (The
     * Prepend rule is not handled: in the Unicode version at the time of
     * implementing there were no Prepend characters.) */
    return !is_spacing_mark(tc, b);
}
/* Decompose the codepoint and add it into the buffer.
 *
 * If the codepoint needs no decomposition under the normalizer's form it is
 * appended as-is. Otherwise each codepoint listed in its decomposition spec
 * is decomposed recursively; a codepoint that needs decomposing but has no
 * spec is handed to the Hangul decomposition routine. */
static void decomp_codepoint_to_buffer(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint cp) {
    /* See if we actually need to decompose (can skip if the decomposition
     * type is None, or we're only doing Canonical decomposition and it is
     * anything except Canonical). */
    const char *type = MVM_unicode_codepoint_get_property_cstr(tc, cp,
        MVM_UNICODE_PROPERTY_DECOMPOSITION_TYPE);
    MVMint64 decompose = 1;
    if (!type)
        decompose = 0;
    else if (strcmp(type, "None") == 0)
        decompose = 0;
    else if (!MVM_NORMALIZE_COMPAT_DECOMP(n->form) && strcmp(type, "Canonical") != 0)
        decompose = 0;

    if (decompose) {
        /* We need to decompose. Get the decomp spec and go over the things in
         * it; things without a decomp spec are presumably Hangul and need the
         * algorithmic treatment. */
        const char *spec = MVM_unicode_codepoint_get_property_cstr(tc, cp,
            MVM_UNICODE_PROPERTY_DECOMP_SPEC);
        if (spec && spec[0]) {
            const char *end = spec + strlen(spec);
            while (spec < end) {
                /* Parse hex character code, and then recurse to do any further
                 * decomposition on it; this recursion terminates when we find a
                 * non-decomposable thing and add it to the buffer. */
                char *next;
                MVMCodepoint decomp_char = (MVMCodepoint)strtol(spec, &next, 16);

                /* strtol leaves the end pointer at the start position when it
                 * finds no digits (trailing whitespace, stray character).
                 * Stop there instead of looping forever on a cursor that
                 * never advances. */
                if (next == spec)
                    break;
                spec = next;

                decomp_codepoint_to_buffer(tc, n, decomp_char);
            }
        }
        else {
            decomp_hangul_to_buffer(tc, n, cp);
        }
    }
    else {
        /* Don't need to decompose; add it right into the buffer. */
        add_codepoint_to_buffer(tc, n, cp);
    }
}
/* Maps a codepoint to the digit value it represents, or -1 if it is not a
 * digit. Letters count from 10 upwards, case-insensitively. */
static int cp_value(MVMThreadContext *tc, MVMCodepoint cp) {
    /* Fast paths: ASCII digits and letters. */
    if ('0' <= cp && cp <= '9')
        return cp - '0';
    if ('a' <= cp && cp <= 'z')
        return cp - 'a' + 10;
    if ('A' <= cp && cp <= 'Z')
        return cp - 'A' + 10;

    /* Fullwidth Latin letters, uppercase then lowercase. */
    if (0xFF21 <= cp && cp <= 0xFF3A)
        return cp - 0xFF21 + 10;
    if (0xFF41 <= cp && cp <= 0xFF5A)
        return cp - 0xFF41 + 10;

    /* Anything left only has a value if it is a positive codepoint whose
     * Numeric_Type is Decimal. */
    if (cp <= 0)
        return -1;
    if (MVM_unicode_codepoint_get_property_int(tc, cp, MVM_UNICODE_PROPERTY_NUMERIC_TYPE)
            != MVM_UNICODE_PVALUE_Numeric_Type_DECIMAL)
        return -1;

    /* As of Unicode 9.0.0, characters with the 'de' Numeric Type (which are
     * thus also of General Category Nd, since 4.0.0) come in contiguous runs
     * of 10 whose Numeric Values ascend from 0 through 9. The numerator
     * string therefore holds the whole value: it is an integer 0-9 and the
     * denominator is always 1. */
    return fast_atoi(MVM_unicode_codepoint_get_property_cstr(tc, cp,
        MVM_UNICODE_PROPERTY_NUMERIC_VALUE_NUMERATOR));
}
/* Parses an integer written in the given radix out of str, starting at the
 * grapheme index offset, and packs the outcome into a freshly allocated array
 * of the current HLL's slurpy_array_type holding three boxed integers:
 *   value - the parsed number (negated for a leading '-' or flag bit 0x01)
 *   base  - radix multiplied by itself once per digit counted in value
 *   pos   - the offset just past the last accepted digit, or -1 if no digit
 *           was accepted
 * Flag bits, as used below:
 *   0x01 - negate the value
 *   0x02 - accept one leading '+' or '-'
 *   0x04 - only update value and base on a non-zero digit, so trailing
 *          zeroes are left out of both
 * Accepted digits are ASCII 0-9, a-z and A-Z, fullwidth Latin letters, and
 * codepoints whose general category is Nd. A single '_' following a digit is
 * skipped. Parsing stops at the first grapheme that is not a digit or whose
 * digit value is not below radix. Throws if radix is greater than 36.
 * NOTE(review): the end of this function (its return statement and closing
 * brace) is not visible in this excerpt. */
MVMObject * MVM_radix(MVMThreadContext *tc, MVMint64 radix, MVMString *str, MVMint64 offset, MVMint64 flag) { MVMObject *result; MVMint64 zvalue = 0; MVMint64 zbase = 1; MVMint64 chars = MVM_string_graphs(tc, str); MVMint64 value = zvalue; MVMint64 base = zbase; MVMint64 pos = -1; MVMuint16 neg = 0; MVMint64 ch; if (radix > 36) { MVM_exception_throw_adhoc(tc, "Cannot convert radix of %"PRId64" (max 36)", radix); } ch = (offset < chars) ? MVM_string_get_grapheme_at_nocheck(tc, str, offset) : 0; if ((flag & 0x02) && (ch == '+' || ch == '-')) { neg = (ch == '-'); offset++; ch = (offset < chars) ? MVM_string_get_grapheme_at_nocheck(tc, str, offset) : 0; } while (offset < chars) { if (ch >= '0' && ch <= '9') ch = ch - '0'; /* fast-path for ASCII 0..9 */ else if (ch >= 'a' && ch <= 'z') ch = ch - 'a' + 10; else if (ch >= 'A' && ch <= 'Z') ch = ch - 'A' + 10; else if (ch >= 0xFF21 && ch <= 0xFF3A) ch = ch - 0xFF21 + 10; /* uppercase fullwidth */ else if (ch >= 0xFF41 && ch <= 0xFF5A) ch = ch - 0xFF41 + 10; /* lowercase fullwidth */ else if (ch > 0 && MVM_unicode_codepoint_has_property_value(tc, ch, MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, MVM_unicode_cname_to_property_value_code(tc, MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Nd")))) { /* As of Unicode 6.0.0, we know that Nd category numerals are within * the range 0..9 */ /* the string returned for NUMERIC_VALUE contains a floating point * value, so atoi will stop on the . in the string. This is fine * though, since we'd have to truncate the float regardless. 
*/ ch = atoi(MVM_unicode_codepoint_get_property_cstr(tc, ch, MVM_UNICODE_PROPERTY_NUMERIC_VALUE)); } else break; if (ch >= radix) break; zvalue = zvalue * radix + ch; zbase = zbase * radix; offset++; pos = offset; if (ch != 0 || !(flag & 0x04)) { value=zvalue; base=zbase; } if (offset >= chars) break; ch = MVM_string_get_grapheme_at_nocheck(tc, str, offset); if (ch != '_') continue; offset++; if (offset >= chars) break; ch = MVM_string_get_grapheme_at_nocheck(tc, str, offset); } if (neg || flag & 0x01) { value = -value; } /* initialize the object */ result = MVM_repr_alloc_init(tc, MVM_hll_current(tc)->slurpy_array_type); MVMROOT(tc, result, { MVMObject *box_type = MVM_hll_current(tc)->int_box_type; MVMROOT(tc, box_type, { MVMObject *boxed = MVM_repr_box_int(tc, box_type, value); MVM_repr_push_o(tc, result, boxed); boxed = MVM_repr_box_int(tc, box_type, base); MVM_repr_push_o(tc, result, boxed); boxed = MVM_repr_box_int(tc, box_type, pos); MVM_repr_push_o(tc, result, boxed); }); });
/* Looks up the canonical combining class of a codepoint and returns it as an
 * integer. A missing property value, or one longer than three characters,
 * is reported as 0. */
static MVMint64 ccc(MVMThreadContext *tc, MVMCodepoint cp) {
    const char *class_str = MVM_unicode_codepoint_get_property_cstr(tc, cp,
        MVM_UNICODE_PROPERTY_CANONICAL_COMBINING_CLASS);

    if (class_str && strlen(class_str) <= 3)
        return atoi(class_str);

    return 0;
}
/* Tells whether a codepoint answers "yes" on the quick check property that
 * the normalizer is configured with. Returns 1 if so, 0 otherwise (including
 * when the property has no value for the codepoint). */
static MVMint64 passes_quickcheck(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint cp) {
    const char *answer = MVM_unicode_codepoint_get_property_cstr(tc, cp,
        n->quick_check_property);

    if (!answer)
        return 0;

    /* Only the first character matters: 'Y' means the check passes. */
    return answer[0] == 'Y';
}
/* Returns non-zero if the codepoint both answers "Y" on the NFG quick check
 * and has a canonical combining class of zero.
 *
 * The combining class is read in its string form. A missing value, or one
 * longer than three characters, is taken to mean class zero; otherwise the
 * string has to be exactly "0". */
static MVMint32 passes_quickcheck_and_zero_ccc(MVMThreadContext *tc, MVMCodepoint cp) {
    const char *qc_str  = MVM_unicode_codepoint_get_property_cstr(tc, cp,
        MVM_UNICODE_PROPERTY_NFG_QC);
    const char *ccc_str = MVM_unicode_codepoint_get_property_cstr(tc, cp,
        MVM_UNICODE_PROPERTY_CANONICAL_COMBINING_CLASS);
    size_t ccc_len;

    if (!qc_str || qc_str[0] != 'Y')
        return 0;
    if (!ccc_str)
        return 1;

    ccc_len = strlen(ccc_str);

    /* Compare against the character '0', not the NUL byte: a string of
     * length one can never have a NUL first character, so comparing with 0
     * made an explicit class of "0" fail this test every time. */
    return ccc_len > 3 || (ccc_len == 1 && ccc_str[0] == '0');
}