/* Self-test for uc_is_general_category() against uc_general_category().  */
int
main ()
{
  unsigned int cp;
  uc_general_category_t digit_cat;
  uc_general_category_t lower_cat;

  /* Every code point must be a member of the category that
     uc_general_category() reports for it.  */
  for (cp = 0; cp < 0x110000; cp++)
    ASSERT (uc_is_general_category (cp, uc_general_category (cp)));

  /* The category of '7' must contain 0x30..0x39 and nothing in
     0x40..0x7F.  */
  digit_cat = uc_general_category ('7');
  for (cp = 0x30; cp <= 0x39; cp++)
    ASSERT (uc_is_general_category (cp, digit_cat));
  for (cp = 0x40; cp < 0x80; cp++)
    ASSERT (!uc_is_general_category (cp, digit_cat));

  /* The category of 'x' must contain 0x61..0x7A and nothing in
     0x41..0x5A.  */
  lower_cat = uc_general_category ('x');
  for (cp = 0x41; cp <= 0x5A; cp++)
    ASSERT (!uc_is_general_category (cp, lower_cat));
  for (cp = 0x61; cp <= 0x7A; cp++)
    ASSERT (uc_is_general_category (cp, lower_cat));

  return 0;
}
/* Return true if UC has the lowercase property, the uppercase property,
   or is in the UC_TITLECASE_LETTER general category.  The checks run in
   that order and stop at the first one that succeeds.  */
bool
uc_is_cased (ucs4_t uc)
{
  if (uc_is_property_lowercase (uc))
    return true;

  if (uc_is_property_uppercase (uc))
    return true;

  if (uc_is_general_category (uc, UC_TITLECASE_LETTER))
    return true;

  return false;
}
/* Self-test: no code point in 0..0x10FFFF may be reported as a member of
   _UC_CATEGORY_NONE.  */
int
main ()
{
  uc_general_category_t none = _UC_CATEGORY_NONE;
  unsigned int cp = 0;

  while (cp < 0x110000)
    {
      ASSERT (!uc_is_general_category (cp, none));
      cp++;
    }

  return 0;
}
/* Titlecase property test: true exactly when UC is a member of the
   UC_CATEGORY_Lt general category.  */
bool
uc_is_property_titlecase (ucs4_t uc)
{
  return uc_is_general_category (uc, UC_CATEGORY_Lt);
}
/* Punctuation property test: true exactly when UC is a member of the
   UC_CATEGORY_P general category.  */
bool
uc_is_property_punctuation (ucs4_t uc)
{
  return uc_is_general_category (uc, UC_CATEGORY_P);
}
/* Decimal-digit property test: true exactly when UC is a member of the
   UC_CATEGORY_Nd general category.  */
bool
uc_is_property_decimal_digit (ucs4_t uc)
{
  return uc_is_general_category (uc, UC_CATEGORY_Nd);
}
/* ISO-control property test: true exactly when UC is a member of the
   UC_CATEGORY_Cc general category.  */
bool
uc_is_property_iso_control (ucs4_t uc)
{
  return uc_is_general_category (uc, UC_CATEGORY_Cc);
}
static gboolean get_word_info (TrackerParser *parser, gsize *p_word_length, gboolean *p_is_allowed_word_start, TrackerParserWordType *p_word_type) { ucs4_t first_unichar; gint first_unichar_len; gboolean ascii_only; /* Defaults */ *p_is_allowed_word_start = TRUE; /* Get first character of the word as UCS4 */ first_unichar_len = u8_strmbtouc (&first_unichar, &(parser->txt[parser->cursor])); if (first_unichar_len <= 0) { /* This should only happen if NIL was passed to u8_strmbtouc, * so better just force stop here */ return FALSE; } else { /* If first character has length 1, it's ASCII-7 */ ascii_only = first_unichar_len == 1 ? TRUE : FALSE; } /* Consider word starts with a forced wordbreak */ if (parser->enable_forced_wordbreaks && IS_FORCED_WORDBREAK_UCS4 ((guint32)first_unichar)) { *p_word_length = first_unichar_len; } else { gsize i; /* Find next word break, and in the same loop checking if only ASCII * characters */ i = parser->cursor + first_unichar_len; while (1) { /* Text bounds reached? */ if (i >= parser->txt_size) break; /* Proper unicode word break detected? */ if (parser->word_break_flags[i]) break; /* Forced word break detected? */ if (parser->enable_forced_wordbreaks && IS_FORCED_WORDBREAK_UCS4 ((guint32)parser->txt[i])) break; if (ascii_only && !IS_ASCII_UCS4 ((guint32)parser->txt[i])) { ascii_only = FALSE; } i++; } /* Word end is the first byte after the word, which is either the * start of next word or the end of the string */ *p_word_length = i - parser->cursor; } /* We only want the words where the first character * in the word is either a letter, a number or a symbol. * This is needed because the word break algorithm also * considers word breaks after for example commas or other * punctuation marks. * Note that looking at the first character in the string * should be compatible with all Unicode normalization * methods. 
*/ if (!IS_UNDERSCORE_UCS4 ((guint32)first_unichar) && !uc_is_general_category (first_unichar, parser->allowed_start)) { *p_is_allowed_word_start = FALSE; return TRUE; } /* Decide word type */ if (ascii_only) { *p_word_type = TRACKER_PARSER_WORD_TYPE_ASCII; } else if (IS_CJK_UCS4 (first_unichar)) { *p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC; } else { *p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_UNAC; } return TRUE; }
/* Currency-symbol property test: true exactly when UC is a member of the
   UC_CATEGORY_Sc general category.  */
bool
uc_is_property_currency_symbol (ucs4_t uc)
{
  return uc_is_general_category (uc, UC_CATEGORY_Sc);
}
/* Checks whether the provided string is in the valid set of FreeFormClass (RFC7564 * as an RFC7613 requirement), and converts all spaces to the ASCII-space. */ static int check_for_valid_freeformclass(uint32_t *ucs4, unsigned ucs4_size) { unsigned i; int rc; uint32_t tmp[4]; size_t tmp_size; uint32_t *nrm; uc_general_category_t cat; unsigned is_invalid; /* make the union of Valid categories, excluding any invalid (i.e., control) */ cat = uc_general_category_or(UC_CATEGORY_Ll, UC_CATEGORY_Lu); /* LetterDigits */ cat = uc_general_category_or(cat, UC_CATEGORY_Lo); cat = uc_general_category_or(cat, UC_CATEGORY_Nd); cat = uc_general_category_or(cat, UC_CATEGORY_Lm); cat = uc_general_category_or(cat, UC_CATEGORY_Mn); cat = uc_general_category_or(cat, UC_CATEGORY_Mc); cat = uc_general_category_or(cat, UC_CATEGORY_Lt); /* OtherLetterDigits */ cat = uc_general_category_or(cat, UC_CATEGORY_Nl); cat = uc_general_category_or(cat, UC_CATEGORY_No); cat = uc_general_category_or(cat, UC_CATEGORY_Me); cat = uc_general_category_or(cat, UC_CATEGORY_Sm); /* Symbols */ cat = uc_general_category_or(cat, UC_CATEGORY_Sc); cat = uc_general_category_or(cat, UC_CATEGORY_So); cat = uc_general_category_or(cat, UC_CATEGORY_Sk); cat = uc_general_category_or(cat, UC_CATEGORY_Pc); /* Punctuation */ cat = uc_general_category_or(cat, UC_CATEGORY_Pd); cat = uc_general_category_or(cat, UC_CATEGORY_Ps); cat = uc_general_category_or(cat, UC_CATEGORY_Pe); cat = uc_general_category_or(cat, UC_CATEGORY_Pi); cat = uc_general_category_or(cat, UC_CATEGORY_Pf); cat = uc_general_category_or(cat, UC_CATEGORY_Po); cat = uc_general_category_or(cat, UC_CATEGORY_Zs); /* Spaces */ cat = uc_general_category_and_not(cat, UC_CATEGORY_Cc); /* Not in Control */ /* check for being in the allowed sets in rfc7564#section-4.3 */ for (i=0;i<ucs4_size;i++) { is_invalid = 0; /* Disallowed o Old Hangul Jamo characters, i.e., the OldHangulJamo ("I") category [FIXME: not handled in this code] o Control characters, i.e., the 
Controls ("L") category o Ignorable characters, i.e., the PrecisIgnorableProperties ("M") */ if (uc_is_property_default_ignorable_code_point(ucs4[i]) || uc_is_property_not_a_character(ucs4[i])) { return gnutls_assert_val(GNUTLS_E_INVALID_UTF8_STRING); } /* Contextual rules - we do not implement them / we reject chars from these sets o A number of characters from the Exceptions ("F") category defined o Joining characters, i.e., the JoinControl ("H") category defined */ rc = is_allowed_exception(ucs4[i]); if (rc == 0 || uc_is_property_join_control(ucs4[i])) return gnutls_assert_val(GNUTLS_E_INVALID_UTF8_STRING); if (rc == 1) /* exceptionally allowed, continue */ continue; /* Replace all spaces; an RFC7613 requirement */ if (uc_is_general_category(ucs4[i], UC_CATEGORY_Zs)) /* replace */ ucs4[i] = 0x20; /* Valid */ if ((ucs4[i] < 0x21 || ucs4[i] > 0x7E) && !uc_is_general_category(ucs4[i], cat)) is_invalid = 1; /* HasCompat */ if (is_invalid) { tmp_size = sizeof(tmp)/sizeof(tmp[0]); nrm = u32_normalize(UNINORM_NFKC, &ucs4[i], 1, tmp, &tmp_size); if (nrm == NULL || (tmp_size == 1 && nrm[0] == ucs4[i])) return gnutls_assert_val(GNUTLS_E_INVALID_UTF8_STRING); } } return 0; }
/* Space property test: true exactly when UC is a member of the
   UC_CATEGORY_Zs general category.  */
bool
uc_is_property_space (ucs4_t uc)
{
  return uc_is_general_category (uc, UC_CATEGORY_Zs);
}