/* Test of uc_general_category_name(): checks the short name reported for
   two predefined categories and for a combination of them.  */
int
main ()
{
  /* UC_CATEGORY_Z is expected to be named by the single letter "Z".  */
  ASSERT (strcmp (uc_general_category_name (UC_CATEGORY_Z), "Z") == 0);

  /* UC_CATEGORY_Nl is expected to be named by the two letters "Nl".  */
  ASSERT (strcmp (uc_general_category_name (UC_CATEGORY_Nl), "Nl") == 0);

  /* The category obtained by OR-ing the two above is expected to have
     no short name at all, i.e. the lookup yields NULL.  */
  ASSERT (uc_general_category_name
          (uc_general_category_or (UC_CATEGORY_Z, UC_CATEGORY_Nl)) == NULL);

  return 0;
}
/* Test of uc_general_category_long_name(): checks the long name reported
   for two predefined categories and for a combination of them.  */
int
main ()
{
  /* UC_CATEGORY_Z is expected to carry the long name "Separator".  */
  ASSERT (strcmp (uc_general_category_long_name (UC_CATEGORY_Z),
                  "Separator") == 0);

  /* UC_CATEGORY_Nl is expected to carry the long name "Letter Number".  */
  ASSERT (strcmp (uc_general_category_long_name (UC_CATEGORY_Nl),
                  "Letter Number") == 0);

  /* The category obtained by OR-ing the two above is expected to have
     no long name at all, i.e. the lookup yields NULL.  */
  ASSERT (uc_general_category_long_name
          (uc_general_category_or (UC_CATEGORY_Z, UC_CATEGORY_Nl)) == NULL);

  return 0;
}
/*
 * tracker_parser_reset:
 * @parser: parser whose state is re-initialised; must not be NULL.
 * @txt: text to tokenize; must not be NULL.  Only the pointer is stored
 *       (no copy is made here), so the buffer presumably has to stay
 *       valid while the parser is in use -- confirm against callers.
 * @txt_size: length of @txt in bytes; must not be negative.
 * @max_word_length: stored on the parser as-is.
 * @enable_stemmer: stored on the parser as-is.
 * @enable_unaccent: stored on the parser as-is.
 * @ignore_stop_words: stored on the parser as-is.
 * @ignore_reserved_words: stored on the parser as-is.
 * @ignore_numbers: stored on the parser; when FALSE, numbers are added to
 *                  the category of characters allowed to start a word.
 *
 * Points @parser at a new input text: records the options, drops the
 * previously held word and word-break flags, computes fresh word-break
 * flags for the whole of @txt and rewinds the cursor and word position
 * to zero.
 *
 * If any precondition fails the function returns before touching
 * @parser.
 */
void
tracker_parser_reset (TrackerParser *parser,
                      const gchar   *txt,
                      gint           txt_size,
                      guint          max_word_length,
                      gboolean       enable_stemmer,
                      gboolean       enable_unaccent,
                      gboolean       ignore_stop_words,
                      gboolean       ignore_reserved_words,
                      gboolean       ignore_numbers)
{
	g_return_if_fail (parser != NULL);
	g_return_if_fail (txt != NULL);
	/* A negative size would be converted to a huge unsigned value by
	 * both the allocation and the word-break scan below, so reject it
	 * up front like the other invalid arguments. */
	g_return_if_fail (txt_size >= 0);

	parser->max_word_length = max_word_length;
	parser->enable_stemmer = enable_stemmer;
	parser->enable_unaccent = enable_unaccent;
	parser->ignore_stop_words = ignore_stop_words;
	parser->ignore_reserved_words = ignore_reserved_words;
	parser->ignore_numbers = ignore_numbers;

	/* Note: We're forcing some unicode characters to behave
	 * as wordbreakers: e.g, the '.' The main reason for this
	 * is to enable FTS searches matching file extension. */
	parser->enable_forced_wordbreaks = TRUE;

	parser->txt_size = txt_size;
	parser->txt = txt;

	/* Forget the word produced from the previous text, if any. */
	g_free (parser->word);
	parser->word = NULL;

	parser->word_position = 0;
	parser->cursor = 0;

	g_free (parser->word_break_flags);

	/* Create array of flags, same size as original text. */
	parser->word_break_flags = g_malloc (txt_size);

	/* Get wordbreak flags in the whole string */
	u8_wordbreaks ((const uint8_t *)txt,
	               (size_t) txt_size,
	               (char *)parser->word_break_flags);

	/* Prepare a custom category which is a combination of the
	 * desired ones */
	parser->allowed_start = UC_LETTER;
	if (!parser->ignore_numbers) {
		parser->allowed_start = uc_general_category_or (parser->allowed_start, UC_NUMBER);
	}
}
/* Checks whether the provided string is in the valid set of FreeFormClass (RFC7564 * as an RFC7613 requirement), and converts all spaces to the ASCII-space. */ static int check_for_valid_freeformclass(uint32_t *ucs4, unsigned ucs4_size) { unsigned i; int rc; uint32_t tmp[4]; size_t tmp_size; uint32_t *nrm; uc_general_category_t cat; unsigned is_invalid; /* make the union of Valid categories, excluding any invalid (i.e., control) */ cat = uc_general_category_or(UC_CATEGORY_Ll, UC_CATEGORY_Lu); /* LetterDigits */ cat = uc_general_category_or(cat, UC_CATEGORY_Lo); cat = uc_general_category_or(cat, UC_CATEGORY_Nd); cat = uc_general_category_or(cat, UC_CATEGORY_Lm); cat = uc_general_category_or(cat, UC_CATEGORY_Mn); cat = uc_general_category_or(cat, UC_CATEGORY_Mc); cat = uc_general_category_or(cat, UC_CATEGORY_Lt); /* OtherLetterDigits */ cat = uc_general_category_or(cat, UC_CATEGORY_Nl); cat = uc_general_category_or(cat, UC_CATEGORY_No); cat = uc_general_category_or(cat, UC_CATEGORY_Me); cat = uc_general_category_or(cat, UC_CATEGORY_Sm); /* Symbols */ cat = uc_general_category_or(cat, UC_CATEGORY_Sc); cat = uc_general_category_or(cat, UC_CATEGORY_So); cat = uc_general_category_or(cat, UC_CATEGORY_Sk); cat = uc_general_category_or(cat, UC_CATEGORY_Pc); /* Punctuation */ cat = uc_general_category_or(cat, UC_CATEGORY_Pd); cat = uc_general_category_or(cat, UC_CATEGORY_Ps); cat = uc_general_category_or(cat, UC_CATEGORY_Pe); cat = uc_general_category_or(cat, UC_CATEGORY_Pi); cat = uc_general_category_or(cat, UC_CATEGORY_Pf); cat = uc_general_category_or(cat, UC_CATEGORY_Po); cat = uc_general_category_or(cat, UC_CATEGORY_Zs); /* Spaces */ cat = uc_general_category_and_not(cat, UC_CATEGORY_Cc); /* Not in Control */ /* check for being in the allowed sets in rfc7564#section-4.3 */ for (i=0;i<ucs4_size;i++) { is_invalid = 0; /* Disallowed o Old Hangul Jamo characters, i.e., the OldHangulJamo ("I") category [FIXME: not handled in this code] o Control characters, i.e., the 
Controls ("L") category o Ignorable characters, i.e., the PrecisIgnorableProperties ("M") */ if (uc_is_property_default_ignorable_code_point(ucs4[i]) || uc_is_property_not_a_character(ucs4[i])) { return gnutls_assert_val(GNUTLS_E_INVALID_UTF8_STRING); } /* Contextual rules - we do not implement them / we reject chars from these sets o A number of characters from the Exceptions ("F") category defined o Joining characters, i.e., the JoinControl ("H") category defined */ rc = is_allowed_exception(ucs4[i]); if (rc == 0 || uc_is_property_join_control(ucs4[i])) return gnutls_assert_val(GNUTLS_E_INVALID_UTF8_STRING); if (rc == 1) /* exceptionally allowed, continue */ continue; /* Replace all spaces; an RFC7613 requirement */ if (uc_is_general_category(ucs4[i], UC_CATEGORY_Zs)) /* replace */ ucs4[i] = 0x20; /* Valid */ if ((ucs4[i] < 0x21 || ucs4[i] > 0x7E) && !uc_is_general_category(ucs4[i], cat)) is_invalid = 1; /* HasCompat */ if (is_invalid) { tmp_size = sizeof(tmp)/sizeof(tmp[0]); nrm = u32_normalize(UNINORM_NFKC, &ucs4[i], 1, tmp, &tmp_size); if (nrm == NULL || (tmp_size == 1 && nrm[0] == ucs4[i])) return gnutls_assert_val(GNUTLS_E_INVALID_UTF8_STRING); } } return 0; }