Example #1
0
/* Exercise uc_general_category() together with uc_is_general_category():
   first over every code point, then on a few ASCII ranges.  */
int
main ()
{
  unsigned int uc;

  /* Each code point must belong to the category reported for it.  */
  for (uc = 0; uc < 0x110000; uc++)
    ASSERT (uc_is_general_category (uc, uc_general_category (uc)));

  /* The category of '7' must contain 0x30..0x39 and nothing in
     0x40..0x7F.  */
  {
    uc_general_category_t category_of_7 = uc_general_category ('7');

    for (uc = 0x30; uc <= 0x39; uc++)
      ASSERT (uc_is_general_category (uc, category_of_7));
    for (uc = 0x40; uc < 0x80; uc++)
      ASSERT (!uc_is_general_category (uc, category_of_7));
  }

  /* The category of 'x' must contain 0x61..0x7A and nothing in
     0x41..0x5A.  */
  {
    uc_general_category_t category_of_x = uc_general_category ('x');

    for (uc = 0x41; uc <= 0x5A; uc++)
      ASSERT (!uc_is_general_category (uc, category_of_x));
    for (uc = 0x61; uc <= 0x7A; uc++)
      ASSERT (uc_is_general_category (uc, category_of_x));
  }

  return 0;
}
Example #2
0
/* Return true if UC has the lowercase property, the uppercase property,
   or belongs to the UC_TITLECASE_LETTER general category.  The checks are
   made in that order and stop at the first one that succeeds.  */
bool
uc_is_cased (ucs4_t uc)
{
  if (uc_is_property_lowercase (uc))
    return true;

  if (uc_is_property_uppercase (uc))
    return true;

  return uc_is_general_category (uc, UC_TITLECASE_LETTER);
}
Example #3
0
/* Check that no code point is reported as a member of
   _UC_CATEGORY_NONE.  */
int
main ()
{
  uc_general_category_t none = _UC_CATEGORY_NONE;
  unsigned int uc = 0;

  while (uc < 0x110000)
    {
      ASSERT (!uc_is_general_category (uc, none));
      uc++;
    }

  return 0;
}
Example #4
0
/* Return true if UC belongs to the general category UC_CATEGORY_Lt.  */
bool
uc_is_property_titlecase (ucs4_t uc)
{
  bool in_category = uc_is_general_category (uc, UC_CATEGORY_Lt);

  return in_category;
}
Example #5
0
/* Return true if UC belongs to the general category UC_CATEGORY_P.  */
bool
uc_is_property_punctuation (ucs4_t uc)
{
  bool in_category = uc_is_general_category (uc, UC_CATEGORY_P);

  return in_category;
}
Example #6
0
/* Return true if UC belongs to the general category UC_CATEGORY_Nd.  */
bool
uc_is_property_decimal_digit (ucs4_t uc)
{
  bool in_category = uc_is_general_category (uc, UC_CATEGORY_Nd);

  return in_category;
}
Example #7
0
/* Return true if UC belongs to the general category UC_CATEGORY_Cc.  */
bool
uc_is_property_iso_control (ucs4_t uc)
{
  bool in_category = uc_is_general_category (uc, UC_CATEGORY_Cc);

  return in_category;
}
/* Inspect the word starting at parser->cursor.
 *
 * On success (TRUE returned):
 *  - *p_word_length receives the byte length of the word;
 *  - *p_is_allowed_word_start tells whether the first character is an
 *    underscore or belongs to parser->allowed_start;
 *  - *p_word_type is set only when *p_is_allowed_word_start is TRUE.
 *
 * FALSE is returned when the first character cannot be decoded (or is NUL);
 * in that case only *p_is_allowed_word_start has been written.
 */
static gboolean
get_word_info (TrackerParser         *parser,
               gsize                 *p_word_length,
               gboolean              *p_is_allowed_word_start,
               TrackerParserWordType *p_word_type)
{
	ucs4_t lead_char;
	gint lead_len;
	gboolean only_ascii;

	/* Assume the word start is acceptable until proven otherwise */
	*p_is_allowed_word_start = TRUE;

	/* Decode the leading UTF-8 sequence into a UCS4 code point */
	lead_len = u8_strmbtouc (&lead_char,
	                         &(parser->txt[parser->cursor]));

	/* Nothing decodable at the cursor: force the caller to stop */
	if (lead_len <= 0)
		return FALSE;

	/* A one-byte leading sequence means the first character is ASCII-7 */
	only_ascii = (lead_len == 1);

	if (parser->enable_forced_wordbreaks &&
	    IS_FORCED_WORDBREAK_UCS4 ((guint32)lead_char)) {
		/* The forced wordbreak character is a word on its own */
		*p_word_length = lead_len;
	} else {
		gsize end;

		/* Walk byte by byte until the text ends, a Unicode word break
		 *  is flagged, or a forced word break is met; track along the
		 *  way whether every byte seen is ASCII */
		for (end = parser->cursor + lead_len;
		     end < parser->txt_size;
		     end++) {
			if (parser->word_break_flags[end])
				break;

			if (parser->enable_forced_wordbreaks &&
			    IS_FORCED_WORDBREAK_UCS4 ((guint32)parser->txt[end]))
				break;

			if (ascii_only_guard (only_ascii) &&
			    !IS_ASCII_UCS4 ((guint32)parser->txt[end]))
				only_ascii = FALSE;
		}

		/* 'end' is one past the last byte of the word: either the
		 *  start of the next word or the end of the text */
		*p_word_length = end - parser->cursor;
	}

	/* Only words whose first character is an underscore or belongs to
	 *  the allowed start categories are wanted; the word break algorithm
	 *  also reports breaks after punctuation such as commas.  Checking
	 *  just the first character should be compatible with all Unicode
	 *  normalization methods.
	 */
	if (!(IS_UNDERSCORE_UCS4 ((guint32)lead_char) ||
	      uc_is_general_category (lead_char,
	                              parser->allowed_start))) {
		*p_is_allowed_word_start = FALSE;
		return TRUE;
	}

	/* Classify the word */
	if (only_ascii)
		*p_word_type = TRACKER_PARSER_WORD_TYPE_ASCII;
	else if (IS_CJK_UCS4 (lead_char))
		*p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC;
	else
		*p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_UNAC;

	return TRUE;
}
/* Return true if UC belongs to the general category UC_CATEGORY_Sc.  */
bool
uc_is_property_currency_symbol (ucs4_t uc)
{
  bool in_category = uc_is_general_category (uc, UC_CATEGORY_Sc);

  return in_category;
}
Example #10
0
/* Checks whether the provided string is in the valid set of FreeFormClass (RFC7564
 * as an RFC7613 requirement), and converts all spaces to the ASCII-space.
 *
 * ucs4:      code points to check; every entry in general category Zs is
 *            overwritten in place with 0x20.
 * ucs4_size: number of entries in ucs4.
 *
 * Returns 0 when every code point is accepted, otherwise
 * GNUTLS_E_INVALID_UTF8_STRING (through gnutls_assert_val()). The space
 * replacement is done while scanning, so a rejected string may already have
 * been partially rewritten.
 */
static int check_for_valid_freeformclass(uint32_t *ucs4, unsigned ucs4_size)
{
	unsigned i;
	int rc;
	uint32_t tmp[4];
	size_t tmp_size;
	uint32_t *nrm;
	uc_general_category_t cat;
	unsigned is_invalid;
	unsigned no_compat;

	/* make the union of Valid categories, excluding any invalid (i.e., control) */
	cat = uc_general_category_or(UC_CATEGORY_Ll, UC_CATEGORY_Lu); /* LetterDigits */
	cat = uc_general_category_or(cat, UC_CATEGORY_Lo);
	cat = uc_general_category_or(cat, UC_CATEGORY_Nd);
	cat = uc_general_category_or(cat, UC_CATEGORY_Lm);
	cat = uc_general_category_or(cat, UC_CATEGORY_Mn);
	cat = uc_general_category_or(cat, UC_CATEGORY_Mc);
	cat = uc_general_category_or(cat, UC_CATEGORY_Lt); /* OtherLetterDigits */
	cat = uc_general_category_or(cat, UC_CATEGORY_Nl);
	cat = uc_general_category_or(cat, UC_CATEGORY_No);
	cat = uc_general_category_or(cat, UC_CATEGORY_Me);
	cat = uc_general_category_or(cat, UC_CATEGORY_Sm); /* Symbols */
	cat = uc_general_category_or(cat, UC_CATEGORY_Sc);
	cat = uc_general_category_or(cat, UC_CATEGORY_So);
	cat = uc_general_category_or(cat, UC_CATEGORY_Sk);
	cat = uc_general_category_or(cat, UC_CATEGORY_Pc); /* Punctuation */
	cat = uc_general_category_or(cat, UC_CATEGORY_Pd);
	cat = uc_general_category_or(cat, UC_CATEGORY_Ps);
	cat = uc_general_category_or(cat, UC_CATEGORY_Pe);
	cat = uc_general_category_or(cat, UC_CATEGORY_Pi);
	cat = uc_general_category_or(cat, UC_CATEGORY_Pf);
	cat = uc_general_category_or(cat, UC_CATEGORY_Po);
	cat = uc_general_category_or(cat, UC_CATEGORY_Zs); /* Spaces */
	cat = uc_general_category_and_not(cat, UC_CATEGORY_Cc); /* Not in Control */

	/* check for being in the allowed sets in rfc7564#section-4.3 */
	for (i=0;i<ucs4_size;i++) {
		is_invalid = 0;

		/* Disallowed
		   o  Old Hangul Jamo characters, i.e., the OldHangulJamo ("I") category
		      [FIXME: not handled in this code]

		   o  Control characters, i.e., the Controls ("L") category

		   o  Ignorable characters, i.e., the PrecisIgnorableProperties ("M")
		 */
		if (uc_is_property_default_ignorable_code_point(ucs4[i]) ||
		    uc_is_property_not_a_character(ucs4[i])) {
			return gnutls_assert_val(GNUTLS_E_INVALID_UTF8_STRING);
		}


		/* Contextual rules - we do not implement them / we reject chars from these sets
		   o  A number of characters from the Exceptions ("F") category defined

		   o  Joining characters, i.e., the JoinControl ("H") category defined
		 */
		rc = is_allowed_exception(ucs4[i]);
		if (rc == 0 || uc_is_property_join_control(ucs4[i]))
			return gnutls_assert_val(GNUTLS_E_INVALID_UTF8_STRING);

		if (rc == 1) /* exceptionally allowed, continue */
			continue;


		/* Replace all spaces; an RFC7613 requirement
		 */
		if (uc_is_general_category(ucs4[i], UC_CATEGORY_Zs)) /* replace */
			ucs4[i] = 0x20;

		/* Valid: printable ASCII (0x21..0x7E) or a member of the union above */
		if ((ucs4[i] < 0x21 || ucs4[i] > 0x7E) && !uc_is_general_category(ucs4[i], cat))
			is_invalid = 1;

		/* HasCompat: a code point outside the Valid set is still accepted
		 * when NFKC maps it to something other than itself */
		if (is_invalid) {
			tmp_size = sizeof(tmp)/sizeof(tmp[0]);
			nrm = u32_normalize(UNINORM_NFKC, &ucs4[i], 1, tmp, &tmp_size);
			no_compat = (nrm == NULL || (tmp_size == 1 && nrm[0] == ucs4[i]));

			/* When the normalized form does not fit in tmp,
			 * u32_normalize() returns a freshly allocated buffer
			 * instead; release it so it is not leaked. */
			if (nrm != tmp)
				free(nrm);

			if (no_compat)
				return gnutls_assert_val(GNUTLS_E_INVALID_UTF8_STRING);
		}
	}

	return 0;
}
Example #11
0
/* Return true if UC belongs to the general category UC_CATEGORY_Zs.  */
bool
uc_is_property_space (ucs4_t uc)
{
  bool in_category = uc_is_general_category (uc, UC_CATEGORY_Zs);

  return in_category;
}