コード例 #1
0
ファイル: test-categ_name.c プロジェクト: Distrotech/gnulib
int
main ()
{
  /* The Z category is expected to report the short name "Z".  */
  const char *z_name = uc_general_category_name (UC_CATEGORY_Z);
  ASSERT (strcmp (z_name, "Z") == 0);

  /* The Nl category is expected to report the short name "Nl".  */
  const char *nl_name = uc_general_category_name (UC_CATEGORY_Nl);
  ASSERT (strcmp (nl_name, "Nl") == 0);

  /* The union of Z and Nl is expected to have no short name at all.  */
  ASSERT (uc_general_category_name
            (uc_general_category_or (UC_CATEGORY_Z, UC_CATEGORY_Nl))
          == NULL);

  return 0;
}
コード例 #2
0
int
main ()
{
  /* The Z category is expected to report the long name "Separator".  */
  const char *z_long = uc_general_category_long_name (UC_CATEGORY_Z);
  ASSERT (strcmp (z_long, "Separator") == 0);

  /* The Nl category is expected to report the long name "Letter Number".  */
  const char *nl_long = uc_general_category_long_name (UC_CATEGORY_Nl);
  ASSERT (strcmp (nl_long, "Letter Number") == 0);

  /* The union of Z and Nl is expected to have no long name at all.  */
  ASSERT (uc_general_category_long_name
            (uc_general_category_or (UC_CATEGORY_Z, UC_CATEGORY_Nl))
          == NULL);

  return 0;
}
コード例 #3
0
/* Re-initialises @parser so that it works on a new piece of text.
 *
 * parser:                parser state to reset; must not be NULL.
 * txt:                   text to parse; must not be NULL. Only the pointer
 *                        is stored, the text is not copied here, so it
 *                        presumably has to outlive the parsing that follows.
 * txt_size:              size of txt in bytes; must not be negative.
 * max_word_length,
 * enable_stemmer,
 * enable_unaccent,
 * ignore_stop_words,
 * ignore_reserved_words,
 * ignore_numbers:        stored verbatim in the parser's fields of the
 *                        same name.
 *
 * Any previously held word and word-break flag array are freed, the
 * cursor and word position are rewound to 0, and a fresh word-break flag
 * array (one flag per byte of txt) is computed with u8_wordbreaks().
 *
 * If a precondition fails the function returns early through
 * g_return_if_fail() and leaves the parser untouched. */
void
tracker_parser_reset (TrackerParser *parser,
                      const gchar   *txt,
                      gint           txt_size,
                      guint          max_word_length,
                      gboolean       enable_stemmer,
                      gboolean       enable_unaccent,
                      gboolean       ignore_stop_words,
                      gboolean       ignore_reserved_words,
                      gboolean       ignore_numbers)
{
	g_return_if_fail (parser != NULL);
	g_return_if_fail (txt != NULL);
	/* txt_size is signed: a negative value would be converted to a
	 * huge unsigned size by g_malloc() and u8_wordbreaks() below.
	 * Reject it before touching any parser state. */
	g_return_if_fail (txt_size >= 0);

	parser->max_word_length = max_word_length;
	parser->enable_stemmer = enable_stemmer;
	parser->enable_unaccent = enable_unaccent;
	parser->ignore_stop_words = ignore_stop_words;
	parser->ignore_reserved_words = ignore_reserved_words;
	parser->ignore_numbers = ignore_numbers;

	/* Note: We're forcing some unicode characters to behave
	 * as wordbreakers: e.g, the '.' The main reason for this
	 * is to enable FTS searches matching file extension. */
	parser->enable_forced_wordbreaks = TRUE;

	parser->txt_size = txt_size;
	parser->txt = txt;

	/* Drop the word left over from the previous text, if any. */
	g_free (parser->word);
	parser->word = NULL;

	parser->word_position = 0;

	parser->cursor = 0;

	/* Drop the flags computed for the previous text, if any. */
	g_free (parser->word_break_flags);

	/* Create array of flags, same size as original text. */
	parser->word_break_flags = g_malloc (txt_size);

	/* Get wordbreak flags in the whole string */
	u8_wordbreaks ((const uint8_t *)txt,
	               (size_t) txt_size,
	               (char *)parser->word_break_flags);

	/* Prepare a custom category which is a combination of the
	 * desired ones: letters always, numbers unless they are to
	 * be ignored. */
	parser->allowed_start = UC_LETTER;
	if (!parser->ignore_numbers) {
		parser->allowed_start = uc_general_category_or (parser->allowed_start, UC_NUMBER);
	}
}
コード例 #4
0
/* Checks whether the provided string is in the valid set of FreeFormClass (RFC7564
 * as an RFC7613 requirement), and converts all spaces to the ASCII-space.
 *
 * ucs4:      code points to check; modified in place (every code point of
 *            general category Zs that is reached is overwritten with 0x20).
 * ucs4_size: number of code points in ucs4.
 *
 * Returns 0 when every code point is accepted. On the first rejected code
 * point it returns gnutls_assert_val(GNUTLS_E_INVALID_UTF8_STRING); spaces
 * located before that code point have already been replaced at that time.
 */
static int check_for_valid_freeformclass(uint32_t *ucs4, unsigned ucs4_size)
{
	unsigned i;
	int rc;
	uint32_t tmp[4];
	size_t tmp_size;
	uint32_t *nrm;
	uc_general_category_t cat;
	unsigned is_invalid;
	int unchanged;

	/* make the union of Valid categories, excluding any invalid (i.e., control) */
	cat = uc_general_category_or(UC_CATEGORY_Ll, UC_CATEGORY_Lu); /* LetterDigits */
	cat = uc_general_category_or(cat, UC_CATEGORY_Lo);
	cat = uc_general_category_or(cat, UC_CATEGORY_Nd);
	cat = uc_general_category_or(cat, UC_CATEGORY_Lm);
	cat = uc_general_category_or(cat, UC_CATEGORY_Mn);
	cat = uc_general_category_or(cat, UC_CATEGORY_Mc);
	cat = uc_general_category_or(cat, UC_CATEGORY_Lt); /* OtherLetterDigits */
	cat = uc_general_category_or(cat, UC_CATEGORY_Nl);
	cat = uc_general_category_or(cat, UC_CATEGORY_No);
	cat = uc_general_category_or(cat, UC_CATEGORY_Me);
	cat = uc_general_category_or(cat, UC_CATEGORY_Sm); /* Symbols */
	cat = uc_general_category_or(cat, UC_CATEGORY_Sc);
	cat = uc_general_category_or(cat, UC_CATEGORY_So);
	cat = uc_general_category_or(cat, UC_CATEGORY_Sk);
	cat = uc_general_category_or(cat, UC_CATEGORY_Pc); /* Punctuation */
	cat = uc_general_category_or(cat, UC_CATEGORY_Pd);
	cat = uc_general_category_or(cat, UC_CATEGORY_Ps);
	cat = uc_general_category_or(cat, UC_CATEGORY_Pe);
	cat = uc_general_category_or(cat, UC_CATEGORY_Pi);
	cat = uc_general_category_or(cat, UC_CATEGORY_Pf);
	cat = uc_general_category_or(cat, UC_CATEGORY_Po);
	cat = uc_general_category_or(cat, UC_CATEGORY_Zs); /* Spaces */
	cat = uc_general_category_and_not(cat, UC_CATEGORY_Cc); /* Not in Control */

	/* check for being in the allowed sets in rfc7564#section-4.3 */
	for (i=0;i<ucs4_size;i++) {
		is_invalid = 0;

		/* Disallowed
		   o  Old Hangul Jamo characters, i.e., the OldHangulJamo ("I") category
		      [FIXME: not handled in this code]

		   o  Control characters, i.e., the Controls ("L") category

		   o  Ignorable characters, i.e., the PrecisIgnorableProperties ("M")
		 */
		if (uc_is_property_default_ignorable_code_point(ucs4[i]) ||
		    uc_is_property_not_a_character(ucs4[i])) {
			return gnutls_assert_val(GNUTLS_E_INVALID_UTF8_STRING);
		}


		/* Contextual rules - we do not implement them / we reject chars from these sets
		   o  A number of characters from the Exceptions ("F") category defined

		   o  Joining characters, i.e., the JoinControl ("H") category defined
		 */
		/* rc == 0 rejects the code point, rc == 1 accepts it
		 * outright; any other value falls through to the general
		 * checks below. */
		rc = is_allowed_exception(ucs4[i]);
		if (rc == 0 || uc_is_property_join_control(ucs4[i]))
			return gnutls_assert_val(GNUTLS_E_INVALID_UTF8_STRING);

		if (rc == 1) /* exceptionally allowed, continue */
			continue;


		/* Replace all spaces; an RFC7613 requirement
		 */
		if (uc_is_general_category(ucs4[i], UC_CATEGORY_Zs)) /* replace */
			ucs4[i] = 0x20;

		/* Valid: 0x21..0x7E, or any member of the union built above */
		if ((ucs4[i] < 0x21 || ucs4[i] > 0x7E) && !uc_is_general_category(ucs4[i], cat))
			is_invalid = 1;

		/* HasCompat: a code point outside the valid set is still
		 * tolerated when its NFKC form differs from itself */
		if (is_invalid) {
			tmp_size = sizeof(tmp)/sizeof(tmp[0]);
			nrm = u32_normalize(UNINORM_NFKC, &ucs4[i], 1, tmp, &tmp_size);
			if (nrm == NULL)
				return gnutls_assert_val(GNUTLS_E_INVALID_UTF8_STRING);

			/* read the result before the buffer may be released */
			unchanged = (tmp_size == 1 && nrm[0] == ucs4[i]);

			/* u32_normalize() returns a freshly allocated array
			 * instead of tmp when the result does not fit into
			 * tmp; release it so that it is not leaked */
			if (nrm != tmp)
				free(nrm);

			if (unchanged)
				return gnutls_assert_val(GNUTLS_E_INVALID_UTF8_STRING);
		}
	}

	return 0;
}