Ejemplo n.º 1
0
int
main (int argc, char *argv[])
{
  const char *expected;
  const char *language;

  /* configure should already have checked that the locale is supported.  */
  if (setlocale (LC_ALL, "") == NULL)
    return 1;

  expected = argv[1];

  language = uc_locale_language ();
  ASSERT (strcmp (language, expected) == 0);

  return 0;
}
Ejemplo n.º 2
0
int
main (int argc, char * argv[])
{
  setlocale (LC_ALL, "");
  if (argc == 1)
    {
      /* Display the upper case of the input string.  */
      char *input = read_file (stdin);
      int length = strlen (input);
      size_t output_length;
      uint8_t *output =
        u8_toupper ((uint8_t *) input, length, uc_locale_language (),
                    NULL,
                    NULL, &output_length);

      fwrite (output, 1, output_length, stdout);

      return 0;
    }
  else
    return 1;
}
Ejemplo n.º 3
0
static gchar *
process_word_utf8 (TrackerParser         *parser,
                   const gchar           *word,
                   gint                   length,
                   TrackerParserWordType  type,
                   gboolean              *stop_word)
{
	gchar word_buffer [WORD_BUFFER_LENGTH];
	gchar *normalized = NULL;
	gchar *stemmed = NULL;
	size_t new_word_length;

	g_return_val_if_fail (parser != NULL, NULL);
	g_return_val_if_fail (word != NULL, NULL);

	/* If length is set as -1, the input word MUST be NIL-terminated.
	 * Otherwise, this restriction is not needed as the length to process
	 * is given as input argument */
	if (length < 0) {
		length = strlen (word);
	}

	/* Log original word */
	tracker_parser_message_hex ("ORIGINAL word",
	                            word, length);

	/* Normalization and case-folding ONLY for non-ASCII */
	if (type != TRACKER_PARSER_WORD_TYPE_ASCII) {
		/* Leave space for last NIL */
		new_word_length = WORD_BUFFER_LENGTH - 1;

		/* Casefold and NFKD normalization in output.
		 * NOTE: if the output buffer is not big enough, u8_casefold will
		 * return a newly-allocated buffer. */
		normalized = u8_casefold ((const uint8_t *)word,
		                          length,
		                          uc_locale_language (),
		                          UNINORM_NFKD,
		                          word_buffer,
		                          &new_word_length);

		/* Case folding + Normalization failed, ignore this word */
		g_return_val_if_fail (normalized != NULL, NULL);

		/* If output buffer is not the same as the one passed to
		 * u8_casefold, we know it was newly-allocated, so need
		 * to resize it in 1 byte to add last NIL */
		if (normalized != word_buffer) {
			normalized = g_realloc (normalized, new_word_length + 1);
		}

		/* Log after Normalization */
		tracker_parser_message_hex (" After Casefolding and NFKD normalization",
		                            normalized, new_word_length);
	} else {
		/* For ASCII-only, just tolower() each character */
		gsize i;

		normalized = length > WORD_BUFFER_LENGTH ? g_malloc (length + 1) : word_buffer;

		for (i = 0; i < length; i++) {
			normalized[i] = g_ascii_tolower (word[i]);
		}

		new_word_length = length;

		/* Log after tolower */
		tracker_parser_message_hex (" After Lowercasing",
		                            normalized, new_word_length);
	}

	/* Set output NIL */
	normalized[new_word_length] = '\0';

	/* UNAC stripping needed? (for non-CJK and non-ASCII) */
	if (parser->enable_unaccent &&
	    type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC &&
	    tracker_parser_unaccent_nfkd_string (normalized, &new_word_length)) {
		/* Log after UNAC stripping */
		tracker_parser_message_hex ("  After UNAC stripping",
		                            normalized, new_word_length);
	}

	/* Check if stop word */
	if (parser->ignore_stop_words) {
		*stop_word = tracker_language_is_stop_word (parser->language,
		                                            normalized);
	}

	/* Stemming needed? */
	if (parser->enable_stemmer) {
		stemmed = tracker_language_stem_word (parser->language,
		                                      normalized,
		                                      new_word_length);

		/* Log after stemming */
		tracker_parser_message_hex ("   After stemming",
		                            stemmed, strlen (stemmed));
	}

	/* If stemmed wanted and succeeded, free previous and return it */
	if (stemmed) {
		if (normalized != word_buffer) {
			g_free (normalized);
		}
		return stemmed;
	}

	/* It may be the case that no stripping and no stemming was needed, and
	 * that the output buffer in stack was enough for case-folding and
	 * normalization. In this case, need to strdup() the string to return it */
	return normalized == word_buffer ? g_strdup (word_buffer) : normalized;
}