Ejemplo n.º 1
0
static int
check (const uint8_t *input, size_t input_length,
       const char *iso639_language, uninorm_t nf,
       const uint8_t *expected, size_t expected_length)
{
  size_t length;
  uint8_t *result;

  /* Test return conventions with resultbuf == NULL.  */
  result = u8_casefold (input, input_length, iso639_language, nf, NULL, &length);
  if (!(result != NULL))
    return 1;
  if (!(length == expected_length))
    return 2;
  if (!(u8_cmp (result, expected, expected_length) == 0))
    return 3;
  free (result);

  /* Test return conventions with resultbuf too small.  */
  if (expected_length > 0)
    {
      uint8_t *preallocated;

      length = expected_length - 1;
      preallocated = (uint8_t *) malloc (length * sizeof (uint8_t));
      result = u8_casefold (input, input_length, iso639_language, nf, preallocated, &length);
      if (!(result != NULL))
        return 4;
      if (!(result != preallocated))
        return 5;
      if (!(length == expected_length))
        return 6;
      if (!(u8_cmp (result, expected, expected_length) == 0))
        return 7;
      free (result);
      free (preallocated);
    }

  /* Test return conventions with resultbuf large enough.  */
  {
    uint8_t *preallocated;

    length = expected_length;
    preallocated = (uint8_t *) malloc (length * sizeof (uint8_t));
    result = u8_casefold (input, input_length, iso639_language, nf, preallocated, &length);
    if (!(result != NULL))
      return 8;
    if (!(preallocated == NULL || result == preallocated))
      return 9;
    if (!(length == expected_length))
      return 10;
    if (!(u8_cmp (result, expected, expected_length) == 0))
      return 11;
    free (preallocated);
  }

  return 0;
}
Ejemplo n.º 2
0
int
main (int argc, char * argv[])
{
  setlocale (LC_ALL, "");
  if (argc == 1)
    {
      /* Display the case folded input string.  */
      char *input = read_file (stdin);
      int length = strlen (input);
      size_t output_length;
      uint8_t *output =
        u8_casefold ((uint8_t *) input, length, uc_locale_language (),
                    NULL,
                    NULL, &output_length);

      fwrite (output, 1, output_length, stdout);

      return 0;
    }
  else
    return 1;
}
Ejemplo n.º 3
0
static uint8_t *
ulc_u8_casefold (const char *s, size_t n, const char *iso639_language,
                 uninorm_t nf,
                 uint8_t *resultbuf, size_t *lengthp)
{
  uint8_t convbuf[2048 / sizeof (uint8_t)];
  uint8_t *conv;
  size_t conv_length;
  uint8_t *result;

  /* Convert the string to UTF-8.  */
  conv_length = sizeof (convbuf) / sizeof (uint8_t);
  conv =
    u8_conv_from_encoding (locale_charset (), iconveh_error, s, n, NULL,
                           convbuf, &conv_length);
  if (conv == NULL)
    /* errno is set here.  */
    return NULL;

  /* Case-fold and normalize.  */
  result = u8_casefold (conv, conv_length, iso639_language, nf,
                        resultbuf, lengthp);
  if (result == NULL)
    {
      if (conv != convbuf)
        {
          int saved_errno = errno;
          free (conv);
          errno = saved_errno;
        }
      return NULL;
    }

  if (conv != convbuf)
    free (conv);
  return result;
}
Ejemplo n.º 4
0
static gchar *
process_word_utf8 (TrackerParser         *parser,
                   const gchar           *word,
                   gint                   length,
                   TrackerParserWordType  type,
                   gboolean              *stop_word)
{
	gchar word_buffer [WORD_BUFFER_LENGTH];
	gchar *normalized = NULL;
	gchar *stemmed = NULL;
	size_t new_word_length;

	g_return_val_if_fail (parser != NULL, NULL);
	g_return_val_if_fail (word != NULL, NULL);

	/* If length is set as -1, the input word MUST be NIL-terminated.
	 * Otherwise, this restriction is not needed as the length to process
	 * is given as input argument */
	if (length < 0) {
		length = strlen (word);
	}

	/* Log original word */
	tracker_parser_message_hex ("ORIGINAL word",
	                            word, length);

	/* Normalization and case-folding ONLY for non-ASCII */
	if (type != TRACKER_PARSER_WORD_TYPE_ASCII) {
		/* Leave space for last NIL */
		new_word_length = WORD_BUFFER_LENGTH - 1;

		/* Casefold and NFKD normalization in output.
		 * NOTE: if the output buffer is not big enough, u8_casefold will
		 * return a newly-allocated buffer. */
		normalized = u8_casefold ((const uint8_t *)word,
		                          length,
		                          uc_locale_language (),
		                          UNINORM_NFKD,
		                          word_buffer,
		                          &new_word_length);

		/* Case folding + Normalization failed, ignore this word */
		g_return_val_if_fail (normalized != NULL, NULL);

		/* If output buffer is not the same as the one passed to
		 * u8_casefold, we know it was newly-allocated, so need
		 * to resize it in 1 byte to add last NIL */
		if (normalized != word_buffer) {
			normalized = g_realloc (normalized, new_word_length + 1);
		}

		/* Log after Normalization */
		tracker_parser_message_hex (" After Casefolding and NFKD normalization",
		                            normalized, new_word_length);
	} else {
		/* For ASCII-only, just tolower() each character */
		gsize i;

		normalized = length > WORD_BUFFER_LENGTH ? g_malloc (length + 1) : word_buffer;

		for (i = 0; i < length; i++) {
			normalized[i] = g_ascii_tolower (word[i]);
		}

		new_word_length = length;

		/* Log after tolower */
		tracker_parser_message_hex (" After Lowercasing",
		                            normalized, new_word_length);
	}

	/* Set output NIL */
	normalized[new_word_length] = '\0';

	/* UNAC stripping needed? (for non-CJK and non-ASCII) */
	if (parser->enable_unaccent &&
	    type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC &&
	    tracker_parser_unaccent_nfkd_string (normalized, &new_word_length)) {
		/* Log after UNAC stripping */
		tracker_parser_message_hex ("  After UNAC stripping",
		                            normalized, new_word_length);
	}

	/* Check if stop word */
	if (parser->ignore_stop_words) {
		*stop_word = tracker_language_is_stop_word (parser->language,
		                                            normalized);
	}

	/* Stemming needed? */
	if (parser->enable_stemmer) {
		stemmed = tracker_language_stem_word (parser->language,
		                                      normalized,
		                                      new_word_length);

		/* Log after stemming */
		tracker_parser_message_hex ("   After stemming",
		                            stemmed, strlen (stemmed));
	}

	/* If stemmed wanted and succeeded, free previous and return it */
	if (stemmed) {
		if (normalized != word_buffer) {
			g_free (normalized);
		}
		return stemmed;
	}

	/* It may be the case that no stripping and no stemming was needed, and
	 * that the output buffer in stack was enough for case-folding and
	 * normalization. In this case, need to strdup() the string to return it */
	return normalized == word_buffer ? g_strdup (word_buffer) : normalized;
}