/* Exercise u8_casefold() on INPUT under its three result-buffer conventions
   (no buffer, buffer too small, buffer large enough) and compare each
   outcome against EXPECTED / EXPECTED_LENGTH.

   Returns 0 when every check passes, otherwise a code in 1..11 that
   identifies the first check that failed.  All buffers obtained during a
   case are released before returning, on success and on failure.  */
static int
check (const uint8_t *input, size_t input_length,
       const char *iso639_language, uninorm_t nf,
       const uint8_t *expected, size_t expected_length)
{
  size_t length;
  uint8_t *result;
  int status = 0;

  /* Test return conventions with resultbuf == NULL.  */
  result = u8_casefold (input, input_length, iso639_language, nf,
                        NULL, &length);
  if (result == NULL)
    return 1;
  if (length != expected_length)
    status = 2;
  else if (u8_cmp (result, expected, expected_length) != 0)
    status = 3;
  free (result);
  if (status != 0)
    return status;

  /* Test return conventions with resultbuf too small.  */
  if (expected_length > 0)
    {
      uint8_t *preallocated;

      length = expected_length - 1;
      preallocated = (uint8_t *) malloc (length * sizeof (uint8_t));
      result = u8_casefold (input, input_length, iso639_language, nf,
                            preallocated, &length);
      if (result == NULL)
        status = 4;
      else if (result == preallocated)
        status = 5;
      else if (length != expected_length)
        status = 6;
      else if (u8_cmp (result, expected, expected_length) != 0)
        status = 7;

      /* Only free RESULT separately when it is a distinct allocation,
         otherwise the buffer would be freed twice.  */
      if (result != preallocated)
        free (result);
      free (preallocated);
      if (status != 0)
        return status;
    }

  /* Test return conventions with resultbuf large enough.  */
  {
    uint8_t *preallocated;

    length = expected_length;
    preallocated = (uint8_t *) malloc (length * sizeof (uint8_t));
    result = u8_casefold (input, input_length, iso639_language, nf,
                          preallocated, &length);
    if (result == NULL)
      status = 8;
    else if (preallocated != NULL && result != preallocated)
      status = 9;
    else if (length != expected_length)
      status = 10;
    else if (u8_cmp (result, expected, expected_length) != 0)
      status = 11;

    /* malloc may have returned NULL (e.g. for a zero-byte request); in that
       case RESULT is a separate allocation and must be released too.  */
    if (result != preallocated)
      free (result);
    free (preallocated);
  }

  return status;
}
/* With no command-line arguments: read all of standard input, case-fold it
   with u8_casefold() using the language reported by uc_locale_language()
   and no normalization form, and write the result to standard output.

   Returns 0 on success.  Returns 1 when arguments are given, when the input
   cannot be read, when case folding fails, or when the output cannot be
   written completely.  */
int
main (int argc, char * argv[])
{
  setlocale (LC_ALL, "");
  if (argc == 1)
    {
      /* Display the case folded input string.  */
      char *input = read_file (stdin);
      size_t length;
      size_t output_length;
      uint8_t *output;
      int status;

      /* NOTE(review): read_file is defined elsewhere; this assumes it
         reports failure by returning NULL - confirm.  */
      if (input == NULL)
        return 1;

      /* size_t avoids truncating the length of large inputs.  */
      length = strlen (input);

      output = u8_casefold ((uint8_t *) input, length, uc_locale_language (),
                            NULL, NULL, &output_length);
      if (output == NULL)
        /* Do not hand a NULL pointer and an unset length to fwrite.  */
        return 1;

      status = (fwrite (output, 1, output_length, stdout) == output_length
                ? 0 : 1);
      free (output);
      return status;
    }
  else
    return 1;
}
/* Case-fold (and normalize according to NF) the N-byte string S, which is
   given in the encoding named by locale_charset().

   The input is first converted to UTF-8 with u8_conv_from_encoding(), using
   a 2048-byte on-stack buffer when the conversion result fits, and then
   passed to u8_casefold() together with RESULTBUF and LENGTHP.

   Returns whatever u8_casefold() returns, or NULL if the conversion or the
   case folding failed.  When case folding fails, errno is preserved across
   the release of the temporary conversion buffer.  */
static uint8_t *
ulc_u8_casefold (const char *s, size_t n,
                 const char *iso639_language, uninorm_t nf,
                 uint8_t *resultbuf, size_t *lengthp)
{
  uint8_t stack_utf8[2048 / sizeof (uint8_t)];
  size_t utf8_length = sizeof (stack_utf8) / sizeof (uint8_t);
  uint8_t *utf8;
  uint8_t *folded;

  /* Step 1: locale encoding -> UTF-8.  */
  utf8 = u8_conv_from_encoding (locale_charset (), iconveh_error,
                                s, n, NULL,
                                stack_utf8, &utf8_length);
  if (utf8 == NULL)
    /* errno is set here.  */
    return NULL;

  /* Step 2: case folding and normalization.  */
  folded = u8_casefold (utf8, utf8_length, iso639_language, nf,
                        resultbuf, lengthp);

  /* Step 3: drop the intermediate UTF-8 copy if it lives on the heap.  */
  if (utf8 != stack_utf8)
    {
      if (folded != NULL)
        free (utf8);
      else
        {
          /* Keep the errno from the failed u8_casefold call.  */
          int saved_errno = errno;
          free (utf8);
          errno = saved_errno;
        }
    }

  return folded;
}
static gchar * process_word_utf8 (TrackerParser *parser, const gchar *word, gint length, TrackerParserWordType type, gboolean *stop_word) { gchar word_buffer [WORD_BUFFER_LENGTH]; gchar *normalized = NULL; gchar *stemmed = NULL; size_t new_word_length; g_return_val_if_fail (parser != NULL, NULL); g_return_val_if_fail (word != NULL, NULL); /* If length is set as -1, the input word MUST be NIL-terminated. * Otherwise, this restriction is not needed as the length to process * is given as input argument */ if (length < 0) { length = strlen (word); } /* Log original word */ tracker_parser_message_hex ("ORIGINAL word", word, length); /* Normalization and case-folding ONLY for non-ASCII */ if (type != TRACKER_PARSER_WORD_TYPE_ASCII) { /* Leave space for last NIL */ new_word_length = WORD_BUFFER_LENGTH - 1; /* Casefold and NFKD normalization in output. * NOTE: if the output buffer is not big enough, u8_casefold will * return a newly-allocated buffer. */ normalized = u8_casefold ((const uint8_t *)word, length, uc_locale_language (), UNINORM_NFKD, word_buffer, &new_word_length); /* Case folding + Normalization failed, ignore this word */ g_return_val_if_fail (normalized != NULL, NULL); /* If output buffer is not the same as the one passed to * u8_casefold, we know it was newly-allocated, so need * to resize it in 1 byte to add last NIL */ if (normalized != word_buffer) { normalized = g_realloc (normalized, new_word_length + 1); } /* Log after Normalization */ tracker_parser_message_hex (" After Casefolding and NFKD normalization", normalized, new_word_length); } else { /* For ASCII-only, just tolower() each character */ gsize i; normalized = length > WORD_BUFFER_LENGTH ? 
g_malloc (length + 1) : word_buffer; for (i = 0; i < length; i++) { normalized[i] = g_ascii_tolower (word[i]); } new_word_length = length; /* Log after tolower */ tracker_parser_message_hex (" After Lowercasing", normalized, new_word_length); } /* Set output NIL */ normalized[new_word_length] = '\0'; /* UNAC stripping needed? (for non-CJK and non-ASCII) */ if (parser->enable_unaccent && type == TRACKER_PARSER_WORD_TYPE_OTHER_UNAC && tracker_parser_unaccent_nfkd_string (normalized, &new_word_length)) { /* Log after UNAC stripping */ tracker_parser_message_hex (" After UNAC stripping", normalized, new_word_length); } /* Check if stop word */ if (parser->ignore_stop_words) { *stop_word = tracker_language_is_stop_word (parser->language, normalized); } /* Stemming needed? */ if (parser->enable_stemmer) { stemmed = tracker_language_stem_word (parser->language, normalized, new_word_length); /* Log after stemming */ tracker_parser_message_hex (" After stemming", stemmed, strlen (stemmed)); } /* If stemmed wanted and succeeded, free previous and return it */ if (stemmed) { if (normalized != word_buffer) { g_free (normalized); } return stemmed; } /* It may be the case that no stripping and no stemming was needed, and * that the output buffer in stack was enough for case-folding and * normalization. In this case, need to strdup() the string to return it */ return normalized == word_buffer ? g_strdup (word_buffer) : normalized; }