/* Strip combining diacritical marks from a word, in place.
 *
 * The word MUST already be normalized in NFKD form and encoded as
 * UTF-8; on entry *str_length holds its length in bytes. Characters
 * are decoded with u8_strmbtouc(), which stops at a NUL byte, so the
 * buffer is expected to be NUL-terminated as well.
 *
 * On success the buffer holds the compacted word followed by a NUL
 * byte, *str_length is updated to the new byte length and TRUE is
 * returned. The argument checks bail out with FALSE when str or
 * str_length is NULL, or when *str_length is zero.
 */
gboolean
tracker_parser_unaccent_nfkd_string (gpointer  str,
                                     gsize    *str_length)
{
	gchar *buffer;
	gsize  input_len;
	gsize  read_pos = 0;
	gsize  write_pos = 0;

	g_return_val_if_fail (str != NULL, FALSE);
	g_return_val_if_fail (str_length != NULL, FALSE);
	g_return_val_if_fail (*str_length > 0, FALSE);

	buffer = (gchar *)str;
	input_len = *str_length;

	while (read_pos < input_len) {
		ucs4_t code_point;
		gint   n_bytes;

		/* Decode the character starting at the read cursor */
		n_bytes = u8_strmbtouc (&code_point, &buffer[read_pos]);

		/* Zero means the terminating NUL was reached, negative
		 * means invalid UTF-8: stop compacting in both cases */
		if (n_bytes <= 0) {
			break;
		}

		/* Only characters which are not combining diacritical
		 * marks are kept in the output */
		if (!IS_CDM_UCS4 ((guint32) code_point)) {
			/* Once at least one mark has been dropped the two
			 * cursors diverge and the bytes must be shifted
			 * down. Source and destination live in the same
			 * buffer and may overlap, hence memmove */
			if (read_pos != write_pos) {
				memmove (&buffer[write_pos], &buffer[read_pos], n_bytes);
			}

			write_pos += n_bytes;
		}

		/* The read cursor always moves past the decoded character */
		read_pos += n_bytes;
	}

	/* Terminate the compacted word and report its new length */
	buffer[write_pos] = '\0';
	*str_length = write_pos;

	return TRUE;
}
/* Decode the first character of the NUL-terminated UTF-8 string S
   into *PUC and step over it.

   Returns a pointer to the byte following that character when
   u8_strmbtouc reports a positive length.  Returns NULL otherwise:
   either the end of the string was reached (length 0), or the
   sequence was rejected (negative length), in which case *PUC is set
   to U+FFFD, the replacement character.  */
const uint8_t *
u8_next (ucs4_t *puc, const uint8_t *s)
{
  int length = u8_strmbtouc (puc, s);

  if (length > 0)
    return s + length;

  /* Nothing to step over; flag a decoding error for the caller.  */
  if (length < 0)
    *puc = 0xfffd;

  return NULL;
}
static gboolean get_word_info (TrackerParser *parser, gsize *p_word_length, gboolean *p_is_allowed_word_start, TrackerParserWordType *p_word_type) { ucs4_t first_unichar; gint first_unichar_len; gboolean ascii_only; /* Defaults */ *p_is_allowed_word_start = TRUE; /* Get first character of the word as UCS4 */ first_unichar_len = u8_strmbtouc (&first_unichar, &(parser->txt[parser->cursor])); if (first_unichar_len <= 0) { /* This should only happen if NIL was passed to u8_strmbtouc, * so better just force stop here */ return FALSE; } else { /* If first character has length 1, it's ASCII-7 */ ascii_only = first_unichar_len == 1 ? TRUE : FALSE; } /* Consider word starts with a forced wordbreak */ if (parser->enable_forced_wordbreaks && IS_FORCED_WORDBREAK_UCS4 ((guint32)first_unichar)) { *p_word_length = first_unichar_len; } else { gsize i; /* Find next word break, and in the same loop checking if only ASCII * characters */ i = parser->cursor + first_unichar_len; while (1) { /* Text bounds reached? */ if (i >= parser->txt_size) break; /* Proper unicode word break detected? */ if (parser->word_break_flags[i]) break; /* Forced word break detected? */ if (parser->enable_forced_wordbreaks && IS_FORCED_WORDBREAK_UCS4 ((guint32)parser->txt[i])) break; if (ascii_only && !IS_ASCII_UCS4 ((guint32)parser->txt[i])) { ascii_only = FALSE; } i++; } /* Word end is the first byte after the word, which is either the * start of next word or the end of the string */ *p_word_length = i - parser->cursor; } /* We only want the words where the first character * in the word is either a letter, a number or a symbol. * This is needed because the word break algorithm also * considers word breaks after for example commas or other * punctuation marks. * Note that looking at the first character in the string * should be compatible with all Unicode normalization * methods. 
*/ if (!IS_UNDERSCORE_UCS4 ((guint32)first_unichar) && !uc_is_general_category (first_unichar, parser->allowed_start)) { *p_is_allowed_word_start = FALSE; return TRUE; } /* Decide word type */ if (ascii_only) { *p_word_type = TRACKER_PARSER_WORD_TYPE_ASCII; } else if (IS_CJK_UCS4 (first_unichar)) { *p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC; } else { *p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_UNAC; } return TRUE; }
/* Test driver for u8_strmbtouc.

   Checks that the function returns 0 and stores 0 for an empty
   string, decodes 1- to 4-byte characters with the right length and
   value, and returns -1 without touching the output for incomplete
   or invalid sequences.  The output variable is preloaded with the
   sentinel 0xBADFACE before every call so that a missing or an
   unexpected store is detected.  */
int
main ()
{
  /* Well-formed multi-byte characters: NUL-terminated encoding,
     expected return value and expected code point.  */
  static const struct
  {
    uint8_t bytes[5];
    int length;
    ucs4_t value;
  }
  decodable[] =
    {
      { { 0xC3, 0x97, 0 }, 2, 0x00D7 },
      { { 0xE2, 0x82, 0xAC, 0 }, 3, 0x20AC },
      { { 0xF4, 0x8F, 0xBF, 0xBD, 0 }, 4, 0x10FFFD }
    };

  /* Incomplete or invalid sequences, NUL-terminated.  */
  static const uint8_t rejected[][4] =
    {
      /* 1-byte input.  */
      { 0xC1, 0 },
      { 0xC3, 0 },
      { 0xE2, 0 },
      { 0xF4, 0 },
      { 0xFE, 0 },
      /* 2-byte input.  */
      { 0xE0, 0x9F, 0 },
      { 0xE2, 0x82, 0 },
      { 0xE2, 0xD0, 0 },
      { 0xF0, 0x8F, 0 },
      { 0xF3, 0x8F, 0 },
      { 0xF3, 0xD0, 0 },
      /* 3-byte input.  */
      { 0xF3, 0x8F, 0xBF, 0 },
      { 0xF3, 0xD0, 0xBF, 0 },
      { 0xF3, 0x8F, 0xD0, 0 }
    };

  ucs4_t uc;
  int ret;
  unsigned int k;

  /* Test NUL unit input.  */
  {
    static const uint8_t empty[] = "";

    uc = 0xBADFACE;
    ret = u8_strmbtouc (&uc, empty);
    ASSERT (ret == 0);
    ASSERT (uc == 0);
  }

  /* Test ISO 646 unit input.  */
  {
    ucs4_t c;
    uint8_t buf[2];

    for (c = 1; c < 0x80; c++)
      {
        buf[0] = c;
        buf[1] = 0;
        uc = 0xBADFACE;
        ret = u8_strmbtouc (&uc, buf);
        ASSERT (ret == 1);
        ASSERT (uc == c);
      }
  }

  /* Test 2-byte, 3-byte and 4-byte character input.  */
  for (k = 0; k < sizeof (decodable) / sizeof (decodable[0]); k++)
    {
      uc = 0xBADFACE;
      ret = u8_strmbtouc (&uc, decodable[k].bytes);
      ASSERT (ret == decodable[k].length);
      ASSERT (uc == decodable[k].value);
    }

  /* Test incomplete/invalid input: the call must fail and leave the
     sentinel in place.  */
  for (k = 0; k < sizeof (rejected) / sizeof (rejected[0]); k++)
    {
      uc = 0xBADFACE;
      ret = u8_strmbtouc (&uc, rejected[k]);
      ASSERT (ret == -1);
      ASSERT (uc == 0xBADFACE);
    }

  return 0;
}