/* Unit test for u8_wordbreaks.  Runs it on an empty input, then on a
   multi-script sample sentence in two spellings (precomposed and decomposed),
   and asserts that the flag written for every byte is 1 exactly at the listed
   offsets and 0 everywhere else.  Returns 0 when all assertions hold.  */
int
main ()
{
  /* Test case n = 0.  */
  u8_wordbreaks (NULL, 0, NULL);

  /* Precomposed spelling of the sample sentence.  */
  {
    static const uint8_t input[91] =
      /* "Grüß Gott. Здравствуйте! x=(-b±sqrt(b²-4ac))/(2a) 日本語,中文,한글" */
      "Gr\303\274\303\237 Gott. \320\227\320\264\321\200\320\260\320\262\321\201\321\202\320\262\321\203\320\271\321\202\320\265! x=(-b\302\261sqrt(b\302\262-4ac))/(2a) \346\227\245\346\234\254\350\252\236,\344\270\255\346\226\207,\355\225\234\352\270\200\n";
    char *breaks = (char *) malloc (SIZEOF (input));
    size_t pos;

    u8_wordbreaks (input, SIZEOF (input), breaks);

    for (pos = 0; pos < 91; pos++)
      {
        /* Flag value the test expects at this byte offset.  */
        int expected;

        if ((pos >= 6 && pos <= 7)
            || (pos >= 11 && pos <= 13)
            || (pos >= 37 && pos <= 44)
            || pos == 46
            || (pos >= 50 && pos <= 52)
            || (pos >= 54 && pos <= 55)
            || (pos >= 58 && pos <= 62)
            || (pos >= 64 && pos <= 67)
            || pos == 70
            || pos == 73
            || pos == 76
            || pos == 77
            || pos == 80
            || pos == 83
            || pos == 84
            || pos == 90)
          expected = 1;
        else
          expected = 0;

        ASSERT (breaks[pos] == expected);
      }

    free (breaks);
  }

  /* Same input string, decomposed.  */
  {
    static const uint8_t input[106] =
      /* "Grüß Gott. Здравствуйте! x=(-b±sqrt(b²-4ac))/(2a) 日本語,中文,한글" */
      "Gru\314\210\303\237 Gott. \320\227\320\264\321\200\320\260\320\262\321\201\321\202\320\262\321\203\320\270\314\206\321\202\320\265! x=(-b\302\261sqrt(b\302\262-4ac))/(2a) \346\227\245\346\234\254\350\252\236,\344\270\255\346\226\207,\341\204\222\341\205\241\341\206\253\341\204\200\341\205\263\341\206\257\n";
    char *breaks = (char *) malloc (SIZEOF (input));
    size_t pos;

    u8_wordbreaks (input, SIZEOF (input), breaks);

    for (pos = 0; pos < 106; pos++)
      {
        /* Flag value the test expects at this byte offset.  */
        int expected;

        if ((pos >= 7 && pos <= 8)
            || (pos >= 12 && pos <= 14)
            || (pos >= 40 && pos <= 47)
            || pos == 49
            || (pos >= 53 && pos <= 55)
            || (pos >= 57 && pos <= 58)
            || (pos >= 61 && pos <= 65)
            || (pos >= 67 && pos <= 70)
            || pos == 73
            || pos == 76
            || pos == 79
            || pos == 80
            || pos == 83
            || pos == 86
            || pos == 87
            || pos == 105)
          expected = 1;
        else
          expected = 0;

        ASSERT (breaks[pos] == expected);
      }

    free (breaks);
  }

  return 0;
}
/**
 * tracker_parser_reset:
 * @parser: parser to re-initialise; must not be %NULL
 * @txt: text to be parsed; must not be %NULL. Only the pointer is stored,
 *       the text itself is not copied here.
 * @txt_size: size of @txt in bytes, as handed to u8_wordbreaks()
 * @max_word_length: stored in the parser as-is
 * @enable_stemmer: stored in the parser as-is
 * @enable_unaccent: stored in the parser as-is
 * @ignore_stop_words: stored in the parser as-is
 * @ignore_reserved_words: stored in the parser as-is
 * @ignore_numbers: when FALSE, numbers are added to the set of general
 *                  categories allowed at the start of a word
 *
 * Points @parser at a new text: stores the options, drops the previous
 * word and word-break flags, rewinds the cursor and word position, and
 * computes one word-break flag per byte of @txt.
 *
 * Returns early (via g_return_if_fail) without touching @parser when
 * @parser or @txt is %NULL.
 */
void
tracker_parser_reset (TrackerParser *parser,
                      const gchar   *txt,
                      gint           txt_size,
                      guint          max_word_length,
                      gboolean       enable_stemmer,
                      gboolean       enable_unaccent,
                      gboolean       ignore_stop_words,
                      gboolean       ignore_reserved_words,
                      gboolean       ignore_numbers)
{
	g_return_if_fail (parser != NULL);
	g_return_if_fail (txt != NULL);

	/* Text being parsed. */
	parser->txt = txt;
	parser->txt_size = txt_size;

	/* Caller-supplied options. */
	parser->max_word_length = max_word_length;
	parser->enable_stemmer = enable_stemmer;
	parser->enable_unaccent = enable_unaccent;
	parser->ignore_stop_words = ignore_stop_words;
	parser->ignore_reserved_words = ignore_reserved_words;
	parser->ignore_numbers = ignore_numbers;

	/* Some unicode characters (the '.', for instance) are always
	 * forced to act as word breakers, mainly so that FTS searches
	 * can match on file extensions. */
	parser->enable_forced_wordbreaks = TRUE;

	/* Forget whatever word was produced from the previous text and
	 * start over from the beginning. */
	g_free (parser->word);
	parser->word = NULL;
	parser->cursor = 0;
	parser->word_position = 0;

	/* Replace the old flag array with a fresh one holding one flag
	 * per byte of the new text, then fill it for the whole string.
	 * NOTE(review): a negative txt_size is not handled specially
	 * here; presumably callers always pass the real byte length -
	 * confirm. */
	g_free (parser->word_break_flags);
	parser->word_break_flags = g_malloc (txt_size);
	u8_wordbreaks ((const uint8_t *) txt,
	               (size_t) txt_size,
	               (char *) parser->word_break_flags);

	/* Custom general category for word starts: letters only, or
	 * letters combined with numbers when numbers are not ignored. */
	if (parser->ignore_numbers) {
		parser->allowed_start = UC_LETTER;
	} else {
		parser->allowed_start = uc_general_category_or (UC_LETTER,
		                                                UC_NUMBER);
	}
}
/* Word-break display tool.

   When run without arguments, reads the text returned by read_file (stdin),
   asks u8_wordbreaks for one flag per byte, and copies the text to standard
   output with U+2027 (in UTF-8 encoding) inserted in front of every byte
   whose flag is 1.

   Returns 0 on success.  Returns 1 when any command-line argument is given,
   when read_file yields no buffer, or when the flag array cannot be
   allocated.  Aborts if u8_wordbreaks produces a flag other than 0 or 1.  */
int
main (int argc, char * argv[])
{
  if (argc == 1)
    {
      /* Display all the word breaks in the input string.  */
      char *input = read_file (stdin);
      size_t length;
      char *breaks;
      size_t i;

      /* Defensive: never hand a null pointer to strlen.  */
      if (input == NULL)
        return 1;

      /* Sizes and indices are size_t so that long inputs are not
         truncated.  */
      length = strlen (input);

      /* malloc (0) is allowed to return NULL, so a null result is only
         an error when there is something to store.  */
      breaks = malloc (length);
      if (breaks == NULL && length > 0)
        {
          fputs ("memory exhausted\n", stderr);
          return 1;
        }

      u8_wordbreaks ((const uint8_t *) input, length, breaks);

      for (i = 0; i < length; i++)
        {
          switch (breaks[i])
            {
            case 1:
              /* U+2027 in UTF-8 encoding */
              putc (0xe2, stdout);
              putc (0x80, stdout);
              putc (0xa7, stdout);
              break;
            case 0:
              break;
            default:
              abort ();
            }
          putc (input[i], stdout);
        }

      /* NOTE(review): input is not released here because the ownership
         of the buffer returned by read_file is not visible from this
         function - confirm before adding a free.  */
      free (breaks);

      return 0;
    }
  else
    return 1;
}
/* Determines the word break flags of the string S of N bytes, which is in
   the encoding reported by locale_charset (), and stores one flag per byte
   of S in P[0..N-1].

   - If the locale encoding is UTF-8, the work is delegated to u8_wordbreaks
     directly.
   - Otherwise S is converted to UTF-8 together with an offsets table, the
     word breaks are computed on the converted string, and each flag is
     copied back to the byte of S whose offsets entry points at it.  Bytes
     whose offsets entry is (size_t)(-1) keep the flag 0.
   - If that conversion is not possible (allocation failure, size overflow,
     or conversion failure), an all-ASCII string is still handled through
     u8_wordbreaks when C_CTYPE_ASCII is set; in every other case P is
     filled with zeros, i.e. no word breaks are reported.

   Does nothing when N is 0.  */
void
ulc_wordbreaks (const char *s, size_t n, char *p)
{
  if (n > 0)
    {
      const char *encoding = locale_charset ();

      if (is_utf8_encoding (encoding))
        u8_wordbreaks ((const uint8_t *) s, n, p);
      else
        {
          /* Convert the string to UTF-8 and build a translation table
             from offsets into s to offsets into the translated string.
             The byte count n * sizeof (size_t) must not wrap around:
             a wrapped value would yield a table shorter than n entries
             that is nevertheless indexed up to n - 1 below.  On overflow
             behave like an allocation failure.  */
          size_t *offsets =
            (n <= (size_t)(-1) / sizeof (size_t)
             ? (size_t *) malloc (n * sizeof (size_t))
             : NULL);

          if (offsets != NULL)
            {
              uint8_t *t;
              size_t m;

              t = u8_conv_from_encoding (encoding, iconveh_question_mark,
                                         s, n, offsets, NULL, &m);
              if (t != NULL)
                {
                  /* An empty conversion result needs no flag buffer.  */
                  char *q = (char *) (m > 0 ? malloc (m) : NULL);

                  if (m == 0 || q != NULL)
                    {
                      size_t i;

                      /* Determine the word breaks of the UTF-8 string.  */
                      u8_wordbreaks (t, m, q);

                      /* Translate the result back to the original string.
                         Start from "no break" everywhere so that bytes
                         without a translated offset stay 0.  */
                      memset (p, 0, n);
                      for (i = 0; i < n; i++)
                        if (offsets[i] != (size_t)(-1))
                          p[i] = q[offsets[i]];

                      free (q);
                      free (t);
                      free (offsets);
                      return;
                    }
                  free (t);
                }
              free (offsets);
            }

          /* Impossible to convert.  */

#if C_CTYPE_ASCII
          if (is_all_ascii (s, n))
            {
              /* ASCII is a subset of UTF-8.  */
              u8_wordbreaks ((const uint8_t *) s, n, p);
              return;
            }
#endif

          /* We have a non-ASCII string and cannot convert it.
             Don't produce any word breaks.  */
          memset (p, 0, n);
        }
    }
}