/*
 * Walks the UTF-8 text in buf[0 .. bufsize) and reports every word to
 * `callback` as a (start pointer, byte length) span inside `buf`.
 *
 * A word is either
 *   - a run of consecutive characters whose unichar_getprops() value has
 *     UCP_ALPHA set, or
 *   - a single character whose properties have UCP_IDEO set.
 *
 * Alphabetic runs longer than WORD_MAX_CHARS characters are skipped.
 * If `check` is non-NULL it is called with the decoded characters of the
 * word and the word is reported only when it returns non-zero.
 * `callback` is called unconditionally for accepted words, so it must not
 * be NULL.
 *
 * Returns 0 after the whole buffer has been scanned, or the decoder's
 * UNICHAR_NO_DATA / UNICHAR_BAD_ENCODING code as soon as one is seen.
 */
int
elh_iterate_words__xany__UTF8 (const char *buf, size_t bufsize, lh_word_check_t *check, lh_word_callback_t *callback, void *userdata)
{
    unichar check_buf[WORD_MAX_CHARS];
    const char *curr = buf;
    const char *buf_end = buf + bufsize;

    while (curr < buf_end) {
        const char *word_begin = curr;
        unichar uchr = eh_decode_char__UTF8 (&curr, buf_end);
        int prop = unichar_getprops (uchr);

        if (prop & UCP_ALPHA) {
            const char *word_end;
            size_t word_length = 1;

            check_buf[0] = uchr;
            for (;;) {
                /* word_end always marks the start of the character that is
                   about to be decoded, i.e. one past the last accepted one. */
                word_end = curr;
                uchr = eh_decode_char__UTF8 (&curr, buf_end);
                if (uchr < 0) {
                    if ((UNICHAR_NO_DATA == uchr) || (UNICHAR_BAD_ENCODING == uchr)) {
                        return uchr;
                    }
                    if (UNICHAR_EOD == uchr) {
                        /* End of data is not a character: drop the stale
                           properties of the previous letter so that they are
                           not mistaken for a trailing ideograph below. */
                        prop = 0;
                        break;
                    }
                }
                prop = unichar_getprops (uchr);
                if (!(prop & UCP_ALPHA)) {
                    break;
                }
                /* Keep counting past the buffer capacity so that overlong
                   words can be detected, but never write out of bounds. */
                if (WORD_MAX_CHARS > word_length) {
                    check_buf[word_length] = uchr;
                }
                word_length++;
            }

            if ((word_length <= WORD_MAX_CHARS)
                && ((NULL == check) || (0 != check (check_buf, word_length)))) {
                callback ((utf8char *) word_begin, word_end - word_begin, userdata);
            }

            if (!(prop & UCP_IDEO)) {
                continue;
            }
            /* The character that terminated the word is an ideograph.  It
               starts where the word ended, so move the span start there;
               otherwise the ideograph would be reported together with the
               whole preceding word. */
            word_begin = word_end;
        } else if (!(prop & UCP_IDEO)) {
            /* Neither a letter nor an ideograph: only decoder errors matter. */
            if ((uchr < 0) && ((UNICHAR_NO_DATA == uchr) || (UNICHAR_BAD_ENCODING == uchr))) {
                return uchr;
            }
            continue;
        }

        /* Single ideograph: uchr holds it and [word_begin, curr) are its bytes. */
        check_buf[0] = uchr;
        if ((NULL == check) || (0 != check (check_buf, 1))) {
            callback ((utf8char *) word_begin, curr - word_begin, userdata);
        }
    }
    return 0;
}
/*
 * Walks the UTF-8 text in buf[0 .. bufsize) and reports every word to
 * `callback`, optionally rewriting ("patching") each word first.
 *
 * A word is either a run of consecutive characters whose
 * unichar_getprops() value has UCP_ALPHA set, or a single character whose
 * properties have UCP_IDEO set.  Alphabetic runs longer than
 * WORD_MAX_CHARS characters are skipped.
 *
 * For every candidate word:
 *   - if `check` is non-NULL and returns 0 for the decoded characters,
 *     the word is skipped;
 *   - if `patch` is NULL, the original bytes inside `buf` are reported;
 *   - otherwise `patch` is called; a 0 result skips the word, any other
 *     result means patch_buf / arg_length describe the replacement, which
 *     is re-encoded to UTF-8 in a local buffer and reported from there.
 *
 * `callback` is called unconditionally for accepted words, so it must not
 * be NULL.
 *
 * Returns 0 after the whole buffer has been scanned, or the decoder's
 * UNICHAR_NO_DATA / UNICHAR_BAD_ENCODING code as soon as one is seen.
 * Every exit goes through `cleanup` so the fallback buffer is released.
 */
int
elh_iterate_patched_words__xany__UTF8 (const char *buf, size_t bufsize, lh_word_check_t *check, lh_word_patch_t *patch, lh_word_callback_t *callback, void *userdata)
{
    unichar check_buf[WORD_MAX_CHARS];
    unichar patch_buf[WORD_MAX_CHARS];
    char word_buf[BUFSIZEOF__UTF8_WORD];
    char *hugeword_buf = NULL;
    size_t hugeword_buf_size = 0;
    size_t hugeword_needed;
    const char *curr = buf;
    const char *buf_end = buf + bufsize;
    const char *word_begin = curr;
    const char *word_end = NULL;      /* end of the word inside buf */
    const char *encoded_end = NULL;   /* end of the re-encoded patched word */
    unichar uchr = 0;
    size_t word_length;
    size_t arg_length;
    int prop;

    while (curr < buf_end) {
        word_begin = curr;
        uchr = eh_decode_char__UTF8 (&curr, buf_end);
        prop = unichar_getprops (uchr);

        if (prop & UCP_ALPHA) {
            check_buf[0] = uchr;
            word_length = 1;
            for (;;) {
                word_end = curr;
                uchr = eh_decode_char__UTF8 (&curr, buf_end);
                if (uchr < 0) {
                    if ((UNICHAR_NO_DATA == uchr) || (UNICHAR_BAD_ENCODING == uchr)) {
                        /* Must not return directly: hugeword_buf may have
                           been allocated for an earlier word. */
                        goto cleanup;
                    }
                    if (UNICHAR_EOD == uchr) {
                        /* End of data is not a character: drop the stale
                           properties of the previous letter so that they
                           are not mistaken for a trailing ideograph. */
                        prop = 0;
                        break;
                    }
                }
                prop = unichar_getprops (uchr);
                if (!(prop & UCP_ALPHA)) {
                    break;
                }
                /* Keep counting past the buffer capacity so that overlong
                   words can be detected, but never write out of bounds. */
                if (WORD_MAX_CHARS > word_length) {
                    check_buf[word_length] = uchr;
                }
                word_length++;
            }

            if (WORD_MAX_CHARS < word_length) {
                goto done_word;
            }
            if ((NULL != check) && (0 == check (check_buf, word_length))) {
                goto done_word;
            }
            if (NULL == patch) {
                callback ((utf8char *) word_begin, word_end - word_begin, userdata);
                goto done_word;
            }
            if (0 == patch (check_buf, word_length, patch_buf, &arg_length)) {
                goto done_word;
            }

            /* First try the fixed-size buffer. */
            encoded_end = eh_encode_buffer__UTF8 (patch_buf, patch_buf + arg_length, word_buf, word_buf + BUFSIZEOF__UTF8_WORD);
            if (NULL != encoded_end) {
                callback ((utf8char *) word_buf, encoded_end - word_buf, userdata);
                goto done_word;
            }

            /* Fallback: a heap buffer that is reused across words.  It is
               the patched text (arg_length characters) that gets encoded,
               so size the buffer from it; word_length is kept as a lower
               bound so the buffer is never smaller than it used to be. */
            hugeword_needed = ((arg_length > word_length) ? arg_length : word_length) * MAX_UTF8_CHAR;
            if (hugeword_buf_size < hugeword_needed) {
                if (hugeword_buf_size) {
                    dk_free (hugeword_buf, hugeword_buf_size);
                }
                hugeword_buf_size = hugeword_needed;
                hugeword_buf = (char *) dk_alloc (hugeword_buf_size);
            }
            encoded_end = eh_encode_buffer__UTF8 (patch_buf, patch_buf + arg_length, hugeword_buf, hugeword_buf + hugeword_buf_size);
            /* A NULL result was already treated as "could not encode" for
               the first attempt; do not do pointer arithmetic on it. */
            if (NULL != encoded_end) {
                callback ((utf8char *) hugeword_buf, encoded_end - hugeword_buf, userdata);
            }

done_word:
            if (!(prop & UCP_IDEO)) {
                continue;
            }
            /* The character that terminated the word is an ideograph.  It
               starts where the word ended in buf, so move the span start
               there; otherwise the unpatched ideograph would be reported
               together with the whole preceding word. */
            word_begin = word_end;
            goto proc_ideo;
        }

        if (prop & UCP_IDEO) {
proc_ideo:
            /* Single ideograph: uchr holds it, [word_begin, curr) are its bytes. */
            check_buf[0] = uchr;
            if ((NULL != check) && (0 == check (check_buf, 1))) {
                continue;
            }
            if (NULL == patch) {
                callback ((utf8char *) word_begin, curr - word_begin, userdata);
                continue;
            }
            if (0 == patch (check_buf, 1, patch_buf, &arg_length)) {
                continue;
            }
            encoded_end = eh_encode_buffer__UTF8 (patch_buf, patch_buf + arg_length, word_buf, word_buf + BUFSIZEOF__UTF8_WORD);
            /* Skip the word rather than compute a length from a NULL end
               pointer if the replacement did not fit. */
            if (NULL != encoded_end) {
                callback ((utf8char *) word_buf, encoded_end - word_buf, userdata);
            }
            continue;
        }

        if ((uchr < 0) && ((UNICHAR_NO_DATA == uchr) || (UNICHAR_BAD_ENCODING == uchr))) {
            goto cleanup;   /* uchr carries the error code to the return */
        }
    }
    uchr = 0;

cleanup:
    if (hugeword_buf_size) {
        dk_free (hugeword_buf, hugeword_buf_size);
    }
    return uchr;
}
/*
 * Counts the words in the UTF-8 text buf[0 .. bufsize).
 *
 * A word starts at a character whose unichar_getprops() value has
 * UCP_ALPHA set and continues while the following characters are either
 * UCP_ALPHA or satisfy IS_CONNECTIVE.  A character with UCP_IDEO set is a
 * word of its own.  Runs longer than WORD_MAX_CHARS characters are not
 * counted.  If `check` is non-NULL, a word is counted only when `check`
 * returns non-zero for its decoded characters.
 *
 * Returns the number of counted words, or the decoder's UNICHAR_NO_DATA /
 * UNICHAR_BAD_ENCODING code as soon as one is seen.
 *
 * NOTE(review): IS_CONNECTIVE is an argument-less macro defined outside
 * this block, so it presumably reads locals by name.  The local names,
 * and the otherwise unused word_begin / word_end bookkeeping, are kept
 * exactly as they were for that reason -- confirm before removing any.
 */
int
elh_count_words__xViAny__UTF8 (const char *buf, size_t bufsize, lh_word_check_t *check)
{
    unichar check_buf[WORD_MAX_CHARS];
    int res = 0;
    int prop;
    const char *curr = buf;
    const char *buf_end = buf + bufsize;
    const char *word_begin = curr;
    const char *word_end = NULL;
    unichar uchr;
    size_t word_length;

    while (curr < buf_end) {
        word_begin = curr;
        uchr = eh_decode_char__UTF8 (&curr, buf_end);
        prop = unichar_getprops (uchr);

        if (prop & UCP_ALPHA) {
            check_buf[0] = uchr;
            word_length = 1;
            for (;;) {
                word_end = curr;
                uchr = eh_decode_char__UTF8 (&curr, buf_end);
                if (uchr < 0) {
                    if ((UNICHAR_NO_DATA == uchr) || (UNICHAR_BAD_ENCODING == uchr)) {
                        return uchr;
                    }
                    if (UNICHAR_EOD == uchr) {
                        break;
                    }
                }
                prop = unichar_getprops (uchr);
                /* The word ends at the first character that is neither a
                   letter nor a connective. */
                if (!(prop & UCP_ALPHA) && !IS_CONNECTIVE) {
                    break;
                }
                /* Count every character, but store only what fits. */
                if (WORD_MAX_CHARS > word_length) {
                    check_buf[word_length] = uchr;
                }
                word_length++;
            }

            /* Overlong words are dropped without consulting `check`. */
            if ((word_length <= WORD_MAX_CHARS)
                && ((NULL == check) || (0 != check (check_buf, word_length)))) {
                res++;
            }

            /* `prop` is whatever the scan left behind; when it carries
               UCP_IDEO, fall through to the single-ideograph handling. */
            if (!(prop & UCP_IDEO)) {
                continue;
            }
        } else if (!(prop & UCP_IDEO)) {
            /* Neither a letter nor an ideograph: only decoder errors matter. */
            if ((uchr < 0) && ((UNICHAR_NO_DATA == uchr) || (UNICHAR_BAD_ENCODING == uchr))) {
                return uchr;
            }
            continue;
        }

        /* Single ideograph, checked as a one-character word. */
        check_buf[0] = uchr;
        if ((NULL == check) || (0 != check (check_buf, 1))) {
            res++;
        }
    }
    return res;
}