/*
 * Walks the unichar buffer `buf` (`bufsize` characters) and reports every
 * word to `callback` as UTF-8 text.
 *
 * A word is either
 *   - a maximal run of characters whose property mask has UCP_ALPHA set,
 *     provided the run is not longer than WORD_MAX_CHARS, or
 *   - a single character whose property mask has UCP_IDEO set (and not
 *     UCP_ALPHA, because the UCP_ALPHA test is made first).
 *
 * If `check` is not NULL it is called with the word in unichar form; a zero
 * result suppresses the word.  `callback` receives the UTF-8 bytes, their
 * length and `userdata`.  The UTF-8 storage handed to `callback` is reused
 * for subsequent words and released before the function returns.
 */
void LH_ITERATE_WORDS_NAME(const unichar *buf, size_t bufsize, lh_word_check_t *check, lh_word_callback_t *callback, void *userdata)
{
  utf8char stack_utf8[BUFSIZEOF__UTF8_WORD];	/* fast path: fixed-size output buffer */
  utf8char *heap_utf8 = NULL;			/* slow path: grown on demand */
  size_t heap_utf8_size = 0;
  utf8char *utf8_tail;
  size_t first;
  size_t nchars;
  size_t needed;
  size_t pos = 0;
  int prop;
#ifdef LH_ITERATOR_DEBUG
  int wordctr = 0, wordcount = LH_COUNT_WORDS_NAME (buf, bufsize, check);
#define wordctr_INC1 wordctr++
#else
#define wordctr_INC1
#endif

  while (pos < bufsize)
    {
      prop = UNICHAR_GETPROPS_EXPN (buf, bufsize, pos);
      if (prop & UCP_ALPHA)
        {
          /* Consume the whole alphabetic run; the first character is already known to match. */
          first = pos;
          pos++;
          while ((pos < bufsize) && (UNICHAR_GETPROPS_EXPN (buf, bufsize, pos) & UCP_ALPHA))
            pos++;
          nchars = pos - first;
          if (nchars > WORD_MAX_CHARS)
            continue;		/* over-long runs are silently dropped */
          if ((NULL != check) && (0 == check (buf + first, nchars)))
            continue;		/* rejected by the caller's filter */
          utf8_tail = (utf8char *) eh_encode_buffer__UTF8 (buf + first, buf + pos,
              (char *) stack_utf8, (char *) (stack_utf8 + BUFSIZEOF__UTF8_WORD));
          if (NULL == utf8_tail)
            {
              /* The encoder returned NULL for the fixed buffer: retry with a heap
                 buffer of nchars * MAX_UTF8_CHAR bytes. */
              needed = nchars * MAX_UTF8_CHAR;
              if (heap_utf8_size < needed)
                {
                  if (heap_utf8_size)
                    dk_free (heap_utf8, heap_utf8_size);
                  heap_utf8_size = needed;
                  heap_utf8 = (utf8char *) dk_alloc (heap_utf8_size);
                }
              utf8_tail = (utf8char *) eh_encode_buffer__UTF8 (buf + first, buf + pos,
                  (char *) heap_utf8, (char *) (heap_utf8 + heap_utf8_size));
              callback (heap_utf8, utf8_tail - heap_utf8, userdata);
            }
          else
            {
              callback (stack_utf8, utf8_tail - stack_utf8, userdata);
            }
          wordctr_INC1;
        }
      else if (prop & UCP_IDEO)
        {
          /* Every ideograph is a one-character word of its own. */
          first = pos;
          pos++;
          if ((NULL != check) && (0 == check (buf + first, 1)))
            continue;
          utf8_tail = (utf8char *) eh_encode_buffer__UTF8 (buf + first, buf + pos,
              (char *) stack_utf8, (char *) (stack_utf8 + BUFSIZEOF__UTF8_WORD));
          callback (stack_utf8, utf8_tail - stack_utf8, userdata);
          wordctr_INC1;
        }
      else
        {
          /* Neither alphabetic nor ideographic: plain separator, skip it. */
          pos++;
        }
    }

  if (heap_utf8_size)
    dk_free (heap_utf8, heap_utf8_size);
#ifdef LH_ITERATOR_DEBUG
  /* Debug cross-check against the independent word counter. */
  if (wordctr != wordcount)
    GPF_T;
#endif
}
/*
 * Walks the UTF-8 text `buf` (`bufsize` bytes), splits it into words and
 * reports each accepted word to `callback`.
 *
 * A word is either a run of characters whose property mask has UCP_ALPHA set
 * (runs longer than WORD_MAX_CHARS are dropped) or a single character whose
 * property mask has UCP_IDEO set.
 *
 *   check    - optional filter, called with the decoded word; a zero result
 *              suppresses the word.
 *   patch    - optional normalizer, called with the decoded word and a
 *              WORD_MAX_CHARS-sized output buffer; a zero result suppresses
 *              the word.  When `patch` is NULL the callback receives the
 *              original bytes of `buf` directly, otherwise the re-encoded
 *              patched text.
 *   callback - receives UTF-8 bytes, their length and `userdata`.
 *
 * Returns 0 when the whole buffer was processed, or the decoder's error code
 * (UNICHAR_NO_DATA / UNICHAR_BAD_ENCODING) when decoding fails.  Any heap
 * buffer allocated for long words is released on every exit path.
 */
int elh_iterate_patched_words__xany__UTF8(const char *buf, size_t bufsize, lh_word_check_t *check, lh_word_patch_t *patch, lh_word_callback_t *callback, void *userdata)
{
  unichar check_buf[WORD_MAX_CHARS];
  unichar patch_buf[WORD_MAX_CHARS];
  char word_buf[BUFSIZEOF__UTF8_WORD];
  char *hugeword_buf = NULL;
  size_t hugeword_buf_size = 0;
  const char *curr = buf;
  const char *buf_end = buf + bufsize;
  const char *word_begin = curr;
  const char *word_end = NULL;
  const char *term_begin = NULL;	/* first byte of the character that ended the current word */
  const unichar *arg_begin;
  size_t arg_length;
  size_t word_length;
  unichar uchr;
  int prop;

  while (curr < buf_end)
    {
      word_begin = curr;
      uchr = eh_decode_char__UTF8 (&curr, buf_end);
      prop = unichar_getprops (uchr);
      if (prop & UCP_ALPHA)
        {
          check_buf[0] = uchr;
          word_length = 1;
          for (;;)
            {
              word_end = curr;
              uchr = eh_decode_char__UTF8 (&curr, buf_end);
              if (uchr < 0)
                {
                  /* Leave through `cleanup` so that a previously allocated
                     hugeword_buf is freed; the error code is still returned. */
                  if ((UNICHAR_NO_DATA == uchr) || (UNICHAR_BAD_ENCODING == uchr))
                    goto cleanup;
                  if (UNICHAR_EOD == uchr)
                    {
                      /* No terminating character exists: forget the props of
                         the last word character so it is not mistaken for a
                         trailing ideograph below. */
                      prop = 0;
                      break;
                    }
                }
              prop = unichar_getprops (uchr);
              if (!(prop & UCP_ALPHA))
                break;
              /* Keep counting over-long words, but never write past check_buf. */
              if (WORD_MAX_CHARS > word_length)
                check_buf[word_length] = uchr;
              word_length++;
            }
          /* word_end is reused for encoder results below, so remember where
             the terminating character starts. */
          term_begin = word_end;

          if (WORD_MAX_CHARS < word_length)
            goto done_word;
          if (NULL != check && 0 == check (check_buf, word_length))
            goto done_word;
          if (NULL == patch)
            {
              /* Unpatched words are reported straight from the input bytes. */
              callback ((utf8char *) word_begin, word_end - word_begin, userdata);
              goto done_word;
            }
          if (0 == patch (check_buf, word_length, patch_buf, &arg_length))
            goto done_word;
          arg_begin = patch_buf;
          word_end = eh_encode_buffer__UTF8 (arg_begin, arg_begin + arg_length, word_buf, word_buf + BUFSIZEOF__UTF8_WORD);
          if (NULL != word_end)
            {
              callback ((utf8char *) (word_buf), word_end - word_buf, userdata);
              goto done_word;
            }
          /* The fixed buffer was too small.  Size the fallback by the length
             of the text actually being encoded (the patched text), not by the
             length of the original word. */
          if (hugeword_buf_size < (arg_length * MAX_UTF8_CHAR))
            {
              if (hugeword_buf_size)
                dk_free (hugeword_buf, hugeword_buf_size);
              hugeword_buf_size = arg_length * MAX_UTF8_CHAR;
              hugeword_buf = (char *) dk_alloc (hugeword_buf_size);
            }
          word_end = eh_encode_buffer__UTF8 (arg_begin, arg_begin + arg_length, hugeword_buf, hugeword_buf + hugeword_buf_size);
          callback ((utf8char *) (hugeword_buf), word_end - hugeword_buf, userdata);

done_word:
          if (!(prop & UCP_IDEO))
            continue;
          /* The character that ended the word is itself an ideograph and has
             already been consumed: process it now, reporting only its own
             bytes rather than the preceding word as well. */
          word_begin = term_begin;
        }
      if (prop & UCP_IDEO)
        {
          check_buf[0] = uchr;
          if (NULL != check && 0 == check (check_buf, 1))
            continue;
          if (NULL == patch)
            {
              callback ((utf8char *) word_begin, curr - word_begin, userdata);
              continue;
            }
          if (0 == patch (check_buf, 1, patch_buf, &arg_length))
            continue;
          arg_begin = patch_buf;
          /* NOTE(review): this path assumes the patched ideograph always fits
             into word_buf; the encoder result is not checked for NULL. */
          word_end = eh_encode_buffer__UTF8 (arg_begin, arg_begin + arg_length, word_buf, word_buf + BUFSIZEOF__UTF8_WORD);
          callback ((utf8char *) (word_buf), word_end - word_buf, userdata);
          continue;
        }
      /* A decoding error on a non-word character ends the scan with that error code. */
      if ((uchr < 0) && ((UNICHAR_NO_DATA == uchr) || (UNICHAR_BAD_ENCODING == uchr)))
        goto cleanup;
    }
  uchr = 0;

cleanup:
  if (hugeword_buf_size)
    dk_free (hugeword_buf, hugeword_buf_size);
  return uchr;
}
/*
 * Walks the unichar buffer `buf` (`bufsize` characters), optionally filters
 * and normalizes every word, and reports the result to `callback` as UTF-8.
 *
 * A word is either a maximal run of characters whose property mask has
 * UCP_ALPHA set (runs longer than WORD_MAX_CHARS are dropped) or a single
 * character whose property mask has UCP_IDEO set.
 *
 *   check    - optional filter; a zero result suppresses the word.
 *   patch    - optional normalizer writing into a WORD_MAX_CHARS-sized
 *              buffer; a zero result suppresses the word.  When NULL the
 *              word is encoded directly from `buf`.
 *   callback - receives UTF-8 bytes, their length and `userdata`.  The
 *              storage is reused for later words and released on return.
 */
void LH_ITERATE_PATCHED_WORDS_NAME(const unichar *buf, size_t bufsize, lh_word_check_t *check, lh_word_patch_t *patch, lh_word_callback_t *callback, void *userdata)
{
  size_t pos = 0;
  size_t word_start;
  size_t word_length;
  unichar patch_buf[WORD_MAX_CHARS];
  const unichar *arg_begin;
  size_t arg_length;
  utf8char word_buf[BUFSIZEOF__UTF8_WORD];
  utf8char *hugeword_buf = NULL;
  size_t hugeword_buf_size = 0;
  utf8char *word_end;
  int prop;
#ifdef LH_ITERATOR_DEBUG
  /* NOTE(review): the reference count is computed from `check` only; words
     dropped because `patch` returns 0 are not counted in wordctr, so the
     final comparison presumably assumes `patch` never fails - confirm. */
  int wordctr = 0, wordcount = LH_COUNT_WORDS_NAME (buf, bufsize, check);
#define wordctr_INC1 wordctr++
#else
#define wordctr_INC1
#endif

  while (pos < bufsize)
    {
      prop = UNICHAR_GETPROPS_EXPN(buf,bufsize,pos);
      if (prop & UCP_ALPHA)
        {
          word_start = pos;
          do
            pos++;
          while ((pos < bufsize) && (UNICHAR_GETPROPS_EXPN(buf,bufsize,pos) & UCP_ALPHA));
          word_length = pos - word_start;
          if (WORD_MAX_CHARS < word_length)
            continue;
          if (NULL!=check && 0 == check(buf+word_start, word_length))
            {
              DBG_PRINTF_NOISE_WORD(word_start,word_length);
              continue;
            }
          if (NULL != patch)
            { /* word should be patched */
              if (0 == patch (buf+word_start, word_length, patch_buf, &arg_length))
                {
                  DBG_PRINTF_PATCH_FAILED(word_start,word_length);
                  continue;
                }
              arg_begin = patch_buf;
            }
          else
            { /* argument should be taken right from \c buf */
              arg_begin = buf+word_start;
              arg_length = word_length;
            }
        }
      else if (prop & UCP_IDEO)
        {
          /* Every ideograph is a one-character word of its own. */
          word_start = pos;
          pos++;
          word_length = pos - word_start;
          if (NULL!=check && 0 == check(buf+word_start, word_length))
            {
              DBG_PRINTF_NOISE_IDEO(word_start,word_length);
              continue;
            }
          if (NULL != patch)
            { /* word should be patched */
              if (0 == patch (buf+word_start, word_length, patch_buf, &arg_length))
                {
                  DBG_PRINTF_IDEO_PATCH_FAILED(word_start,word_length);
                  continue;
                }
              arg_begin = patch_buf;
            }
          else
            { /* argument should be taken right from \c buf */
              arg_begin = buf+word_start;
              arg_length = word_length;
            }
        }
      else
        {
          /* Neither alphabetic nor ideographic: skip the separator. */
          pos++;
          continue;
        }

      /* Shared emission path for both kinds of word: try the fixed buffer
         first, fall back to a heap buffer when the encoder returns NULL. */
      word_end = (utf8char *)eh_encode_buffer__UTF8 (arg_begin, arg_begin+arg_length, (char *)(word_buf), (char *)(word_buf+BUFSIZEOF__UTF8_WORD));
      if (NULL != word_end)
        {
          callback (word_buf, word_end-word_buf, userdata);
          wordctr_INC1;
          continue;
        }
      /* Size the fallback by the text actually being encoded: after patching
         it may be longer than the original word. */
      if (hugeword_buf_size<(arg_length*MAX_UTF8_CHAR))
        {
          if (hugeword_buf_size)
            dk_free (hugeword_buf, hugeword_buf_size);
          hugeword_buf_size = arg_length*MAX_UTF8_CHAR;
          hugeword_buf = (utf8char *) dk_alloc (hugeword_buf_size);
        }
      word_end = (utf8char *)eh_encode_buffer__UTF8 (arg_begin, arg_begin+arg_length, (char *)(hugeword_buf), (char *)(hugeword_buf+hugeword_buf_size));
      callback (hugeword_buf, word_end-hugeword_buf, userdata);
      wordctr_INC1;
    }

  if (hugeword_buf_size)
    dk_free (hugeword_buf, hugeword_buf_size);
#ifdef LH_ITERATOR_DEBUG
  if (wordctr != wordcount)
    GPF_T;
#endif
}