void LH_ITERATE_WORDS_NAME(const unichar *buf, size_t bufsize, lh_word_check_t *check, lh_word_callback_t *callback, void *userdata)
{
  size_t pos = 0;
  size_t word_start;
  size_t word_length;
  utf8char word_buf[BUFSIZEOF__UTF8_WORD];
  utf8char *hugeword_buf = NULL;
  size_t hugeword_buf_size = 0;
  utf8char *word_end;
  int prop;
#ifdef LH_ITERATOR_DEBUG
  int wordctr = 0, wordcount = LH_COUNT_WORDS_NAME (buf, bufsize, check);
#define wordctr_INC1 wordctr++
#else
#define wordctr_INC1
#endif
  while (pos < bufsize)
    {
      prop = UNICHAR_GETPROPS_EXPN (buf, bufsize, pos);
      if (prop & UCP_ALPHA)
	{
	  word_start = pos;
	  do pos++; while ((pos < bufsize) && (UNICHAR_GETPROPS_EXPN (buf, bufsize, pos) & UCP_ALPHA));
	  word_length = pos - word_start;
	  if (WORD_MAX_CHARS < word_length)
	    continue;
	  if (NULL!=check && 0 == check(buf+word_start, word_length))
	    continue;
	  word_end = (utf8char *)eh_encode_buffer__UTF8 (buf+word_start, buf+pos, (char *)word_buf, (char *)(word_buf+BUFSIZEOF__UTF8_WORD));
	  if (NULL != word_end)
	    {
	      callback (word_buf, word_end-word_buf, userdata);
              wordctr_INC1;
	      continue;
	    }
	  if (hugeword_buf_size<(word_length*MAX_UTF8_CHAR))
	    {
	      if (hugeword_buf_size)
		dk_free (hugeword_buf, hugeword_buf_size);
	      hugeword_buf_size = word_length*MAX_UTF8_CHAR;
	      hugeword_buf = (utf8char *) dk_alloc (hugeword_buf_size);
	    }
	  word_end = (utf8char *)eh_encode_buffer__UTF8 (buf+word_start, buf+pos, (char *)hugeword_buf, (char *)(hugeword_buf+hugeword_buf_size));
	  callback (hugeword_buf, word_end-hugeword_buf, userdata);
          wordctr_INC1;
	  continue;
	}
      if (prop & UCP_IDEO)
	{
	  word_start = pos;
	  pos++;
	  if (NULL!=check && 0 == check(buf+pos-1, 1))
	    continue;
	  word_end = (utf8char *)eh_encode_buffer__UTF8 (buf+word_start, buf+pos, (char *)(word_buf), (char *)(word_buf+BUFSIZEOF__UTF8_WORD));
	  callback (word_buf, word_end-word_buf, userdata);
          wordctr_INC1;
	  continue;
	}
      pos++;
    }
  if (hugeword_buf_size)
    dk_free (hugeword_buf, hugeword_buf_size);
#ifdef LH_ITERATOR_DEBUG
  if (wordctr != wordcount)
    GPF_T;
#endif
}
Esempio n. 2
0
int elh_iterate_patched_words__xany__UTF8(const char *buf, size_t bufsize, lh_word_check_t *check, lh_word_patch_t *patch, lh_word_callback_t *callback, void *userdata)
{
  unichar check_buf[WORD_MAX_CHARS];
  int prop;
  const char *curr = buf;
  const char *buf_end = buf+bufsize;
  const char *word_begin = curr;
  const char *word_end = NULL;
  unichar uchr;
  size_t word_length;
  unichar patch_buf[WORD_MAX_CHARS];
  const unichar *arg_begin;
  size_t arg_length;
  char word_buf[BUFSIZEOF__UTF8_WORD];
  char *hugeword_buf = NULL;
  size_t hugeword_buf_size = 0;
  while (curr < buf_end)
    {
      word_begin = curr;
      uchr = eh_decode_char__UTF8 (&curr, buf_end);
      prop = unichar_getprops (uchr);
      if (prop & UCP_ALPHA)
	{
	  check_buf[0] = uchr;
	  word_length = 1;
	  for(;;)
	    {
	      word_end = curr;
	      uchr = eh_decode_char__UTF8 (&curr, buf_end);
	      if (uchr < 0)
		{
		  if ((UNICHAR_NO_DATA == uchr) || (UNICHAR_BAD_ENCODING == uchr))
		    return uchr;
		  if (UNICHAR_EOD == uchr)
		    break;
		}
	      prop = unichar_getprops (uchr);
	      if (!(prop & UCP_ALPHA))
		break;
	      if (WORD_MAX_CHARS > word_length)
		check_buf[word_length] = uchr;
	      word_length++;
	    }
	  if (WORD_MAX_CHARS < word_length)
	    goto done_word;
	  if (NULL!=check && 0 == check (check_buf, word_length))
	    goto done_word;
	  if (NULL != patch)
	    {
	      if (0 == patch (check_buf, word_length, patch_buf, &arg_length))
		goto done_word;
	      arg_begin = patch_buf;
	    }
	  else
	    {
	      callback ((utf8char *) word_begin, word_end-word_begin, userdata);
	      goto done_word;
	    }
	  word_end = eh_encode_buffer__UTF8 (arg_begin, arg_begin+arg_length, word_buf, word_buf+BUFSIZEOF__UTF8_WORD);
	  if (NULL != word_end)
	    {
	      callback ((utf8char *)(word_buf), word_end-word_buf, userdata);
	      goto done_word;
	    }
	  if (hugeword_buf_size<(word_length*MAX_UTF8_CHAR))
	    {
	      if (hugeword_buf_size)
		dk_free (hugeword_buf, hugeword_buf_size);
	      hugeword_buf_size = word_length*MAX_UTF8_CHAR;
	      hugeword_buf = (char *) dk_alloc (hugeword_buf_size);
	    }
	  word_end = eh_encode_buffer__UTF8 (arg_begin, arg_begin+arg_length, hugeword_buf, hugeword_buf+hugeword_buf_size);
	  callback ((utf8char *)(hugeword_buf), word_end-hugeword_buf, userdata);
done_word:
	  if (prop & UCP_IDEO)
	    goto proc_ideo;
	  continue;
	}
      if (prop & UCP_IDEO)
	{
proc_ideo:
	  check_buf[0] = uchr;
	  if (NULL!=check && 0 == check (check_buf, 1))
	    continue;
	  if (NULL != patch)
	    {
	      if (0 == patch (check_buf, 1, patch_buf, &arg_length))
		continue;
	      arg_begin = patch_buf;
	    }
	  else
	    {
	      callback ((utf8char *) word_begin, curr-word_begin, userdata);
	      continue;
	    }
	  word_end = eh_encode_buffer__UTF8 (arg_begin, arg_begin+arg_length, word_buf, word_buf+BUFSIZEOF__UTF8_WORD);
	  callback ((utf8char *)(word_buf), word_end-word_buf, userdata);
	  continue;
	}
      if ((uchr < 0) && ((UNICHAR_NO_DATA == uchr) || (UNICHAR_BAD_ENCODING == uchr)))
	goto cleanup; /* see below */
    }
  uchr = 0;
cleanup:
  if (hugeword_buf_size)
    dk_free (hugeword_buf, hugeword_buf_size);
  return uchr;
}
void LH_ITERATE_PATCHED_WORDS_NAME(const unichar *buf, size_t bufsize, lh_word_check_t *check, lh_word_patch_t *patch, lh_word_callback_t *callback, void *userdata)
{
  size_t pos = 0;
  size_t word_start;
  size_t word_length;
  unichar patch_buf[WORD_MAX_CHARS];
  const unichar *arg_begin;
  size_t arg_length;
  utf8char word_buf[BUFSIZEOF__UTF8_WORD];
  utf8char *hugeword_buf = NULL;
  size_t hugeword_buf_size = 0;
  utf8char *word_end;
  int prop;
#ifdef LH_ITERATOR_DEBUG
  int wordctr = 0, wordcount = LH_COUNT_WORDS_NAME (buf, bufsize, check);
#define wordctr_INC1 wordctr++
#else
#define wordctr_INC1
#endif
  while (pos < bufsize)
    {
      prop = UNICHAR_GETPROPS_EXPN(buf,bufsize,pos);
      if (prop & UCP_ALPHA)
	{
	  word_start = pos;
	  do pos++; while ((pos < bufsize) && (UNICHAR_GETPROPS_EXPN(buf,bufsize,pos) & UCP_ALPHA));
	  word_length = pos - word_start;
	  if (WORD_MAX_CHARS < word_length)
	    continue;
	  if (NULL!=check && 0 == check(buf+word_start, word_length))
	    {
	      DBG_PRINTF_NOISE_WORD(word_start,word_length);
	      continue;
	    }
	  if (NULL != patch)
	    { /* word should be patched */
	      if (0 == patch (buf+word_start, word_length, patch_buf, &arg_length))
		{
		  DBG_PRINTF_PATCH_FAILED(word_start,word_length);
		  continue;
		}
	      arg_begin = patch_buf;
	    }
	  else
	    { /* argument should be taken right from \c buf */
	      arg_begin = buf+word_start;
	      arg_length = word_length;
	    }
	  word_end = (utf8char *)eh_encode_buffer__UTF8 (arg_begin, arg_begin+arg_length, (char *)(word_buf), (char *)(word_buf+BUFSIZEOF__UTF8_WORD));
	  if (NULL != word_end)
	    {
	      callback (word_buf, word_end-word_buf, userdata);
              wordctr_INC1;
	      continue;
	    }
	  if (hugeword_buf_size<(word_length*MAX_UTF8_CHAR))
	    { /* overflow danger detected */
	      if (hugeword_buf_size)
		dk_free (hugeword_buf, hugeword_buf_size);
	      hugeword_buf_size = word_length*MAX_UTF8_CHAR;
	      hugeword_buf = (utf8char *) dk_alloc (hugeword_buf_size);
	    }
	  word_end = (utf8char *)eh_encode_buffer__UTF8 (arg_begin, arg_begin+arg_length, (char *)(hugeword_buf), (char *)(hugeword_buf+hugeword_buf_size));
	  callback (hugeword_buf, word_end-hugeword_buf, userdata);
          wordctr_INC1;
	  continue;
	}
      if (prop & UCP_IDEO)
	{
	  word_start = pos;
	  pos++;
	  word_length = pos - word_start;
	  if (NULL!=check && 0 == check(buf+word_start, word_length))
	    {
	      DBG_PRINTF_NOISE_IDEO(word_start,word_length);
	      continue;
	    }
	  if (NULL != patch)
	    { /* word should be patched */
	      if (0 == patch (buf+word_start, word_length, patch_buf, &arg_length))
		{
		  DBG_PRINTF_IDEO_PATCH_FAILED(word_start,word_length);
		  continue;
		}
	      arg_begin = patch_buf;
	    }
	  else
	    { /* argument should be taken right from \c buf */
	      arg_begin = buf+word_start;
	      arg_length = word_length;
	    }
	  word_end = (utf8char *)eh_encode_buffer__UTF8 (arg_begin, arg_begin+arg_length, (char *)(word_buf), (char *)(word_buf+BUFSIZEOF__UTF8_WORD));
	  callback (word_buf, word_end-word_buf, userdata);
          wordctr_INC1;
	  continue;
	}
      pos++;
    }
  if (hugeword_buf_size)
    dk_free (hugeword_buf, hugeword_buf_size);
#ifdef LH_ITERATOR_DEBUG
  if (wordctr != wordcount)
    GPF_T;
#endif
}