Exemplo n.º 1
0
/* Scans the UTF-8 text in buf[0 .. bufsize) and reports every accepted word
 * to `callback'.
 *
 * Two kinds of words are recognized:
 *   - a maximal run of characters whose unichar_getprops() value has UCP_ALPHA;
 *   - a single character whose unichar_getprops() value has UCP_IDEO.
 * Alphabetic runs of more than WORD_MAX_CHARS characters are dropped.
 *
 * check     optional filter; when non-NULL it receives the decoded characters
 *           and their count, and a zero result suppresses the word.
 * callback  receives a pointer into `buf', the byte length of the word and
 *           `userdata'.
 *
 * Returns 0 once the whole buffer is consumed, or the decoder's
 * UNICHAR_NO_DATA / UNICHAR_BAD_ENCODING value as soon as one is seen.
 */
int elh_iterate_words__xany__UTF8(const char *buf, size_t bufsize, lh_word_check_t *check, lh_word_callback_t *callback, void *userdata)
{
  unichar check_buf[WORD_MAX_CHARS];
  int prop;
  const char *curr = buf;
  const char *buf_end = buf+bufsize;
  const char *word_begin;
  const char *word_end;
  unichar uchr;
  size_t word_length;
  while (curr < buf_end)
    {
      word_begin = curr;
      uchr = eh_decode_char__UTF8 (&curr, buf_end);
      prop = unichar_getprops (uchr);
      if (prop & UCP_ALPHA)
        {
          check_buf[0] = uchr;
          word_length = 1;
          for (;;)
            {
              /* Remember where the next character starts: if it ends the
                 word, this is the end of the word's bytes. */
              word_end = curr;
              uchr = eh_decode_char__UTF8 (&curr, buf_end);
              if (uchr < 0)
                {
                  if ((UNICHAR_NO_DATA == uchr) || (UNICHAR_BAD_ENCODING == uchr))
                    return uchr;
                  if (UNICHAR_EOD == uchr)
                    {
                      /* There is no terminating character, so the stale
                         properties of the previous one must not be reused
                         by the ideograph test after the loop. */
                      prop = 0;
                      break;
                    }
                }
              prop = unichar_getprops (uchr);
              if (!(prop & UCP_ALPHA))
                break;
              /* Keep counting past the buffer capacity so that overlong
                 words can be detected, but never write out of bounds. */
              if (WORD_MAX_CHARS > word_length)
                check_buf[word_length] = uchr;
              word_length++;
            }
          if (WORD_MAX_CHARS < word_length)
            goto done_word;
          if (NULL != check && 0 == check (check_buf, word_length))
            goto done_word;
          callback ((utf8char *)(word_begin), word_end-word_begin, userdata);
done_word:
          if (prop & UCP_IDEO)
            {
              /* The character that ended the word is itself an ideograph.
                 It starts at word_end, so report only its own bytes and
                 not the preceding alphabetic word again. */
              word_begin = word_end;
              goto proc_ideo;
            }
          continue;
        }
      if (prop & UCP_IDEO)
        {
proc_ideo:
          /* An ideograph is a one-character word spanning word_begin..curr. */
          check_buf[0] = uchr;
          if (NULL != check && 0 == check (check_buf, 1))
            continue;
          callback ((utf8char *)(word_begin), curr-word_begin, userdata);
          continue;
        }
      /* Neither a letter nor an ideograph: skip it unless it is a decoder error. */
      if ((uchr < 0) && ((UNICHAR_NO_DATA == uchr) || (UNICHAR_BAD_ENCODING == uchr)))
        return uchr;
    }
  return 0;
}
Exemplo n.º 2
0
/* Scans the UTF-8 text in buf[0 .. bufsize), optionally rewrites every
 * accepted word through `patch', and reports the result to `callback'.
 *
 * Two kinds of words are recognized:
 *   - a maximal run of characters whose unichar_getprops() value has UCP_ALPHA;
 *   - a single character whose unichar_getprops() value has UCP_IDEO.
 * Alphabetic runs of more than WORD_MAX_CHARS characters are dropped.
 *
 * check     optional filter; when non-NULL it receives the decoded characters
 *           and their count, and a zero result suppresses the word.
 * patch     optional rewriter; when non-NULL it receives the decoded word and
 *           fills patch_buf / arg_length, and a zero result suppresses the
 *           word.  When NULL the original bytes of `buf' are reported.
 * callback  receives the UTF-8 bytes of the (possibly patched) word, their
 *           length and `userdata'.
 *
 * Returns 0 once the whole buffer is consumed, or the decoder's
 * UNICHAR_NO_DATA / UNICHAR_BAD_ENCODING value as soon as one is seen.
 * Every exit path goes through `cleanup' so the heap buffer is released.
 */
int elh_iterate_patched_words__xany__UTF8(const char *buf, size_t bufsize, lh_word_check_t *check, lh_word_patch_t *patch, lh_word_callback_t *callback, void *userdata)
{
  unichar check_buf[WORD_MAX_CHARS];
  int prop;
  const char *curr = buf;
  const char *buf_end = buf+bufsize;
  const char *word_begin;
  const char *word_end = NULL;
  const char *patched_end;
  unichar uchr;
  size_t word_length;
  unichar patch_buf[WORD_MAX_CHARS];
  const unichar *arg_begin;
  size_t arg_length;
  char word_buf[BUFSIZEOF__UTF8_WORD];
  char *hugeword_buf = NULL;
  size_t hugeword_buf_size = 0;
  size_t hugeword_need;
  while (curr < buf_end)
    {
      word_begin = curr;
      uchr = eh_decode_char__UTF8 (&curr, buf_end);
      prop = unichar_getprops (uchr);
      if (prop & UCP_ALPHA)
        {
          check_buf[0] = uchr;
          word_length = 1;
          for (;;)
            {
              /* Remember where the next character starts: if it ends the
                 word, this is the end of the word's bytes. */
              word_end = curr;
              uchr = eh_decode_char__UTF8 (&curr, buf_end);
              if (uchr < 0)
                {
                  /* Leave through cleanup, not a bare return: a previous
                     word may already have allocated hugeword_buf. */
                  if ((UNICHAR_NO_DATA == uchr) || (UNICHAR_BAD_ENCODING == uchr))
                    goto cleanup;
                  if (UNICHAR_EOD == uchr)
                    {
                      /* There is no terminating character, so the stale
                         properties of the previous one must not be reused
                         by the ideograph test after the loop. */
                      prop = 0;
                      break;
                    }
                }
              prop = unichar_getprops (uchr);
              if (!(prop & UCP_ALPHA))
                break;
              /* Keep counting past the buffer capacity so that overlong
                 words can be detected, but never write out of bounds. */
              if (WORD_MAX_CHARS > word_length)
                check_buf[word_length] = uchr;
              word_length++;
            }
          if (WORD_MAX_CHARS < word_length)
            goto done_word;
          if (NULL != check && 0 == check (check_buf, word_length))
            goto done_word;
          if (NULL == patch)
            {
              /* No rewriting requested: report the original bytes in place. */
              callback ((utf8char *) word_begin, word_end-word_begin, userdata);
              goto done_word;
            }
          if (0 == patch (check_buf, word_length, patch_buf, &arg_length))
            goto done_word;
          arg_begin = patch_buf;
          /* First try the fixed-size stack buffer.  The result is kept in
             its own variable because word_end must keep pointing into
             `buf' for the ideograph hand-over at done_word. */
          patched_end = eh_encode_buffer__UTF8 (arg_begin, arg_begin+arg_length, word_buf, word_buf+BUFSIZEOF__UTF8_WORD);
          if (NULL != patched_end)
            {
              callback ((utf8char *)(word_buf), patched_end-word_buf, userdata);
              goto done_word;
            }
          /* The stack buffer was rejected (NULL result, presumably lack of
             room -- confirm against eh_encode_buffer__UTF8).  Retry in a
             heap buffer that is reused across words and only ever grows.
             It is sized by the larger of the original and patched lengths,
             because the patched text is what gets encoded.
             NOTE(review): assumes MAX_UTF8_CHAR is the largest encoded size
             of one character. */
          hugeword_need = ((arg_length > word_length) ? arg_length : word_length) * MAX_UTF8_CHAR;
          if (hugeword_buf_size < hugeword_need)
            {
              if (hugeword_buf_size)
                dk_free (hugeword_buf, hugeword_buf_size);
              hugeword_buf_size = hugeword_need;
              hugeword_buf = (char *) dk_alloc (hugeword_buf_size);
            }
          patched_end = eh_encode_buffer__UTF8 (arg_begin, arg_begin+arg_length, hugeword_buf, hugeword_buf+hugeword_buf_size);
          /* Never subtract from a NULL result; drop the word instead. */
          if (NULL != patched_end)
            callback ((utf8char *)(hugeword_buf), patched_end-hugeword_buf, userdata);
done_word:
          if (prop & UCP_IDEO)
            {
              /* The character that ended the word is itself an ideograph.
                 It starts at word_end, so report only its own bytes and
                 not the preceding alphabetic word again. */
              word_begin = word_end;
              goto proc_ideo;
            }
          continue;
        }
      if (prop & UCP_IDEO)
        {
proc_ideo:
          /* An ideograph is a one-character word spanning word_begin..curr. */
          check_buf[0] = uchr;
          if (NULL != check && 0 == check (check_buf, 1))
            continue;
          if (NULL == patch)
            {
              callback ((utf8char *) word_begin, curr-word_begin, userdata);
              continue;
            }
          if (0 == patch (check_buf, 1, patch_buf, &arg_length))
            continue;
          arg_begin = patch_buf;
          patched_end = eh_encode_buffer__UTF8 (arg_begin, arg_begin+arg_length, word_buf, word_buf+BUFSIZEOF__UTF8_WORD);
          /* Never subtract from a NULL result; drop the word instead. */
          if (NULL != patched_end)
            callback ((utf8char *)(word_buf), patched_end-word_buf, userdata);
          continue;
        }
      /* Neither a letter nor an ideograph: skip it unless it is a decoder
         error, in which case uchr carries the error code to the caller. */
      if ((uchr < 0) && ((UNICHAR_NO_DATA == uchr) || (UNICHAR_BAD_ENCODING == uchr)))
        goto cleanup;
    }
  uchr = 0;
cleanup:
  if (hugeword_buf_size)
    dk_free (hugeword_buf, hugeword_buf_size);
  return uchr;
}
/* Counts the words in the UTF-8 text buf[0 .. bufsize) that pass `check'.
 *
 * A word is either a run that starts with a UCP_ALPHA character and continues
 * over UCP_ALPHA characters and whatever IS_CONNECTIVE accepts, or a single
 * UCP_IDEO character.  Runs of more than WORD_MAX_CHARS characters are not
 * counted.  When `check' is non-NULL, a zero result from it excludes the word.
 *
 * Returns the number of counted words, or the decoder's UNICHAR_NO_DATA /
 * UNICHAR_BAD_ENCODING value as soon as one is seen.
 *
 * NOTE(review): IS_CONNECTIVE is an argument-less macro defined outside this
 * function and may refer to the locals by name, so every local keeps its
 * original name and assignments (including word_begin / word_end, which are
 * not read by the visible code).
 */
int elh_count_words__xViAny__UTF8(const char *buf, size_t bufsize, lh_word_check_t *check)
{
  unichar check_buf[WORD_MAX_CHARS];
  unichar uchr;
  const char *curr = buf;
  const char *buf_end = buf+bufsize;
  const char *word_begin = curr;
  const char *word_end = NULL;
  size_t word_length;
  int prop;
  int res = 0;

  while (curr < buf_end)
    {
      word_begin = curr;
      uchr = eh_decode_char__UTF8 (&curr, buf_end);
      prop = unichar_getprops (uchr);

      /* Separators and decoder errors: nothing to count here. */
      if (!(prop & UCP_ALPHA) && !(prop & UCP_IDEO))
        {
          if ((uchr < 0) && ((UNICHAR_NO_DATA == uchr) || (UNICHAR_BAD_ENCODING == uchr)))
            return uchr;
          continue;
        }

      if (prop & UCP_ALPHA)
        {
          /* Collect the rest of the word that starts with this letter. */
          check_buf[0] = uchr;
          word_length = 1;
          for (;;)
            {
              word_end = curr;
              uchr = eh_decode_char__UTF8 (&curr, buf_end);
              if ((uchr < 0) && ((UNICHAR_NO_DATA == uchr) || (UNICHAR_BAD_ENCODING == uchr)))
                return uchr;
              if ((uchr < 0) && (UNICHAR_EOD == uchr))
                break;
              prop = unichar_getprops (uchr);
              if (!(prop & UCP_ALPHA) && !IS_CONNECTIVE)
                break;
              /* Count past the capacity to spot overlong words, but only
                 store what fits. */
              if (WORD_MAX_CHARS > word_length)
                check_buf[word_length] = uchr;
              word_length++;
            }

          /* Count the word unless it is overlong or rejected by `check'. */
          if (!(WORD_MAX_CHARS < word_length)
              && ((NULL == check) || (0 != check (check_buf, word_length))))
            res++;

          /* When `prop' has no UCP_IDEO bit at this point the word is done;
             otherwise fall through and treat uchr as an ideograph. */
          if (!(prop & UCP_IDEO))
            continue;
        }

      /* An ideograph is a one-character word. */
      check_buf[0] = uchr;
      if ((NULL == check) || (0 != check (check_buf, 1)))
        res++;
    }

  return res;
}