예제 #1
0
/* Strip combining diacritical marks from a word, in place.
 *
 * @str:        UTF-8 buffer holding the word; the caller MUST have
 *              normalized it to NFKD beforehand. The buffer is
 *              modified directly.
 * @str_length: in: byte length of the word; out: byte length of the
 *              word after the marks have been dropped.
 *
 * Returns FALSE (leaving everything untouched) when @str or
 * @str_length is NULL or when *@str_length is zero, TRUE otherwise.
 *
 * A NUL byte is written at the new end of the word, so the buffer
 * must be writable at that offset.
 *
 * NOTE(review): the decoder call is not given the remaining byte
 * count, so it presumably relies on the word being NUL-terminated;
 * confirm against callers. */
gboolean
tracker_parser_unaccent_nfkd_string (gpointer  str,
                                     gsize    *str_length)
{
	gchar *buf;
	gsize in_len;
	gsize rd;
	gsize wr;

	g_return_val_if_fail (str != NULL, FALSE);
	g_return_val_if_fail (str_length != NULL, FALSE);
	g_return_val_if_fail (*str_length > 0, FALSE);

	buf = (gchar *)str;
	in_len = *str_length;

	/* rd walks the input, wr marks where the next kept
	 * character goes; wr never runs ahead of rd */
	wr = 0;
	for (rd = 0; rd < in_len; ) {
		ucs4_t ch;
		gint n_bytes;

		/* Decode the character starting at the read offset */
		n_bytes = u8_strmbtouc (&ch, &buf[rd]);

		/* Stop on invalid UTF-8 or on the end of the string */
		if (n_bytes <= 0) {
			break;
		}

		if (!IS_CDM_UCS4 ((guint32) ch)) {
			/* A kept character only has to be moved once some
			 * earlier mark was dropped. Source and destination
			 * live in the same buffer and may overlap, hence
			 * memmove rather than memcpy */
			if (rd != wr) {
				memmove (&buf[wr], &buf[rd], n_bytes);
			}

			wr += n_bytes;
		}

		/* Combining marks advance the read offset only */
		rd += n_bytes;
	}

	/* Terminate the shortened word and report its length */
	buf[wr] = '\0';
	*str_length = wr;

	return TRUE;
}
예제 #2
0
파일: u8-next.c 프로젝트: iauther/x
/* Decode the character at the start of the NUL-terminated string S
   into *PUC and step past it.

   Returns a pointer to the unit following the decoded character.
   Returns NULL when u8_strmbtouc reports a non-positive length:
     - zero: *PUC is left as u8_strmbtouc set it;
     - negative: *PUC is overwritten with U+FFFD.  */
const uint8_t *
u8_next (ucs4_t *puc, const uint8_t *s)
{
  int n_units = u8_strmbtouc (puc, s);

  if (n_units > 0)
    return s + n_units;

  /* Nothing to step over; a negative length additionally gets the
     replacement character.  */
  if (n_units < 0)
    *puc = 0xfffd;

  return NULL;
}
예제 #3
0
/* Measure and classify the word that starts at parser->cursor.
 *
 * @parser:                  parser state; txt, cursor, txt_size,
 *                           word_break_flags, enable_forced_wordbreaks
 *                           and allowed_start are read.
 * @p_word_length:           out: byte length of the word.
 * @p_is_allowed_word_start: out: FALSE when the first character is
 *                           neither an underscore nor in the
 *                           parser's allowed_start categories.
 * @p_word_type:             out: word classification. Only written
 *                           when the word start is allowed.
 *
 * Returns FALSE when the first character cannot be decoded (in that
 * case only *p_is_allowed_word_start has been set), TRUE otherwise.
 */
static gboolean
get_word_info (TrackerParser         *parser,
               gsize                 *p_word_length,
               gboolean              *p_is_allowed_word_start,
               TrackerParserWordType *p_word_type)
{
	ucs4_t lead_char;
	gint lead_len;
	gboolean all_ascii;

	/* Assume an allowed start until proven otherwise */
	*p_is_allowed_word_start = TRUE;

	/* Decode the first character of the word */
	lead_len = u8_strmbtouc (&lead_char,
	                         &(parser->txt[parser->cursor]));
	if (lead_len <= 0) {
		/* Nothing decodable at the cursor: stop here */
		return FALSE;
	}

	/* A one-byte first character is plain 7-bit ASCII */
	all_ascii = (lead_len == 1);

	if (parser->enable_forced_wordbreaks &&
	    IS_FORCED_WORDBREAK_UCS4 ((guint32)lead_char)) {
		/* A forced wordbreak is a word of its own */
		*p_word_length = lead_len;
	} else {
		gsize pos;

		/* Walk byte by byte up to the next break or the end of
		 * the text, tracking along the way whether every byte
		 * seen is ASCII */
		for (pos = parser->cursor + lead_len;
		     pos < parser->txt_size;
		     pos++) {
			/* Regular word break */
			if (parser->word_break_flags[pos])
				break;

			/* Forced word break */
			if (parser->enable_forced_wordbreaks &&
			    IS_FORCED_WORDBREAK_UCS4 ((guint32)parser->txt[pos]))
				break;

			if (all_ascii &&
			    !IS_ASCII_UCS4 ((guint32)parser->txt[pos])) {
				all_ascii = FALSE;
			}
		}

		/* pos is the first byte past the word: either the start
		 * of the next word or the end of the text */
		*p_word_length = pos - parser->cursor;
	}

	/* The break detection also yields "words" that begin with
	 * punctuation such as commas. Only words whose first character
	 * is an underscore or belongs to the allowed categories are
	 * wanted; looking at the first character alone should be
	 * compatible with every Unicode normalization form. */
	if (!IS_UNDERSCORE_UCS4 ((guint32)lead_char) &&
	    !uc_is_general_category (lead_char,
	                             parser->allowed_start)) {
		*p_is_allowed_word_start = FALSE;
		return TRUE;
	}

	/* Classify: pure ASCII, CJK first character, or anything else */
	if (all_ascii) {
		*p_word_type = TRACKER_PARSER_WORD_TYPE_ASCII;
	} else if (IS_CJK_UCS4 (lead_char)) {
		*p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_NO_UNAC;
	} else {
		*p_word_type = TRACKER_PARSER_WORD_TYPE_OTHER_UNAC;
	}

	return TRUE;
}
예제 #4
0
/* Exercise u8_strmbtouc on an empty string, on every ISO 646
   character, on one valid sequence of each multi-byte length, and on
   a list of incomplete or invalid sequences.

   Before each call uc is set to the sentinel 0xBADFACE; the failure
   cases assert that the sentinel is still there afterwards.  */
int
main ()
{
  /* Incomplete/invalid input, one unit long.  */
  static const uint8_t bad1_c1[] = { 0xC1, 0 };
  static const uint8_t bad1_c3[] = { 0xC3, 0 };
  static const uint8_t bad1_e2[] = { 0xE2, 0 };
  static const uint8_t bad1_f4[] = { 0xF4, 0 };
  static const uint8_t bad1_fe[] = { 0xFE, 0 };

  /* Incomplete/invalid input, two units long.  */
  static const uint8_t bad2_e0_9f[] = { 0xE0, 0x9F, 0 };
  static const uint8_t bad2_e2_82[] = { 0xE2, 0x82, 0 };
  static const uint8_t bad2_e2_d0[] = { 0xE2, 0xD0, 0 };
  static const uint8_t bad2_f0_8f[] = { 0xF0, 0x8F, 0 };
  static const uint8_t bad2_f3_8f[] = { 0xF3, 0x8F, 0 };
  static const uint8_t bad2_f3_d0[] = { 0xF3, 0xD0, 0 };

  /* Incomplete/invalid input, three units long.  */
  static const uint8_t bad3_f3_8f_bf[] = { 0xF3, 0x8F, 0xBF, 0 };
  static const uint8_t bad3_f3_d0_bf[] = { 0xF3, 0xD0, 0xBF, 0 };
  static const uint8_t bad3_f3_8f_d0[] = { 0xF3, 0x8F, 0xD0, 0 };

  /* All failure cases, in the order they are checked.  */
  static const uint8_t *const invalid_inputs[] =
    {
      bad1_c1, bad1_c3, bad1_e2, bad1_f4, bad1_fe,
      bad2_e0_9f, bad2_e2_82, bad2_e2_d0,
      bad2_f0_8f, bad2_f3_8f, bad2_f3_d0,
      bad3_f3_8f_bf, bad3_f3_d0_bf, bad3_f3_8f_d0
    };

  ucs4_t uc;
  int ret;
  unsigned int k;

  /* Empty string: length 0 and uc set to 0.  */
  {
    static const uint8_t empty[] = "";

    uc = 0xBADFACE;
    ret = u8_strmbtouc (&uc, empty);
    ASSERT (ret == 0);
    ASSERT (uc == 0);
  }

  /* ISO 646 characters: each one decodes to itself, length 1.  */
  {
    uint8_t buf[2];
    ucs4_t c;

    buf[1] = 0;
    for (c = 1; c < 0x80; c++)
      {
        buf[0] = c;
        uc = 0xBADFACE;
        ret = u8_strmbtouc (&uc, buf);
        ASSERT (ret == 1);
        ASSERT (uc == c);
      }
  }

  /* Valid 2-unit character.  */
  {
    static const uint8_t two_units[] = { 0xC3, 0x97, 0 };

    uc = 0xBADFACE;
    ret = u8_strmbtouc (&uc, two_units);
    ASSERT (ret == 2);
    ASSERT (uc == 0x00D7);
  }

  /* Valid 3-unit character.  */
  {
    static const uint8_t three_units[] = { 0xE2, 0x82, 0xAC, 0 };

    uc = 0xBADFACE;
    ret = u8_strmbtouc (&uc, three_units);
    ASSERT (ret == 3);
    ASSERT (uc == 0x20AC);
  }

  /* Valid 4-unit character.  */
  {
    static const uint8_t four_units[] = { 0xF4, 0x8F, 0xBF, 0xBD, 0 };

    uc = 0xBADFACE;
    ret = u8_strmbtouc (&uc, four_units);
    ASSERT (ret == 4);
    ASSERT (uc == 0x10FFFD);
  }

  /* Incomplete/invalid input: -1 is returned and uc keeps the
     sentinel.  */
  for (k = 0; k < sizeof invalid_inputs / sizeof invalid_inputs[0]; k++)
    {
      uc = 0xBADFACE;
      ret = u8_strmbtouc (&uc, invalid_inputs[k]);
      ASSERT (ret == -1);
      ASSERT (uc == 0xBADFACE);
    }

  return 0;
}