示例#1
0
/**
 * Fills in the word breaking information for a generic input string.
 *
 * The input is walked one character at a time through get_next_char.  Each
 * character is classified with get_char_wb_class() and the class drives a
 * small state machine; the WBn tags in the comments name the word boundary
 * rule that each decision implements.
 *
 * @param[in]  s			input string
 * @param[in]  len			length of the input; brks is initialised for
 *							exactly this many entries
 * @param[in]  lang			language of the input (currently ignored)
 * @param[out] brks			pointer to the output breaking data, containing
 *							#WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
 *							#WORDBREAK_INSIDEACHAR
 * @param[in] get_next_char	function to get the next UTF-32 character
 */
static void set_wordbreaks(
		const void *s,
		size_t len,
		const char *lang,
		char *brks,
		get_next_char_t get_next_char)
{
	/* Class recorded for the previous character (after WB4 inheritance). */
	enum WordBreakClass prevClass = WBP_Undefined;
	/* Class that opened the rule-matching sequence currently in progress.
	 * WBP_Undefined stands for "start of text".  For a sequence such as
	 * Numeric+MidNum+Numeric this stays Numeric throughout. */
	enum WordBreakClass seqClass = WBP_Undefined;
	utf32_t ch;
	size_t nextPos = 0;		/* read cursor handed to get_next_char */
	size_t curPos = 0;		/* start offset of the character in ch */
	size_t pendingPos = 0;	/* start of the span not yet passed on */

	/* TODO: Language-specific specialization. */
	(void) lang;

	/* Every position starts out as a permitted break. */
	memset(brks, WORDBREAK_BREAK, len);

	for (ch = get_next_char(s, len, &nextPos);
		 ch != EOS;
		 ch = get_next_char(s, len, &nextPos))
	{
		enum WordBreakClass curClass =
			get_char_wb_class(ch, wb_prop_default,
							  ARRAY_LEN(wb_prop_default));
		/* Break value used for the pending span if it is closed here. */
		char brk = WORDBREAK_BREAK;
		/* Non-zero: hand [pendingPos, curPos) to set_brks_to and start a
		 * new sequence with curClass. */
		int closeSpan = 1;
		/* Non-zero: a closed span also moves pendingPos up to curPos. */
		int advance = 1;

		switch (curClass)
		{
		case WBP_CR:
			/* WB3b: always allow a break before CR. */
			break;

		case WBP_LF:
			/* WB3 keeps CR+LF together; any other LF is WB3a,3b. */
			if (seqClass == WBP_CR)
			{
				brk = WORDBREAK_NOBREAK;
			}
			break;

		case WBP_Newline:
			/* WB3a,3b */
			break;

		case WBP_Extend:
		case WBP_Format:
			/* WB4 */
			if ((seqClass == WBP_Undefined) || IS_WB3ab(seqClass))
			{
				/* First character, or right after a newline (WB3a,3b):
				 * treated as the start of a sequence of its own, but
				 * the pending position is left where it is. */
				advance = 0;
			}
			else
			{
				/* Not the first character, so curPos - 1 is valid:
				 * forbid a break before this character and let it
				 * take over the class of its predecessor. */
				brks[curPos - 1] = WORDBREAK_NOBREAK;
				curClass = prevClass;
				closeSpan = 0;
			}
			break;

		case WBP_Katakana:
			if ((seqClass == WBP_Katakana) ||		/* WB13 */
				(seqClass == WBP_ExtendNumLet))		/* WB13b */
			{
				brk = WORDBREAK_NOBREAK;
			}
			break;

		case WBP_ALetter:
			if ((seqClass == WBP_ALetter) ||		/* WB5,6,7 */
				(prevClass == WBP_Numeric) ||		/* WB10 */
				(seqClass == WBP_ExtendNumLet))		/* WB13b */
			{
				brk = WORDBREAK_NOBREAK;
			}
			break;

		case WBP_MidNumLet:
			/* WB6,7 after ALetter and WB11,12 after Numeric: keep the
			 * span open, the next character decides. */
			if ((prevClass == WBP_ALetter) || (prevClass == WBP_Numeric))
			{
				closeSpan = 0;
			}
			break;

		case WBP_MidLetter:
			/* WB6,7: keep the span open after ALetter. */
			if (prevClass == WBP_ALetter)
			{
				closeSpan = 0;
			}
			break;

		case WBP_MidNum:
			/* WB11,12: keep the span open after Numeric. */
			if (prevClass == WBP_Numeric)
			{
				closeSpan = 0;
			}
			break;

		case WBP_Numeric:
			if ((seqClass == WBP_Numeric) ||		/* WB8,11,12 */
				(prevClass == WBP_ALetter) ||		/* WB9 */
				(seqClass == WBP_ExtendNumLet))		/* WB13b */
			{
				brk = WORDBREAK_NOBREAK;
			}
			break;

		case WBP_ExtendNumLet:
			/* WB13a,13b: only when the predecessor itself started the
			 * sequence and belongs to one of the joinable classes. */
			if ((seqClass == prevClass) &&
				((prevClass == WBP_ALetter) ||
				 (prevClass == WBP_Numeric) ||
				 (prevClass == WBP_Katakana) ||
				 (prevClass == WBP_ExtendNumLet)))
			{
				brk = WORDBREAK_NOBREAK;
			}
			break;

		case WBP_Any:
			/* No rule applies: allow breaks and reset. */
			break;

		default:
			/* Error, should never get here! */
			assert(0);
			closeSpan = 0;
			break;
		}

		if (closeSpan)
		{
			set_brks_to(s, brks, pendingPos, curPos, len,
						brk, get_next_char);
			seqClass = curClass;
			if (advance)
			{
				pendingPos = curPos;
			}
		}

		prevClass = curClass;
		curPos = nextPos;
	}

	/* WB2: whatever is still pending at the end of text may break. */
	set_brks_to(s, brks, pendingPos, nextPos, len,
				WORDBREAK_BREAK, get_next_char);
}
示例#2
0
/**
 * Sets the word breaking information for a generic input string.
 *
 * The input is read one character at a time through get_next_char, each
 * character is classified with get_char_wb_class(), and the class drives
 * the switch below.  The WBn tags in the comments name the word boundary
 * rule a branch implements.
 *
 * @param[in]  s             input string
 * @param[in]  len           length of the input; exactly this many entries
 *                           of brks are initialised
 * @param[in]  lang          language of the input (reserved for future use)
 * @param[out] brks          pointer to the output breaking data, containing
 *                           #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
 *                           #WORDBREAK_INSIDEACHAR
 * @param[in] get_next_char  function to get the next UTF-32 character
 */
static void set_wordbreaks(
        const void *s,
        size_t len,
        const char *lang,
        char *brks,
        get_next_char_t get_next_char)
{
    /* Regional Indicator pairing state for WB15,16: 1 after an RI that
     * started a pair, 0 after the RI that completed one. */
    int riCounter = 0;
    /* Class recorded for the previous character (after WB4 inheritance). */
    enum WordBreakClass wbcLast = WBP_Undefined;
    /* wbcSeqStart is the class that started the current sequence.
     * WBP_Undefined is a special case that means "sot".
     * This value is the class that is at the start of the current rule
     * matching sequence. For example, in case of Numeric+MidNum+Numeric
     * it'll be Numeric all the way.
     */
    enum WordBreakClass wbcSeqStart = WBP_Undefined;
    utf32_t ch;
    /* posNext: read cursor advanced by get_next_char.
     * posCur:  start offset of the character currently held in ch.
     * posLast: start of the span that has not yet been handed to
     *          set_brks_to. */
    size_t posNext = 0;
    size_t posCur = 0;
    size_t posLast = 0;

    /* TODO: Language-specific specialization. */
    (void) lang;

    /* Init brks: every position starts out as a permitted break. */
    memset(brks, WORDBREAK_BREAK, len);

    ch = get_next_char(s, len, &posNext);

    while (ch != EOS)
    {
        enum WordBreakClass wbcCur;
        wbcCur = get_char_wb_class(ch, wb_prop_default,
                                   ARRAY_LEN(wb_prop_default));

        switch (wbcCur)
        {
        case WBP_CR:
            /* WB3b: always allow a break before CR. */
            set_brks_to(s, brks, posLast, posCur, len,
                        WORDBREAK_BREAK, get_next_char);
            wbcSeqStart = wbcCur;
            posLast = posCur;
            break;

        case WBP_LF:
            if (wbcSeqStart == WBP_CR) /* WB3: keep CR+LF together */
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_NOBREAK, get_next_char);
                wbcSeqStart = wbcCur;
                posLast = posCur;
                break;
            }
            /* Fall through: an LF not preceded by CR acts like Newline. */

        case WBP_Newline:
            /* WB3a,3b */
            set_brks_to(s, brks, posLast, posCur, len,
                        WORDBREAK_BREAK, get_next_char);
            wbcSeqStart = wbcCur;
            posLast = posCur;
            break;

        case WBP_ZWJ:
        case WBP_Extend:
        case WBP_Format:
            /* WB4 - If not the first char/after a newline (WB3a,3b), skip
             * this class, set it to be the same as the prev, and mark
             * brks not to break before them. */
            if ((wbcSeqStart == WBP_Undefined) || IS_WB3ab(wbcSeqStart))
            {
                /* First char or right after a newline: this character
                 * starts a sequence of its own. */
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_BREAK, get_next_char);
                wbcSeqStart = wbcCur;
                posLast = posCur;
            }
            else
            {
                /* It's surely not the first, so posCur - 1 is valid. */
                brks[posCur - 1] = WORDBREAK_NOBREAK;
                /* WB3c and WB3d precede 4, so no intervening Extend
                 * chars allowed. */
                if (wbcCur != WBP_ZWJ && wbcSeqStart != WBP_ZWJ &&
                    wbcSeqStart != WBP_WSegSpace)
                {
                    /* "inherit" the previous class. */
                    wbcCur = wbcLast;
                }
            }
            break;

        case WBP_Katakana:
            if ((wbcSeqStart == WBP_Katakana) || /* WB13 */
                    (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_NOBREAK, get_next_char);
            }
            /* No rule found, reset */
            else
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_BREAK, get_next_char);
            }
            wbcSeqStart = wbcCur;
            posLast = posCur;
            break;

        case WBP_Hebrew_Letter:
        case WBP_ALetter:
            /* A sequence opened by a Hebrew letter whose previous character
             * was a double quote: only another Hebrew letter joins it. */
            if ((wbcSeqStart == WBP_Hebrew_Letter) &&
                    (wbcLast == WBP_Double_Quote)) /* WB7b,c */
            {
               if (wbcCur == WBP_Hebrew_Letter)
                 {
                     set_brks_to(s, brks, posLast, posCur, len,
                             WORDBREAK_NOBREAK, get_next_char);
                 }
               else
                 {
                     set_brks_to(s, brks, posLast, posCur, len,
                             WORDBREAK_BREAK, get_next_char);
                 }
            }
            else if (((wbcSeqStart == WBP_ALetter) ||
                        (wbcSeqStart == WBP_Hebrew_Letter)) || /* WB5,6,7 */
                    (wbcLast == WBP_Numeric) || /* WB10 */
                    (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_NOBREAK, get_next_char);
            }
            /* No rule found, reset */
            else
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_BREAK, get_next_char);
            }
            wbcSeqStart = wbcCur;
            posLast = posCur;
            break;

        case WBP_Single_Quote:
            if (wbcLast == WBP_Hebrew_Letter) /* WB7a */
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_NOBREAK, get_next_char);
                wbcSeqStart = wbcCur;
                posLast = posCur;
            }
            /* Fall through: the MidNumLet check below runs for every
             * single quote, including one just handled by WB7a. */

        case WBP_MidNumLet:
            if (((wbcLast == WBP_ALetter) ||
                        (wbcLast == WBP_Hebrew_Letter)) || /* WB6,7 */
                    (wbcLast == WBP_Numeric)) /* WB11,12 */
            {
                /* Go on: keep the span open, the next char decides. */
            }
            else
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_BREAK, get_next_char);
                wbcSeqStart = wbcCur;
                posLast = posCur;
            }
            break;

        case WBP_MidLetter:
            if ((wbcLast == WBP_ALetter) ||
                    (wbcLast == WBP_Hebrew_Letter)) /* WB6,7 */
            {
                /* Go on: keep the span open, the next char decides. */
            }
            else
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_BREAK, get_next_char);
                wbcSeqStart = wbcCur;
                posLast = posCur;
            }
            break;

        case WBP_MidNum:
            if (wbcLast == WBP_Numeric) /* WB11,12 */
            {
                /* Go on: keep the span open, the next char decides. */
            }
            else
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_BREAK, get_next_char);
                wbcSeqStart = wbcCur;
                posLast = posCur;
            }
            break;

        case WBP_Numeric:
            if ((wbcSeqStart == WBP_Numeric) || /* WB8,11,12 */
                    ((wbcLast == WBP_ALetter) ||
                     (wbcLast == WBP_Hebrew_Letter)) || /* WB9 */
                    (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_NOBREAK, get_next_char);
            }
            /* No rule found, reset */
            else
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_BREAK, get_next_char);
            }
            wbcSeqStart = wbcCur;
            posLast = posCur;
            break;

        case WBP_ExtendNumLet:
            /* WB13a,13b: only when the previous character itself started
             * the sequence and belongs to one of the joinable classes. */
            if ((wbcSeqStart == wbcLast) &&
                ((wbcLast == WBP_ALetter) ||
                 (wbcLast == WBP_Hebrew_Letter) ||
                 (wbcLast == WBP_Numeric) ||
                 (wbcLast == WBP_Katakana) ||
                 (wbcLast == WBP_ExtendNumLet)))
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_NOBREAK, get_next_char);
            }
            /* No rule found, reset */
            else
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_BREAK, get_next_char);
            }
            wbcSeqStart = wbcCur;
            posLast = posCur;
            break;

        case WBP_Regional_Indicator:
            /* WB15,16: join this RI to the previous one only when that
             * one started a pair (riCounter is odd). */
            if ((wbcSeqStart == WBP_Regional_Indicator) &&
                ((riCounter % 2) == 1))
            {
                set_brks_to(s, brks, posLast, posCur, len,
                        WORDBREAK_NOBREAK, get_next_char);
                riCounter = 0; /* Reset the sequence */
            }
            /* No rule found, reset */
            else
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_BREAK, get_next_char);
                riCounter = 1; /* This RI starts a new pair */
            }
            wbcSeqStart = wbcCur;
            posLast = posCur;
            break;

        case WBP_Double_Quote:
            if (wbcLast == WBP_Hebrew_Letter) /* WB7b,c */
            {
               /* Go on: keep the span open, the next char decides. */
            }
            else
            {
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_BREAK, get_next_char);
                wbcSeqStart = wbcCur;
                posLast = posCur;
            }
            break;

        case WBP_WSegSpace:
            if (wbcLast == WBP_WSegSpace) /* WB3d */
            {
                /* wbcSeqStart is deliberately left untouched here. */
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_NOBREAK, get_next_char);
                posLast = posCur;
                break;
            }
            /* Fall through: otherwise handled like WBP_Any. */

        case WBP_Any:
            /* Check for rule WB3c: no break between ZWJ and an extended
             * pictographic character. */
            if (wbcLast == WBP_ZWJ && ub_is_extended_pictographic(ch))
            {
                /* wbcSeqStart is deliberately left untouched here. */
                set_brks_to(s, brks, posLast, posCur, len,
                            WORDBREAK_NOBREAK, get_next_char);
                posLast = posCur;
                break;
            }

            /* Allow breaks and reset */
            set_brks_to(s, brks, posLast, posCur, len,
                        WORDBREAK_BREAK, get_next_char);
            wbcSeqStart = wbcCur;
            posLast = posCur;
            break;

        default:
            /* Error, should never get here! */
            assert(0);
            break;
        }

        /* Remember this character's (possibly inherited) class and step
         * to the next character. */
        wbcLast = wbcCur;
        posCur = posNext;
        ch = get_next_char(s, len, &posNext);
    }

    /* WB2: whatever is still pending at the end of text may break. */
    set_brks_to(s, brks, posLast, posNext, len,
                WORDBREAK_BREAK, get_next_char);
}
示例#3
0
文件: wordbreak.c 项目: Limsik/e17
/**
 * Sets the word breaking information for a generic input string.
 *
 * The input is read one character at a time through get_next_char, each
 * character is classified with get_char_wb_class(), and the class drives
 * the switch below.  The WBn tags in the comments name the word boundary
 * rule a branch implements.
 *
 * @param[in]  s			input string
 * @param[in]  len			length of the input; exactly this many entries
 *							of brks are initialised
 * @param[in]  lang			language of the input (currently unused)
 * @param[out] brks			pointer to the output breaking data, containing
 *							#WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
 *							#WORDBREAK_INSIDEACHAR
 * @param[in] get_next_char	function to get the next UTF-32 character
 */
static void set_wordbreaks(
		const void *s,
		size_t len,
		const char *lang,
		char *brks,
		get_next_char_t get_next_char)
{
	/* Previous class (after WB4 inheritance). */
	enum WordBreakClass p_cls = WBP_Undefined;
	/* Strong previous class: the class that opened the rule-matching
	 * sequence currently in progress.  WBP_Undefined means start of text. */
	enum WordBreakClass sp_cls = WBP_Undefined;
	utf32_t ch;
	size_t posCur = 0;		/* read cursor advanced by get_next_char */
	size_t posCurSt = 0;	/* start offset of the character held in ch */
	size_t posLast = 0;		/* start of the span not yet passed to
							 * set_brks_to */

	/* FIXME: unused atm. */
	(void) lang;

	/* Init brks: every position starts out as a permitted break. */
	memset(brks, WORDBREAK_BREAK, len);

	ch = get_next_char(s, len, &posCur);

	while (ch != EOS)
	{
		/* Current class */
		enum WordBreakClass c_cls;
		c_cls = get_char_wb_class(ch, wb_prop_default,
				ARRAY_LEN(wb_prop_default));

		switch (c_cls)
		{
		case WBP_CR:
			/* WB3b: always allow a break before CR. */
			set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK,
					get_next_char);
			sp_cls = c_cls;
			posLast = posCurSt;
			break;

		case WBP_LF:
			/* WB3 keeps CR+LF together; any other LF is WB3a,3b and may
			 * break.  The pending span must go through set_brks_to in
			 * both cases before posLast moves past it, as every other
			 * case does; otherwise that span is never written. */
			set_brks_to(s, brks, posLast, posCurSt, len,
					(sp_cls == WBP_CR) ? WORDBREAK_NOBREAK : WORDBREAK_BREAK,
					get_next_char);
			sp_cls = c_cls;
			posLast = posCurSt;
			break;

		case WBP_Newline:
			/* WB3a, WB3b */
			set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK,
					get_next_char);
			sp_cls = c_cls;
			posLast = posCurSt;
			break;

		case WBP_Extend:
		case WBP_Format:
			/* WB4 - If not the first char/after a newline (W3ab),
			 * skip this class, set it to be the same as the prev, and mark
			 * brks not to break before them. */
			if ((sp_cls == WBP_Undefined) || IS_WB3ab(sp_cls))
			{
				set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK,
						get_next_char);
				sp_cls = c_cls;
			}
			else
			{
				/* It's surely not the first, so posCurSt - 1 is valid. */
				brks[posCurSt - 1] = WORDBREAK_NOBREAK;
				/* "inherit" the previous class. */
				c_cls = p_cls;
			}
			break;

		case WBP_Katakana:
			if ((sp_cls == WBP_Katakana) || /* WB13 */
					(sp_cls == WBP_ExtendNumLet)) /* WB13b */
			{
				set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_NOBREAK,
						get_next_char);
			}
			/* No rule found, reset */
			else
			{
				set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK,
						get_next_char);
			}
			sp_cls = c_cls;
			posLast = posCurSt;
			break;

		case WBP_ALetter:
			if ((sp_cls == WBP_ALetter) || /* WB5,6,7 */
					((sp_cls == WBP_Numeric) && (p_cls == WBP_Numeric)) || /* WB10 */
					(sp_cls == WBP_ExtendNumLet)) /* WB13b */
			{
				set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_NOBREAK,
						get_next_char);
			}
			/* No rule found, reset */
			else
			{
				set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK,
						get_next_char);
			}
			sp_cls = c_cls;
			posLast = posCurSt;
			break;

		case WBP_MidNumLet:
			if ((p_cls == WBP_ALetter) || /* WB6,7 */
					(p_cls == WBP_Numeric)) /* WB11,12 */
			{
				/* Go on: keep the span open, the next char decides. */
			}
			else
			{
				set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK,
						get_next_char);
				sp_cls = c_cls;
				posLast = posCurSt;
			}
			break;

		case WBP_MidLetter:
			if (p_cls == WBP_ALetter) /* WB6,7 */
			{
				/* Go on: keep the span open, the next char decides. */
			}
			else
			{
				set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK,
						get_next_char);
				sp_cls = c_cls;
				posLast = posCurSt;
			}
			break;

		case WBP_MidNum:
			if (p_cls == WBP_Numeric) /* WB11,12 */
			{
				/* Go on: keep the span open, the next char decides. */
			}
			else
			{
				set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK,
						get_next_char);
				sp_cls = c_cls;
				posLast = posCurSt;
			}
			break;

		case WBP_Numeric:
			if ((sp_cls == WBP_Numeric) || /* WB8,11,12 */
					((sp_cls == WBP_ALetter) && (p_cls == WBP_ALetter)) || /* WB9 */
					(sp_cls == WBP_ExtendNumLet)) /* WB13b */
			{
				set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_NOBREAK,
						get_next_char);
			}
			/* No rule found, reset */
			else
			{
				set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK,
						get_next_char);
			}
			sp_cls = c_cls;
			posLast = posCurSt;
			break;

		case WBP_ExtendNumLet:
			/* WB13a,13b: only when the previous character itself started
			 * the sequence and belongs to one of the joinable classes. */
			if ((sp_cls == p_cls) &&
				((p_cls == WBP_ALetter) ||
				 (p_cls == WBP_Numeric) ||
				 (p_cls == WBP_Katakana) ||
				 (p_cls == WBP_ExtendNumLet)))
			{
				set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_NOBREAK,
						get_next_char);
			}
			/* No rule found, reset */
			else
			{
				set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK,
						get_next_char);
			}
			sp_cls = c_cls;
			posLast = posCurSt;
			break;

		case WBP_Any:
			/* Allow breaks and reset */
			set_brks_to(s, brks, posLast, posCurSt, len, WORDBREAK_BREAK,
					get_next_char);
			sp_cls = c_cls;
			posLast = posCurSt;
			break;

		default:
			/* Error, should never get here! */
			assert(0);
			break;
		}

		/* Remember this character's (possibly inherited) class and step
		 * to the next character. */
		p_cls = c_cls;
		posCurSt = posCur;
		ch = get_next_char(s, len, &posCur);
	}

	/* WB2: whatever is still pending at the end of text may break. */
	set_brks_to(s, brks, posLast, posCur, len, WORDBREAK_BREAK,
			get_next_char);
}