Пример #1
0
/**
 * Prepares a line breaking context so that processing can start at the
 * first character of the input.
 *
 * The @a lang pointer is stored in the context as-is (it is not copied),
 * and the language-specific properties are looked up once via
 * get_lb_prop_lang().  The "last" and "new" class slots are reset to
 * #LBP_Undefined, the current class is set to the resolved class of
 * @a ch, and the first-character adjustment is applied.
 *
 * @param[in,out] lbpCtx  pointer to the line breaking context to set up
 * @param[in]     ch      the first character to process
 * @param[in]     lang    language of the input
 * @post                  every field written here holds its start value
 */
void lb_init_break_context(
        struct LineBreakContext *lbpCtx,
        utf32_t ch,
        const char *lang)
{
    /* Nothing has been read before the first character. */
    lbpCtx->lbcNew = LBP_Undefined;
    lbpCtx->lbcLast = LBP_Undefined;

    /* Language data must be in place before the class lookup below. */
    lbpCtx->lang = lang;
    lbpCtx->lbpLang = get_lb_prop_lang(lang);

    /* Classify the first character, then resolve that class for lang. */
    lbpCtx->lbcCur = get_char_lb_class_lang(ch, lbpCtx->lbpLang);
    lbpCtx->lbcCur = resolve_lb_class(lbpCtx->lbcCur, lang);

    treat_first_char(lbpCtx);
}
Пример #2
0
/**
 * Feeds the next code point into a line breaking context and reports
 * the break status that results.
 *
 * The previous "new" class is moved to the "last" slot and the class of
 * @a ch becomes the "new" one.  The simple check
 * (get_lb_result_simple()) is tried first; only when it yields
 * #LINEBREAK_UNDEFINED is the new class resolved and the lookup
 * (get_lb_result_lookup()) consulted.
 *
 * @param[in,out] lbpCtx  pointer to the line breaking context
 * @param[in]     ch      Unicode code point
 * @return                the value produced by the simple check, or by
 *                        the lookup when the simple check is undecided;
 *                        documented upstream as one of
 *                        #LINEBREAK_MUSTBREAK, #LINEBREAK_ALLOWBREAK,
 *                        and #LINEBREAK_NOBREAK
 * @post                  the line breaking context is updated
 */
int lb_process_next_char(
        struct LineBreakContext *lbpCtx,
        utf32_t ch )
{
    int result;

    /* Shift the class history by one character. */
    lbpCtx->lbcLast = lbpCtx->lbcNew;
    lbpCtx->lbcNew = get_char_lb_class_lang(ch, lbpCtx->lbpLang);

    result = get_lb_result_simple(lbpCtx);

    if (result == LINEBREAK_MUSTBREAK)
    {
        /* A new line starts at ch: it becomes the current class and
         * gets the same first-character treatment as at start-up. */
        lbpCtx->lbcCur = resolve_lb_class(lbpCtx->lbcNew, lbpCtx->lang);
        treat_first_char(lbpCtx);
        return result;
    }

    if (result == LINEBREAK_UNDEFINED)
    {
        /* The simple check could not decide; resolve the new class and
         * defer to the lookup. */
        lbpCtx->lbcNew = resolve_lb_class(lbpCtx->lbcNew, lbpCtx->lang);
        return get_lb_result_lookup(lbpCtx);
    }

    /* Any other result is final as it stands. */
    return result;
}
Пример #3
0
/**
 * Sets the line breaking information for a generic input string.
 *
 * The input is read one character at a time through @a get_next_char.
 * For every character, the entry of @a brks at the index of its last
 * code unit receives the break status that applies after that
 * character; every other index passed over is set to
 * #LINEBREAK_INSIDEACHAR.  The entry for the final character is always
 * #LINEBREAK_MUSTBREAK.
 *
 * If the very first call to @a get_next_char returns EOS, the function
 * returns without writing anything to @a brks.
 *
 * @param[in]  s              input string
 * @param[in]  len            length of the input; entries of @a brks
 *                            below this index may be written
 * @param[in]  lang           language of the input
 * @param[out] brks           pointer to the output breaking data,
 *                            containing #LINEBREAK_MUSTBREAK,
 *                            #LINEBREAK_ALLOWBREAK, #LINEBREAK_NOBREAK,
 *                            or #LINEBREAK_INSIDEACHAR
 * @param[in]  get_next_char  function to get the next UTF-32 character;
 *                            it is given @a s, @a len and a pointer to
 *                            the current position, which it updates
 */
void set_linebreaks(
		const void *s,
		size_t len,
		const char *lang,
		char *brks,
		get_next_char_t get_next_char)
{
	utf32_t ch;
	/* Class used as the "before" side (row) of the pair-table lookup */
	enum LineBreakClass lbcCur;
	/* Class of the character read most recently (column of the lookup) */
	enum LineBreakClass lbcNew;
	/* Value lbcNew held before the most recent character was read */
	enum LineBreakClass lbcLast;
	struct LineBreakProperties *lbpLang;
	/* Position handed to get_next_char, which advances it */
	size_t posCur = 0;
	/* Index in brks of the entry currently being decided */
	size_t posLast = 0;

	/* posLast is unsigned, so this wraps to the largest size_t value;
	 * the ++posLast at the head of the main loop wraps it back to 0. */
	--posLast;	/* To be ++'d later */
	ch = get_next_char(s, len, &posCur);
	if (ch == EOS)
		return;
	lbpLang = get_lb_prop_lang(lang);
	lbcCur = resolve_lb_class(get_char_lb_class_lang(ch, lbpLang), lang);
	lbcNew = LBP_Undefined;

/* Entered once at the start and again after every must-break below,
 * with lbcCur holding the resolved class of the line's first character. */
nextline:

	/* Special treatment for the first character of a line:
	 * LF and NL are treated as BK, CB as BA, and SP as WJ. */
	switch (lbcCur)
	{
	case LBP_LF:
	case LBP_NL:
		lbcCur = LBP_BK;
		break;
	case LBP_CB:
		lbcCur = LBP_BA;
		break;
	case LBP_SP:
		lbcCur = LBP_WJ;
		break;
	default:
		break;
	}

	/* Process a line till an explicit break or end of string */
	for (;;)
	{
		/* Advance posLast to posCur - 1; every index stepped over on
		 * the way is marked as lying inside a character. */
		for (++posLast; posLast < posCur - 1; ++posLast)
		{
			brks[posLast] = LINEBREAK_INSIDEACHAR;
		}
		assert(posLast == posCur - 1);
		/* Keep the previous character's class; it is compared against
		 * LBP_SP below to detect a preceding space. */
		lbcLast = lbcNew;
		ch = get_next_char(s, len, &posCur);
		if (ch == EOS)
			break;
		lbcNew = get_char_lb_class_lang(ch, lbpLang);
		/* A pending BK, or a CR that is not followed by LF, forces a
		 * break after the previous character; the character just read
		 * then becomes the first character of the next line. */
		if (lbcCur == LBP_BK || (lbcCur == LBP_CR && lbcNew != LBP_LF))
		{
			brks[posLast] = LINEBREAK_MUSTBREAK;
			lbcCur = resolve_lb_class(lbcNew, lang);
			goto nextline;
		}

		/* Classes decided without the pair table.  Each of these cases
		 * ends in `continue`, so the `lbcCur = lbcNew` at the bottom of
		 * the loop is skipped for them. */
		switch (lbcNew)
		{
		case LBP_SP:
			/* No break before a space; lbcCur is left unchanged, so
			 * the lookup after the space(s) still uses the class that
			 * preceded them. */
			brks[posLast] = LINEBREAK_NOBREAK;
			continue;
		case LBP_BK:
		case LBP_LF:
		case LBP_NL:
			/* No break before the line terminator; setting lbcCur to
			 * BK makes the next iteration take the must-break branch. */
			brks[posLast] = LINEBREAK_NOBREAK;
			lbcCur = LBP_BK;
			continue;
		case LBP_CR:
			/* No break before CR; the next iteration breaks after it
			 * unless the following character is LF. */
			brks[posLast] = LINEBREAK_NOBREAK;
			lbcCur = LBP_CR;
			continue;
		case LBP_CB:
			/* A break is allowed before CB; for the following
			 * character it is treated as BA. */
			brks[posLast] = LINEBREAK_ALLOWBREAK;
			lbcCur = LBP_BA;
			continue;
		default:
			break;
		}

		lbcNew = resolve_lb_class(lbcNew, lang);

		/* baTable is indexed with (class - 1) on both axes; the asserts
		 * check the upper bound only. */
		assert(lbcCur <= LBP_JT);
		assert(lbcNew <= LBP_JT);
		switch (baTable[lbcCur - 1][lbcNew - 1])
		{
		case DIR_BRK:
			/* Break allowed unconditionally */
			brks[posLast] = LINEBREAK_ALLOWBREAK;
			break;
		case CMI_BRK:
		case IND_BRK:
			/* Break allowed only when the previous character was a
			 * space */
			if (lbcLast == LBP_SP)
			{
				brks[posLast] = LINEBREAK_ALLOWBREAK;
			}
			else
			{
				brks[posLast] = LINEBREAK_NOBREAK;
			}
			break;
		case CMP_BRK:
			/* No break; unless the previous character was a space,
			 * lbcCur is also kept as is (the `continue` skips the
			 * update at the bottom of the loop). */
			brks[posLast] = LINEBREAK_NOBREAK;
			if (lbcLast != LBP_SP)
				continue;
			break;
		case PRH_BRK:
			/* No break */
			brks[posLast] = LINEBREAK_NOBREAK;
			break;
		}

		/* The character just processed becomes the "before" side of
		 * the next lookup. */
		lbcCur = lbcNew;
	}

	assert(posLast == posCur - 1 && posCur <= len);
	/* Break after the last character */
	brks[posLast] = LINEBREAK_MUSTBREAK;
	/* When the input contains incomplete sequences: any positions left
	 * between posCur and len are marked as lying inside a character. */
	while (posCur < len)
	{
		brks[posCur++] = LINEBREAK_INSIDEACHAR;
	}
}