/**
 * Computes the word breaking information for a generic input string.
 *
 * The input is walked one character at a time; each character is
 * classified with get_char_wb_class() and the pending range of @a brks is
 * filled in through set_brks_to() according to the word boundary rule
 * (WBn) that matches.
 *
 * @param[in]  s              input string
 * @param[in]  len            length of the input
 * @param[in]  lang           language of the input (currently ignored)
 * @param[out] brks           pointer to the output breaking data, containing
 *                            #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
 *                            #WORDBREAK_INSIDEACHAR
 * @param[in]  get_next_char  function to get the next UTF-32 character
 */
static void set_wordbreaks(
        const void *s,
        size_t len,
        const char *lang,
        char *brks,
        get_next_char_t get_next_char)
{
    /* Class of the previous character (after WB4 inheritance, if any). */
    enum WordBreakClass prevClass = WBP_Undefined;
    /* Class that opened the rule sequence currently being matched;
     * WBP_Undefined stands for "start of text".  For a sequence such as
     * Numeric+MidNum+Numeric this stays Numeric throughout. */
    enum WordBreakClass seqClass = WBP_Undefined;
    utf32_t ch;
    size_t nextPos = 0;     /* position handed to get_next_char() */
    size_t curPos = 0;      /* position of the character being examined */
    size_t pendingPos = 0;  /* start of the range not yet passed to set_brks_to() */
    char mark;              /* break value chosen for the pending range */

    /* TODO: Language-specific specialization. */
    (void) lang;

    /* Default every entry to "break allowed". */
    memset(brks, WORDBREAK_BREAK, len);

    for (ch = get_next_char(s, len, &nextPos);
         ch != EOS;
         ch = get_next_char(s, len, &nextPos))
    {
        enum WordBreakClass curClass =
            get_char_wb_class(ch, wb_prop_default, ARRAY_LEN(wb_prop_default));

        switch (curClass)
        {
        case WBP_CR:
        case WBP_LF:
        case WBP_Newline:
            /* WB3: no break inside CR LF.  WB3a,3b: break otherwise. */
            if ((curClass == WBP_LF) && (seqClass == WBP_CR))
            {
                mark = WORDBREAK_NOBREAK;
            }
            else
            {
                mark = WORDBREAK_BREAK;
            }
            set_brks_to(s, brks, pendingPos, curPos, len, mark,
                        get_next_char);
            seqClass = curClass;
            pendingPos = curPos;
            break;

        case WBP_Extend:
        case WBP_Format:
            /* WB4: at the start of text or right after a newline class
             * (WB3a,3b) the character stands on its own; anywhere else it
             * is glued to the previous character and takes over that
             * character's class. */
            if ((seqClass == WBP_Undefined) || IS_WB3ab(seqClass))
            {
                set_brks_to(s, brks, pendingPos, curPos, len,
                            WORDBREAK_BREAK, get_next_char);
                seqClass = curClass;
                break;
            }
            /* Not the first character, so curPos - 1 is a valid index. */
            brks[curPos - 1] = WORDBREAK_NOBREAK;
            curClass = prevClass;
            break;

        case WBP_Katakana:
            /* WB13, WB13b; otherwise no rule matched and we reset. */
            mark = ((seqClass == WBP_Katakana) ||
                    (seqClass == WBP_ExtendNumLet))
                       ? WORDBREAK_NOBREAK
                       : WORDBREAK_BREAK;
            set_brks_to(s, brks, pendingPos, curPos, len, mark,
                        get_next_char);
            seqClass = curClass;
            pendingPos = curPos;
            break;

        case WBP_ALetter:
            /* WB5,6,7 / WB10 / WB13b; otherwise reset. */
            mark = ((seqClass == WBP_ALetter) ||
                    (prevClass == WBP_Numeric) ||
                    (seqClass == WBP_ExtendNumLet))
                       ? WORDBREAK_NOBREAK
                       : WORDBREAK_BREAK;
            set_brks_to(s, brks, pendingPos, curPos, len, mark,
                        get_next_char);
            seqClass = curClass;
            pendingPos = curPos;
            break;

        case WBP_MidNumLet:
            /* After ALetter (WB6,7) or Numeric (WB11,12) the decision is
             * deferred to the next character; otherwise break and reset. */
            if ((prevClass != WBP_ALetter) && (prevClass != WBP_Numeric))
            {
                set_brks_to(s, brks, pendingPos, curPos, len,
                            WORDBREAK_BREAK, get_next_char);
                seqClass = curClass;
                pendingPos = curPos;
            }
            break;

        case WBP_MidLetter:
            /* Deferred after ALetter (WB6,7); otherwise break and reset. */
            if (prevClass != WBP_ALetter)
            {
                set_brks_to(s, brks, pendingPos, curPos, len,
                            WORDBREAK_BREAK, get_next_char);
                seqClass = curClass;
                pendingPos = curPos;
            }
            break;

        case WBP_MidNum:
            /* Deferred after Numeric (WB11,12); otherwise break and reset. */
            if (prevClass != WBP_Numeric)
            {
                set_brks_to(s, brks, pendingPos, curPos, len,
                            WORDBREAK_BREAK, get_next_char);
                seqClass = curClass;
                pendingPos = curPos;
            }
            break;

        case WBP_Numeric:
            /* WB8,11,12 / WB9 / WB13b; otherwise reset. */
            mark = ((seqClass == WBP_Numeric) ||
                    (prevClass == WBP_ALetter) ||
                    (seqClass == WBP_ExtendNumLet))
                       ? WORDBREAK_NOBREAK
                       : WORDBREAK_BREAK;
            set_brks_to(s, brks, pendingPos, curPos, len, mark,
                        get_next_char);
            seqClass = curClass;
            pendingPos = curPos;
            break;

        case WBP_ExtendNumLet:
            /* WB13a,13b: only directly after a sequence-opening ALetter,
             * Numeric, Katakana or ExtendNumLet; otherwise reset. */
            mark = ((seqClass == prevClass) &&
                    ((prevClass == WBP_ALetter) ||
                     (prevClass == WBP_Numeric) ||
                     (prevClass == WBP_Katakana) ||
                     (prevClass == WBP_ExtendNumLet)))
                       ? WORDBREAK_NOBREAK
                       : WORDBREAK_BREAK;
            set_brks_to(s, brks, pendingPos, curPos, len, mark,
                        get_next_char);
            seqClass = curClass;
            pendingPos = curPos;
            break;

        case WBP_Any:
            /* Allow breaks and reset. */
            set_brks_to(s, brks, pendingPos, curPos, len,
                        WORDBREAK_BREAK, get_next_char);
            seqClass = curClass;
            pendingPos = curPos;
            break;

        default:
            /* Error, should never get here! */
            assert(0);
            break;
        }

        prevClass = curClass;
        curPos = nextPos;
    }

    /* WB2: flush whatever is still pending at the end of the text. */
    set_brks_to(s, brks, pendingPos, nextPos, len,
                WORDBREAK_BREAK, get_next_char);
}
/**
 * Computes the word breaking information for a generic input string.
 *
 * The input is walked one character at a time; each character is
 * classified with get_char_wb_class() and the pending range of @a brks is
 * filled in through set_brks_to() according to the word boundary rule
 * (WBn) that matches.
 *
 * @param[in]  s              input string
 * @param[in]  len            length of the input
 * @param[in]  lang           language of the input (reserved for future
 *                            use; currently ignored)
 * @param[out] brks           pointer to the output breaking data, containing
 *                            #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
 *                            #WORDBREAK_INSIDEACHAR
 * @param[in]  get_next_char  function to get the next UTF-32 character
 */
static void set_wordbreaks(
        const void *s,
        size_t len,
        const char *lang,
        char *brks,
        get_next_char_t get_next_char)
{
    /* Regional_Indicator pairing state: 1 after an unpaired RI, 0 once a
     * pair has been completed (and initially). */
    int riRun = 0;
    /* Class of the previous character (after WB4 inheritance, if any). */
    enum WordBreakClass prevClass = WBP_Undefined;
    /* Class that opened the rule sequence currently being matched;
     * WBP_Undefined stands for "start of text".  For a sequence such as
     * Numeric+MidNum+Numeric this stays Numeric throughout. */
    enum WordBreakClass seqClass = WBP_Undefined;
    utf32_t ch;
    size_t nextPos = 0;     /* position handed to get_next_char() */
    size_t curPos = 0;      /* position of the character being examined */
    size_t pendingPos = 0;  /* start of the range not yet passed to set_brks_to() */
    char mark;              /* break value chosen for the pending range */

    /* TODO: Language-specific specialization. */
    (void) lang;

    /* Default every entry to "break allowed". */
    memset(brks, WORDBREAK_BREAK, len);

    for (ch = get_next_char(s, len, &nextPos);
         ch != EOS;
         ch = get_next_char(s, len, &nextPos))
    {
        enum WordBreakClass curClass =
            get_char_wb_class(ch, wb_prop_default, ARRAY_LEN(wb_prop_default));

        switch (curClass)
        {
        case WBP_CR:
        case WBP_LF:
        case WBP_Newline:
            /* WB3: no break inside CR LF.  WB3a,3b: break otherwise. */
            if ((curClass == WBP_LF) && (seqClass == WBP_CR))
            {
                mark = WORDBREAK_NOBREAK;
            }
            else
            {
                mark = WORDBREAK_BREAK;
            }
            set_brks_to(s, brks, pendingPos, curPos, len, mark,
                        get_next_char);
            seqClass = curClass;
            pendingPos = curPos;
            break;

        case WBP_ZWJ:
        case WBP_Extend:
        case WBP_Format:
            /* WB4: at the start of text or right after a newline class
             * (WB3a,3b) the character stands on its own; anywhere else it
             * is glued to the previous character. */
            if ((seqClass == WBP_Undefined) || IS_WB3ab(seqClass))
            {
                set_brks_to(s, brks, pendingPos, curPos, len,
                            WORDBREAK_BREAK, get_next_char);
                seqClass = curClass;
                pendingPos = curPos;
                break;
            }
            /* Not the first character, so curPos - 1 is a valid index. */
            brks[curPos - 1] = WORDBREAK_NOBREAK;
            /* WB3c and WB3d take precedence over WB4, so they tolerate no
             * intervening Extend characters: keep the real class for a
             * ZWJ and inside ZWJ / WSegSpace sequences; in every other
             * case take over the previous character's class. */
            if ((curClass != WBP_ZWJ) &&
                (seqClass != WBP_ZWJ) &&
                (seqClass != WBP_WSegSpace))
            {
                curClass = prevClass;
            }
            break;

        case WBP_Katakana:
            /* WB13, WB13b; otherwise no rule matched and we reset. */
            mark = ((seqClass == WBP_Katakana) ||
                    (seqClass == WBP_ExtendNumLet))
                       ? WORDBREAK_NOBREAK
                       : WORDBREAK_BREAK;
            set_brks_to(s, brks, pendingPos, curPos, len, mark,
                        get_next_char);
            seqClass = curClass;
            pendingPos = curPos;
            break;

        case WBP_Hebrew_Letter:
        case WBP_ALetter:
            if ((seqClass == WBP_Hebrew_Letter) &&
                (prevClass == WBP_Double_Quote))
            {
                /* WB7b,c: Hebrew_Letter Double_Quote only continues into
                 * another Hebrew_Letter. */
                mark = (curClass == WBP_Hebrew_Letter)
                           ? WORDBREAK_NOBREAK
                           : WORDBREAK_BREAK;
            }
            else if ((seqClass == WBP_ALetter) ||
                     (seqClass == WBP_Hebrew_Letter) ||  /* WB5,6,7 */
                     (prevClass == WBP_Numeric) ||       /* WB10 */
                     (seqClass == WBP_ExtendNumLet))     /* WB13b */
            {
                mark = WORDBREAK_NOBREAK;
            }
            else
            {
                /* No rule found, reset. */
                mark = WORDBREAK_BREAK;
            }
            set_brks_to(s, brks, pendingPos, curPos, len, mark,
                        get_next_char);
            seqClass = curClass;
            pendingPos = curPos;
            break;

        case WBP_Single_Quote:
            /* WB7a: never break between Hebrew_Letter and Single_Quote. */
            if (prevClass == WBP_Hebrew_Letter)
            {
                set_brks_to(s, brks, pendingPos, curPos, len,
                            WORDBREAK_NOBREAK, get_next_char);
                seqClass = curClass;
                pendingPos = curPos;
            }
            /* Fall through */

        case WBP_MidNumLet:
            /* After a letter (WB6,7) or Numeric (WB11,12) the decision is
             * deferred to the next character; otherwise break and reset. */
            if ((prevClass != WBP_ALetter) &&
                (prevClass != WBP_Hebrew_Letter) &&
                (prevClass != WBP_Numeric))
            {
                set_brks_to(s, brks, pendingPos, curPos, len,
                            WORDBREAK_BREAK, get_next_char);
                seqClass = curClass;
                pendingPos = curPos;
            }
            break;

        case WBP_MidLetter:
            /* Deferred after a letter (WB6,7); otherwise break and reset. */
            if ((prevClass != WBP_ALetter) &&
                (prevClass != WBP_Hebrew_Letter))
            {
                set_brks_to(s, brks, pendingPos, curPos, len,
                            WORDBREAK_BREAK, get_next_char);
                seqClass = curClass;
                pendingPos = curPos;
            }
            break;

        case WBP_MidNum:
            /* Deferred after Numeric (WB11,12); otherwise break and reset. */
            if (prevClass != WBP_Numeric)
            {
                set_brks_to(s, brks, pendingPos, curPos, len,
                            WORDBREAK_BREAK, get_next_char);
                seqClass = curClass;
                pendingPos = curPos;
            }
            break;

        case WBP_Numeric:
            /* WB8,11,12 / WB9 / WB13b; otherwise reset. */
            mark = ((seqClass == WBP_Numeric) ||
                    (prevClass == WBP_ALetter) ||
                    (prevClass == WBP_Hebrew_Letter) ||
                    (seqClass == WBP_ExtendNumLet))
                       ? WORDBREAK_NOBREAK
                       : WORDBREAK_BREAK;
            set_brks_to(s, brks, pendingPos, curPos, len, mark,
                        get_next_char);
            seqClass = curClass;
            pendingPos = curPos;
            break;

        case WBP_ExtendNumLet:
            /* WB13a,13b: only directly after a sequence-opening letter,
             * Numeric, Katakana or ExtendNumLet; otherwise reset. */
            mark = ((seqClass == prevClass) &&
                    ((prevClass == WBP_ALetter) ||
                     (prevClass == WBP_Hebrew_Letter) ||
                     (prevClass == WBP_Numeric) ||
                     (prevClass == WBP_Katakana) ||
                     (prevClass == WBP_ExtendNumLet)))
                       ? WORDBREAK_NOBREAK
                       : WORDBREAK_BREAK;
            set_brks_to(s, brks, pendingPos, curPos, len, mark,
                        get_next_char);
            seqClass = curClass;
            pendingPos = curPos;
            break;

        case WBP_Regional_Indicator:
            /* WB15,16: regional indicators are joined in pairs. */
            if ((seqClass == WBP_Regional_Indicator) && ((riRun % 2) == 1))
            {
                mark = WORDBREAK_NOBREAK;
                riRun = 0; /* Pair completed, start over. */
            }
            else
            {
                /* No rule found, reset: this RI opens a new pair. */
                mark = WORDBREAK_BREAK;
                riRun = 1;
            }
            set_brks_to(s, brks, pendingPos, curPos, len, mark,
                        get_next_char);
            seqClass = curClass;
            pendingPos = curPos;
            break;

        case WBP_Double_Quote:
            /* Deferred after Hebrew_Letter (WB7b,c); otherwise break and
             * reset. */
            if (prevClass != WBP_Hebrew_Letter)
            {
                set_brks_to(s, brks, pendingPos, curPos, len,
                            WORDBREAK_BREAK, get_next_char);
                seqClass = curClass;
                pendingPos = curPos;
            }
            break;

        case WBP_WSegSpace:
        case WBP_Any:
            /* WB3d: keep runs of WSegSpace together.
             * WB3c: no break between ZWJ and an extended pictographic.
             * Neither of these restarts the sequence class. */
            if (((curClass == WBP_WSegSpace) &&
                 (prevClass == WBP_WSegSpace)) ||
                ((prevClass == WBP_ZWJ) && ub_is_extended_pictographic(ch)))
            {
                set_brks_to(s, brks, pendingPos, curPos, len,
                            WORDBREAK_NOBREAK, get_next_char);
                pendingPos = curPos;
            }
            else
            {
                /* Allow breaks and reset. */
                set_brks_to(s, brks, pendingPos, curPos, len,
                            WORDBREAK_BREAK, get_next_char);
                seqClass = curClass;
                pendingPos = curPos;
            }
            break;

        default:
            /* Error, should never get here! */
            assert(0);
            break;
        }

        prevClass = curClass;
        curPos = nextPos;
    }

    /* WB2: flush whatever is still pending at the end of the text. */
    set_brks_to(s, brks, pendingPos, nextPos, len,
                WORDBREAK_BREAK, get_next_char);
}
/**
 * Sets the word breaking information for a generic input string.
 *
 * The input is walked one character at a time; each character is
 * classified with get_char_wb_class() and the pending range of @a brks is
 * filled in through set_brks_to() according to the word boundary rule
 * (WBn) that matches.
 *
 * @param[in]  s              input string
 * @param[in]  len            length of the input
 * @param[in]  lang           language of the input (currently ignored)
 * @param[out] brks           pointer to the output breaking data, containing
 *                            #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
 *                            #WORDBREAK_INSIDEACHAR
 * @param[in]  get_next_char  function to get the next UTF-32 character
 */
static void set_wordbreaks(
        const void *s,
        size_t len,
        const char *lang,
        char *brks,
        get_next_char_t get_next_char)
{
    /* Previous class (after WB4 inheritance, if any). */
    enum WordBreakClass p_cls = WBP_Undefined;
    /* Strong previous class: the class that opened the rule sequence
     * currently being matched; WBP_Undefined means "start of text". */
    enum WordBreakClass sp_cls = WBP_Undefined;
    utf32_t ch;
    size_t posCur = 0;    /* position handed to get_next_char() */
    size_t posCurSt = 0;  /* start position of the character being examined */
    size_t posLast = 0;   /* start of the range not yet passed to set_brks_to() */

    /* FIXME: unused atm. */
    (void) lang;

    /* Init brks: a break is allowed everywhere until a rule says otherwise. */
    memset(brks, WORDBREAK_BREAK, len);

    ch = get_next_char(s, len, &posCur);

    while (ch != EOS)
    {
        /* Current class */
        enum WordBreakClass c_cls;

        c_cls = get_char_wb_class(ch, wb_prop_default,
                                  ARRAY_LEN(wb_prop_default));

        switch (c_cls)
        {
        case WBP_CR:
            /* WB3a, WB3b */
            set_brks_to(s, brks, posLast, posCurSt, len,
                        WORDBREAK_BREAK, get_next_char);
            sp_cls = c_cls;
            posLast = posCurSt;
            break;

        case WBP_LF:
            if (sp_cls == WBP_CR) /* WB3 */
            {
                set_brks_to(s, brks, posLast, posCurSt, len,
                            WORDBREAK_NOBREAK, get_next_char);
            }
            else /* WB3a, WB3b */
            {
                /* A lone LF must flush the pending range just like CR and
                 * Newline do.  Advancing posLast without this call would
                 * leave the range at its memset() default and skip the
                 * per-byte marking; presumably set_brks_to() is what
                 * writes WORDBREAK_INSIDEACHAR, since nothing in this
                 * function does -- verify against its definition. */
                set_brks_to(s, brks, posLast, posCurSt, len,
                            WORDBREAK_BREAK, get_next_char);
            }
            sp_cls = c_cls;
            posLast = posCurSt;
            break;

        case WBP_Newline:
            /* WB3a, WB3b */
            set_brks_to(s, brks, posLast, posCurSt, len,
                        WORDBREAK_BREAK, get_next_char);
            sp_cls = c_cls;
            posLast = posCurSt;
            break;

        case WBP_Extend:
        case WBP_Format:
            /* WB4 - If not the first char/after a newline (WB3a,3b),
             * skip this class, set it to be the same as the prev, and mark
             * brks not to break before them. */
            if ((sp_cls == WBP_Undefined) || IS_WB3ab(sp_cls))
            {
                set_brks_to(s, brks, posLast, posCurSt, len,
                            WORDBREAK_BREAK, get_next_char);
                sp_cls = c_cls;
            }
            else
            {
                /* It's surely not the first, so posCurSt - 1 is valid. */
                brks[posCurSt - 1] = WORDBREAK_NOBREAK;
                /* "inherit" the previous class. */
                c_cls = p_cls;
            }
            break;

        case WBP_Katakana:
            if ((sp_cls == WBP_Katakana) ||      /* WB13 */
                (sp_cls == WBP_ExtendNumLet))    /* WB13b */
            {
                set_brks_to(s, brks, posLast, posCurSt, len,
                            WORDBREAK_NOBREAK, get_next_char);
            }
            /* No rule found, reset */
            else
            {
                set_brks_to(s, brks, posLast, posCurSt, len,
                            WORDBREAK_BREAK, get_next_char);
            }
            sp_cls = c_cls;
            posLast = posCurSt;
            break;

        case WBP_ALetter:
            if ((sp_cls == WBP_ALetter) ||                            /* WB5,6,7 */
                ((sp_cls == WBP_Numeric) && (p_cls == WBP_Numeric)) || /* WB10 */
                (sp_cls == WBP_ExtendNumLet))                         /* WB13b */
            {
                set_brks_to(s, brks, posLast, posCurSt, len,
                            WORDBREAK_NOBREAK, get_next_char);
            }
            /* No rule found, reset */
            else
            {
                set_brks_to(s, brks, posLast, posCurSt, len,
                            WORDBREAK_BREAK, get_next_char);
            }
            sp_cls = c_cls;
            posLast = posCurSt;
            break;

        case WBP_MidNumLet:
            if ((p_cls == WBP_ALetter) ||   /* WB6,7 */
                (p_cls == WBP_Numeric))     /* WB11,12 */
            {
                /* Go on: the decision is deferred to the next character. */
            }
            else
            {
                set_brks_to(s, brks, posLast, posCurSt, len,
                            WORDBREAK_BREAK, get_next_char);
                sp_cls = c_cls;
                posLast = posCurSt;
            }
            break;

        case WBP_MidLetter:
            if (p_cls == WBP_ALetter) /* WB6,7 */
            {
                /* Go on: the decision is deferred to the next character. */
            }
            else
            {
                set_brks_to(s, brks, posLast, posCurSt, len,
                            WORDBREAK_BREAK, get_next_char);
                sp_cls = c_cls;
                posLast = posCurSt;
            }
            break;

        case WBP_MidNum:
            if (p_cls == WBP_Numeric) /* WB11,12 */
            {
                /* Go on: the decision is deferred to the next character. */
            }
            else
            {
                set_brks_to(s, brks, posLast, posCurSt, len,
                            WORDBREAK_BREAK, get_next_char);
                sp_cls = c_cls;
                posLast = posCurSt;
            }
            break;

        case WBP_Numeric:
            if ((sp_cls == WBP_Numeric) ||                            /* WB8,11,12 */
                ((sp_cls == WBP_ALetter) && (p_cls == WBP_ALetter)) || /* WB9 */
                (sp_cls == WBP_ExtendNumLet))                         /* WB13b */
            {
                set_brks_to(s, brks, posLast, posCurSt, len,
                            WORDBREAK_NOBREAK, get_next_char);
            }
            /* No rule found, reset */
            else
            {
                set_brks_to(s, brks, posLast, posCurSt, len,
                            WORDBREAK_BREAK, get_next_char);
            }
            sp_cls = c_cls;
            posLast = posCurSt;
            break;

        case WBP_ExtendNumLet:
            /* WB13a,13b */
            if ((sp_cls == p_cls) &&
                ((p_cls == WBP_ALetter) ||
                 (p_cls == WBP_Numeric) ||
                 (p_cls == WBP_Katakana) ||
                 (p_cls == WBP_ExtendNumLet)))
            {
                set_brks_to(s, brks, posLast, posCurSt, len,
                            WORDBREAK_NOBREAK, get_next_char);
            }
            /* No rule found, reset */
            else
            {
                set_brks_to(s, brks, posLast, posCurSt, len,
                            WORDBREAK_BREAK, get_next_char);
            }
            sp_cls = c_cls;
            posLast = posCurSt;
            break;

        case WBP_Any:
            /* Allow breaks and reset */
            set_brks_to(s, brks, posLast, posCurSt, len,
                        WORDBREAK_BREAK, get_next_char);
            sp_cls = c_cls;
            posLast = posCurSt;
            break;

        default:
            /* Error, should never get here! */
            assert(0);
            break;
        }

        p_cls = c_cls;
        posCurSt = posCur;
        ch = get_next_char(s, len, &posCur);
    }

    /* WB2: flush whatever is still pending at the end of the text. */
    set_brks_to(s, brks, posLast, posCur, len,
                WORDBREAK_BREAK, get_next_char);
}