void CountWords(const UnicodeString& ustr, size_t& cnt, size_t& ctrl_cnt, size_t& sp_cnt) { UErrorCode status = U_ZERO_ERROR; boost::scoped_ptr<BreakIterator> bi ( BreakIterator::createWordInstance(Locale::getDefault(), status) ); bi->setText(ustr); int32_t i = bi->first(); while (i < ustr.length()) { ++cnt; UChar32 ch = ustr.char32At(i); if (u_iscntrl(ch)) ++ctrl_cnt; else if(u_isspace(ch)) ++sp_cnt; i = bi->next(); } }
/*
 * Tests whether `codepoint` belongs to at least one of the character
 * classes whose enum_cclass_* bits are set in `flags`.
 *
 * Returns 1 on a match and 0 otherwise.
 *
 * With ICU (PARROT_HAS_ICU) every requested class is answered by the
 * matching ICU predicate.
 *
 * Without ICU, codepoints below 256 are looked up in
 * Parrot_iso_8859_1_typetable.  Higher codepoints are only handled for
 * enum_cclass_any, for an approximation of word/alphabetic, and for
 * hard-coded whitespace, numeric and newline tables.  If nothing matched
 * and `flags` contains any class other than whitespace, numeric or
 * newline, EXCEPTION_LIBRARY_ERROR ("no ICU lib loaded") is raised.
 */
static int
u_iscclass(PARROT_INTERP, UINTVAL codepoint, INTVAL flags)
{
    ASSERT_ARGS(u_iscclass)
#if PARROT_HAS_ICU
    UNUSED(interp);

    /* XXX which one
       return u_charDigitValue(codepoint);
     */
    if ((flags & enum_cclass_uppercase)    && u_isupper(codepoint))  return 1;
    if ((flags & enum_cclass_lowercase)    && u_islower(codepoint))  return 1;
    if ((flags & enum_cclass_alphabetic)   && u_isalpha(codepoint))  return 1;
    if ((flags & enum_cclass_numeric)      && u_isdigit(codepoint))  return 1;
    if ((flags & enum_cclass_hexadecimal)  && u_isxdigit(codepoint)) return 1;
    if ((flags & enum_cclass_whitespace)   && u_isspace(codepoint))  return 1;
    if ((flags & enum_cclass_printing)     && u_isprint(codepoint))  return 1;
    if ((flags & enum_cclass_graphical)    && u_isgraph(codepoint))  return 1;
    if ((flags & enum_cclass_blank)        && u_isblank(codepoint))  return 1;
    if ((flags & enum_cclass_control)      && u_iscntrl(codepoint))  return 1;
    if ((flags & enum_cclass_alphanumeric) && u_isalnum(codepoint))  return 1;
    if ((flags & enum_cclass_word)         &&
        (u_isalnum(codepoint) || codepoint == '_'))                  return 1;

    /* Line_Break is an enumerated property, so it has to be read with
     * u_getIntPropertyValue(); u_hasBinaryProperty() only answers binary
     * properties and yields false for anything else.  Mandatory_Break is
     * the same class the non-ICU branch below hard-codes. */
    if ((flags & enum_cclass_newline)      &&
        (codepoint == 0x2028 || codepoint == 0x2029 ||
         u_getIntPropertyValue(codepoint, UCHAR_LINE_BREAK)
            == U_LB_MANDATORY_BREAK))                                return 1;

    return 0;
#else
    /* Latin-1 range: answered directly from the static type table. */
    if (codepoint < 256)
        return (Parrot_iso_8859_1_typetable[codepoint] & flags) ? 1 : 0;

    if (flags == enum_cclass_any)
        return 1;

    /* All codepoints from u+0100 to u+02af are alphabetic, so we
     * cheat on the WORD and ALPHABETIC properties to include these
     * (and incorrectly exclude all others).  This is a stopgap until
     * ICU is everywhere, or we have better non-ICU unicode support.
     */
    if (flags == enum_cclass_word || flags == enum_cclass_alphabetic)
        return (codepoint < 0x2b0);

    if (flags & enum_cclass_whitespace) {
        /* from http://www.unicode.org/Public/UNIDATA/PropList.txt */
        switch (codepoint) {
          case 0x1680: case 0x180e: case 0x2000: case 0x2001:
          case 0x2002: case 0x2003: case 0x2004: case 0x2005:
          case 0x2006: case 0x2007: case 0x2008: case 0x2009:
          case 0x200a: case 0x2028: case 0x2029: case 0x202f:
          case 0x205f: case 0x3000:
            return 1;
          default:
            break;
        }
    }

    if (flags & enum_cclass_numeric) {
        /* from http://www.unicode.org/Public/UNIDATA/UnicodeData.txt */
        if (codepoint >= 0x0660 && codepoint <= 0x0669) return 1;
        if (codepoint >= 0x06f0 && codepoint <= 0x06f9) return 1;
        if (codepoint >= 0x07c0 && codepoint <= 0x07c9) return 1;
        if (codepoint >= 0x0966 && codepoint <= 0x096f) return 1;
        if (codepoint >= 0x09e6 && codepoint <= 0x09ef) return 1;
        if (codepoint >= 0x0a66 && codepoint <= 0x0a6f) return 1;
        if (codepoint >= 0x0ae6 && codepoint <= 0x0aef) return 1;
        if (codepoint >= 0x0b66 && codepoint <= 0x0b6f) return 1;
        if (codepoint >= 0x0be6 && codepoint <= 0x0bef) return 1;
        if (codepoint >= 0x0c66 && codepoint <= 0x0c6f) return 1;
        if (codepoint >= 0x0ce6 && codepoint <= 0x0cef) return 1;
        if (codepoint >= 0x0d66 && codepoint <= 0x0d6f) return 1;
        if (codepoint >= 0x0e50 && codepoint <= 0x0e59) return 1;
        if (codepoint >= 0x0ed0 && codepoint <= 0x0ed9) return 1;
        if (codepoint >= 0x0f20 && codepoint <= 0x0f29) return 1;
        if (codepoint >= 0x1040 && codepoint <= 0x1049) return 1;
        if (codepoint >= 0x17e0 && codepoint <= 0x17e9) return 1;
        if (codepoint >= 0x1810 && codepoint <= 0x1819) return 1;
        if (codepoint >= 0x1946 && codepoint <= 0x194f) return 1;
        if (codepoint >= 0x19d0 && codepoint <= 0x19d9) return 1;
        if (codepoint >= 0x1b50 && codepoint <= 0x1b59) return 1;
        if (codepoint >= 0xff10 && codepoint <= 0xff19) return 1;
    }

    if (flags & enum_cclass_newline) {
        /* from http://www.unicode.org/Public/UNIDATA/extracted/DerivedLineBreak.txt
         * Line_Break=Mandatory_Break */
        if (codepoint == 0x2028 || codepoint == 0x2029)
            return 1;
    }

    /* Nothing matched: any class outside the three hard-coded tables above
     * cannot be answered without ICU, so report that instead of silently
     * claiming "no match". */
    if (flags & ~(enum_cclass_whitespace | enum_cclass_numeric | enum_cclass_newline))
        Parrot_ex_throw_from_c_noargs(interp, EXCEPTION_LIBRARY_ERROR,
            "no ICU lib loaded");

    return 0;
#endif
}
// Determines whether the specified code point is a control character // (as defined by this function). // A control character is one of the following: // - ISO 8-bit control character (U+0000..U+001f and U+007f..U+009f) // - U_CONTROL_CHAR (Cc) // - U_FORMAT_CHAR (Cf) // - U_LINE_SEPARATOR (Zl) // - U_PARAGRAPH_SEPARATOR (Zp) bool BUnicodeChar::IsControl(uint32 c) { BUnicodeChar(); return u_iscntrl(c); }