static int u_iscclass(PARROT_INTERP, UINTVAL codepoint, INTVAL flags) { ASSERT_ARGS(u_iscclass) #if PARROT_HAS_ICU UNUSED(interp); /* XXX which one return u_charDigitValue(codepoint); */ if ((flags & enum_cclass_uppercase) && u_isupper(codepoint)) return 1; if ((flags & enum_cclass_lowercase) && u_islower(codepoint)) return 1; if ((flags & enum_cclass_alphabetic) && u_isalpha(codepoint)) return 1; if ((flags & enum_cclass_numeric) && u_isdigit(codepoint)) return 1; if ((flags & enum_cclass_hexadecimal) && u_isxdigit(codepoint)) return 1; if ((flags & enum_cclass_whitespace) && u_isspace(codepoint)) return 1; if ((flags & enum_cclass_printing) && u_isprint(codepoint)) return 1; if ((flags & enum_cclass_graphical) && u_isgraph(codepoint)) return 1; if ((flags & enum_cclass_blank) && u_isblank(codepoint)) return 1; if ((flags & enum_cclass_control) && u_iscntrl(codepoint)) return 1; if ((flags & enum_cclass_alphanumeric) && u_isalnum(codepoint)) return 1; if ((flags & enum_cclass_word) && (u_isalnum(codepoint) || codepoint == '_')) return 1; if ((flags & enum_cclass_newline) && (codepoint == 0x2028 || codepoint == 0x2029 || u_hasBinaryProperty(codepoint, UCHAR_LINE_BREAK))) return 1; return 0; #else if (codepoint < 256) return (Parrot_iso_8859_1_typetable[codepoint] & flags) ? 1 : 0; if (flags == enum_cclass_any) return 1; /* All codepoints from u+0100 to u+02af are alphabetic, so we * cheat on the WORD and ALPHABETIC properties to include these * (and incorrectly exclude all others). This is a stopgap until * ICU is everywhere, or we have better non-ICU unicode support. */ if (flags == enum_cclass_word || flags == enum_cclass_alphabetic) return (codepoint < 0x2b0); if (flags & enum_cclass_whitespace) { /* from http://www.unicode.org/Public/UNIDATA/PropList.txt */ switch (codepoint) { case 0x1680: case 0x180e: case 0x2000: case 0x2001: case 0x2002: case 0x2003: case 0x2004: case 0x2005: case 0x2006: case 0x2007: case 0x2008: case 0x2009: case 0x200a: case 0x2028: case 0x2029: case 0x202f: case 0x205f: case 0x3000: return 1; default: break; } } if (flags & enum_cclass_numeric) { /* from http://www.unicode.org/Public/UNIDATA/UnicodeData.txt */ if (codepoint >= 0x0660 && codepoint <= 0x0669) return 1; if (codepoint >= 0x06f0 && codepoint <= 0x06f9) return 1; if (codepoint >= 0x07c0 && codepoint <= 0x07c9) return 1; if (codepoint >= 0x0966 && codepoint <= 0x096f) return 1; if (codepoint >= 0x09e6 && codepoint <= 0x09ef) return 1; if (codepoint >= 0x0a66 && codepoint <= 0x0a6f) return 1; if (codepoint >= 0x0ae6 && codepoint <= 0x0aef) return 1; if (codepoint >= 0x0b66 && codepoint <= 0x0b6f) return 1; if (codepoint >= 0x0be6 && codepoint <= 0x0bef) return 1; if (codepoint >= 0x0c66 && codepoint <= 0x0c6f) return 1; if (codepoint >= 0x0ce6 && codepoint <= 0x0cef) return 1; if (codepoint >= 0x0d66 && codepoint <= 0x0d6f) return 1; if (codepoint >= 0x0e50 && codepoint <= 0x0e59) return 1; if (codepoint >= 0x0ed0 && codepoint <= 0x0ed9) return 1; if (codepoint >= 0x0f20 && codepoint <= 0x0f29) return 1; if (codepoint >= 0x1040 && codepoint <= 0x1049) return 1; if (codepoint >= 0x17e0 && codepoint <= 0x17e9) return 1; if (codepoint >= 0x1810 && codepoint <= 0x1819) return 1; if (codepoint >= 0x1946 && codepoint <= 0x194f) return 1; if (codepoint >= 0x19d0 && codepoint <= 0x19d9) return 1; if (codepoint >= 0x1b50 && codepoint <= 0x1b59) return 1; if (codepoint >= 0xff10 && codepoint <= 0xff19) return 1; } if (flags & enum_cclass_newline) { /* from http://www.unicode.org/Public/UNIDATA/extracted/DerivedLineBreak.txt * Line_Break=Mandatory_Break*/ if (codepoint == 0x2028 || codepoint == 0x2029) return 1; } if (flags & ~(enum_cclass_whitespace | enum_cclass_numeric | enum_cclass_newline)) Parrot_ex_throw_from_c_noargs(interp, EXCEPTION_LIBRARY_ERROR, "no ICU lib loaded"); return 0; #endif }
/* * imp: common/uchar.c * hdr: common/unicode/uchar.h * @stable ICU 2.0 */ U_CAPI UBool U_EXPORT2 u_isprint_4_0(UChar32 c) { return u_isprint(c); }
// Determines whether the specified code point is a printable character. // True for general categories other than "C" (controls). bool BUnicodeChar::IsPrintable(uint32 c) { BUnicodeChar(); return u_isprint(c); }
void get_unicode_info(const char* text, const icu::UnicodeString& us, Sqlite::Statement& insert) { bool allokay = true; for (const char* t = text; *t; ++t) { if (!(std::isalnum(*t) || *t == '_' || *t == ':' || *t == ' ' || *t == '.' || *t == '-')) { allokay = false; break; } } if (allokay) { return; } bool unusual = false; for (icu::StringCharacterIterator it(us); it.hasNext(); it.next()) { UChar32 codepoint = it.current32(); int8_t chartype = u_charType(codepoint); if (! u_isprint(codepoint)) { unusual = true; break; } if (u_charDirection(codepoint) != 0) { unusual = true; break; } if (chartype != 1 && // UPPERCASE_LETTER chartype != 2 && // LOWERCASE_LETTER chartype != 9 && // DECIMAL_DIGIT_NUMBER chartype != 12 && // SPACE_SEPARATOR chartype != 19 && // DASH_PUNCTUATION chartype != 22 && // CONNECTOR_PUNCTUATION chartype != 23) { // OTHER_PUNCTUATION unusual = true; break; } } if (unusual) { int num = 0; for (icu::StringCharacterIterator it(us); it.hasNext(); it.next(), ++num) { UChar32 codepoint = it.current32(); int8_t chartype = u_charType(codepoint); char buffer[100]; UErrorCode errorCode = U_ZERO_ERROR; u_charName(codepoint, U_UNICODE_CHAR_NAME, buffer, sizeof(buffer), &errorCode); UCharDirection direction = u_charDirection(codepoint); int32_t block = u_getIntPropertyValue(codepoint, UCHAR_BLOCK); icu::UnicodeString ustr(codepoint); std::string str; ustr.toUTF8String(str); char uplus[10]; snprintf(uplus, 10, "U+%04x", codepoint); insert. bind_text(text). bind_int(num). bind_text(str.c_str()). bind_text(uplus). bind_int(block). bind_text(category_to_string(chartype)). bind_int(direction). bind_text(buffer). execute(); } } }