/**
 * gucharmap_unichar_isgraph:
 * @uc: a Unicode character
 *
 * Determines whether a character is printable and not a space.
 * Returns %FALSE for control characters, unassigned code points,
 * private use characters, surrogates and space separators.  Format
 * characters also yield %FALSE, except for the
 * Prepended_Concatenation_Mark characters, which are meant to be
 * rendered with a visible glyph and therefore yield %TRUE.
 * g_unichar_isprint() is similar, but returns %TRUE for spaces.
 * Given some UTF-8 text, obtain a character value with
 * g_utf8_get_char().
 *
 * Return value: %TRUE if @uc is printable unless it's a space
 **/
gboolean
gucharmap_unichar_isgraph (gunichar uc)
{
  GUnicodeType t = gucharmap_unichar_type (uc);

  /* From http://www.unicode.org/versions/Unicode9.0.0/ch09.pdf, p16
   * "Unlike most other format control characters, however, they should be
   *  rendered with a visible glyph, even in circumstances where no suitable
   *  digit or sequence of digits follows them in logical order."
   * There the standard talks about the Arabic signs spanning numbers, but
   * I think this should apply to all Prepended_Concatenation_Mark format
   * characters.
   * Instead of parsing the corresponding data file, just hardcode the
   * (few!) existing characters here.  The list follows the
   * Prepended_Concatenation_Mark entries of Unicode's PropList.txt.
   * Code points the bundled character data does not know as format
   * characters never reach this branch, so listing newer additions
   * is harmless with older data.
   */
  if (t == G_UNICODE_FORMAT)
    return (uc >= 0x0600 && uc <= 0x0605) ||  /* ARABIC NUMBER SIGN .. ARABIC NUMBER MARK ABOVE */
           uc == 0x06DD ||                    /* ARABIC END OF AYAH */
           uc == 0x070F ||                    /* SYRIAC ABBREVIATION MARK */
           (uc >= 0x0890 && uc <= 0x0891) ||  /* ARABIC POUND MARK ABOVE, ARABIC PIASTRE MARK ABOVE */
           uc == 0x08E2 ||                    /* ARABIC DISPUTED END OF AYAH */
           uc == 0x110BD ||                   /* KAITHI NUMBER SIGN */
           uc == 0x110CD;                     /* KAITHI NUMBER SIGN ABOVE */

  return (t != G_UNICODE_CONTROL
          && t != G_UNICODE_UNASSIGNED
          && t != G_UNICODE_PRIVATE_USE
          && t != G_UNICODE_SURROGATE
          && t != G_UNICODE_SPACE_SEPARATOR);
}
/**
 * gucharmap_unichar_isgraph:
 * @uc: a Unicode character
 *
 * Tells whether @uc is a printable character other than a space.
 * Control characters, format characters, unassigned code points,
 * private use characters, surrogates and space separators all
 * yield %FALSE.  g_unichar_isprint() is similar, but returns %TRUE
 * for spaces.  To get a character value out of UTF-8 text, use
 * g_utf8_get_char().
 *
 * Return value: %TRUE if @uc is printable unless it's a space
 **/
gboolean
gucharmap_unichar_isgraph (gunichar uc)
{
  const GUnicodeType category = gucharmap_unichar_type (uc);

  /* Every general category that has no visible rendering of its own. */
  const gboolean invisible = (category == G_UNICODE_CONTROL
                              || category == G_UNICODE_FORMAT
                              || category == G_UNICODE_UNASSIGNED
                              || category == G_UNICODE_PRIVATE_USE
                              || category == G_UNICODE_SURROGATE
                              || category == G_UNICODE_SPACE_SEPARATOR);

  return !invisible;
}
/**
 * gucharmap_unichar_to_printable_utf8:
 * @uc: a unicode character
 * @outbuf: output buffer, must have at least 10 bytes of space.
 *       If %NULL, the length will be computed and returned
 *       and nothing will be written to @outbuf.
 *
 * Converts a single character to UTF-8 suitable for rendering.
 * Spacing combining, enclosing and non-spacing marks are prefixed with
 * U+0020 SPACE followed by U+200D ZERO WIDTH JOINER so that they are
 * attached to a dummy base character; every other printable character
 * is encoded as is.  Nothing is produced for characters that fail
 * gucharmap_unichar_validate(), nor for characters that are neither
 * graphic (see gucharmap_unichar_isgraph()) nor private use.
 *
 * Return value: number of bytes written (or, if @outbuf is %NULL, the
 *   number of bytes that would be written); 0 if nothing is produced
 **/
gint
gucharmap_unichar_to_printable_utf8 (gunichar uc,
                                     gchar   *outbuf)
{
  /* Bytes taken by the SPACE + ZERO WIDTH JOINER prefix. */
  enum { MARK_PREFIX_LEN = 4 };

  GUnicodeType type;

  /* Unicode Standard 3.2, section 2.6, "By convention, diacritical marks
   * used by the Unicode Standard may be exhibited in (apparent) isolation
   * by applying them to U+0020 SPACE or to U+00A0 NO BREAK SPACE." */

  /* 17:10 < owen> noah: I'm *not* claiming that what Pango does currently
   *               is right, but convention isn't a requirement. I think
   *               it's probably better to do the Uniscribe thing and put
   *               the lone combining mark on a dummy character and require
   *               ZWJ
   * 17:11 < noah> owen: do you mean that i should put a ZWJ in there, or
   *               that pango will do that?
   * 17:11 < owen> noah: I mean, you should (assuming some future more
   *               capable version of Pango) put it in there
   */

  if (! gucharmap_unichar_validate (uc))
    return 0;

  /* Look the category up once; it is needed by every remaining test. */
  type = gucharmap_unichar_type (uc);

  if (! gucharmap_unichar_isgraph (uc) && type != G_UNICODE_PRIVATE_USE)
    return 0;

  if (type == G_UNICODE_COMBINING_MARK
      || type == G_UNICODE_ENCLOSING_MARK
      || type == G_UNICODE_NON_SPACING_MARK)
    {
      gchar *dest = outbuf;

      /* Only touch the buffer when the caller supplied one.  With a NULL
       * @outbuf, dest stays NULL and g_unichar_to_utf8() merely reports
       * the encoded length. */
      if (dest)
        {
          dest[0] = ' ';
          dest[1] = '\xe2'; /* ZERO */
          dest[2] = '\x80'; /* WIDTH */
          dest[3] = '\x8d'; /* JOINER (0x200D) */
          dest += MARK_PREFIX_LEN;
        }

      return MARK_PREFIX_LEN + g_unichar_to_utf8 (uc, dest);
    }

  return g_unichar_to_utf8 (uc, outbuf);
}
/*
 * gucharmap_get_unicode_category_name:
 * @wc: a Unicode character
 *
 * Maps the type reported by gucharmap_unichar_type() for @wc to a
 * human-readable category label.  Each label is passed through the
 * _() macro after _gucharmap_intl_ensure_initialized() has been
 * called.
 *
 * Return value: the category label, or the empty string when the
 *   type is none of the values handled below
 */
const gchar *
gucharmap_get_unicode_category_name (gunichar wc)
{
  const gchar *label;

  _gucharmap_intl_ensure_initialized ();

  switch (gucharmap_unichar_type (wc))
    {
    /* Other */
    case G_UNICODE_CONTROL:
      label = _("Other, Control");
      break;
    case G_UNICODE_FORMAT:
      label = _("Other, Format");
      break;
    case G_UNICODE_UNASSIGNED:
      label = _("Other, Not Assigned");
      break;
    case G_UNICODE_PRIVATE_USE:
      label = _("Other, Private Use");
      break;
    case G_UNICODE_SURROGATE:
      label = _("Other, Surrogate");
      break;

    /* Letters */
    case G_UNICODE_LOWERCASE_LETTER:
      label = _("Letter, Lowercase");
      break;
    case G_UNICODE_MODIFIER_LETTER:
      label = _("Letter, Modifier");
      break;
    case G_UNICODE_OTHER_LETTER:
      label = _("Letter, Other");
      break;
    case G_UNICODE_TITLECASE_LETTER:
      label = _("Letter, Titlecase");
      break;
    case G_UNICODE_UPPERCASE_LETTER:
      label = _("Letter, Uppercase");
      break;

    /* Marks */
    case G_UNICODE_COMBINING_MARK:
      label = _("Mark, Spacing Combining");
      break;
    case G_UNICODE_ENCLOSING_MARK:
      label = _("Mark, Enclosing");
      break;
    case G_UNICODE_NON_SPACING_MARK:
      label = _("Mark, Non-Spacing");
      break;

    /* Numbers */
    case G_UNICODE_DECIMAL_NUMBER:
      label = _("Number, Decimal Digit");
      break;
    case G_UNICODE_LETTER_NUMBER:
      label = _("Number, Letter");
      break;
    case G_UNICODE_OTHER_NUMBER:
      label = _("Number, Other");
      break;

    /* Punctuation */
    case G_UNICODE_CONNECT_PUNCTUATION:
      label = _("Punctuation, Connector");
      break;
    case G_UNICODE_DASH_PUNCTUATION:
      label = _("Punctuation, Dash");
      break;
    case G_UNICODE_CLOSE_PUNCTUATION:
      label = _("Punctuation, Close");
      break;
    case G_UNICODE_FINAL_PUNCTUATION:
      label = _("Punctuation, Final Quote");
      break;
    case G_UNICODE_INITIAL_PUNCTUATION:
      label = _("Punctuation, Initial Quote");
      break;
    case G_UNICODE_OTHER_PUNCTUATION:
      label = _("Punctuation, Other");
      break;
    case G_UNICODE_OPEN_PUNCTUATION:
      label = _("Punctuation, Open");
      break;

    /* Symbols */
    case G_UNICODE_CURRENCY_SYMBOL:
      label = _("Symbol, Currency");
      break;
    case G_UNICODE_MODIFIER_SYMBOL:
      label = _("Symbol, Modifier");
      break;
    case G_UNICODE_MATH_SYMBOL:
      label = _("Symbol, Math");
      break;
    case G_UNICODE_OTHER_SYMBOL:
      label = _("Symbol, Other");
      break;

    /* Separators */
    case G_UNICODE_LINE_SEPARATOR:
      label = _("Separator, Line");
      break;
    case G_UNICODE_PARAGRAPH_SEPARATOR:
      label = _("Separator, Paragraph");
      break;
    case G_UNICODE_SPACE_SEPARATOR:
      label = _("Separator, Space");
      break;

    /* Any type not listed above gets the plain empty string. */
    default:
      label = "";
      break;
    }

  return label;
}
/**
 * gucharmap_unichar_isdefined:
 * @uc: a Unicode character
 *
 * Checks whether @uc is an assigned character, i.e. whether
 * gucharmap_unichar_type() reports anything other than
 * %G_UNICODE_UNASSIGNED for it.
 *
 * Return value: %TRUE if the character has an assigned value
 **/
gboolean
gucharmap_unichar_isdefined (gunichar uc)
{
  const GUnicodeType category = gucharmap_unichar_type (uc);

  return !(category == G_UNICODE_UNASSIGNED);
}