/*
 * Ruby binding: turn an integer code point into a Ruby string holding the
 * bytes that utf8proc_encode_char() writes for it.
 *
 * self        - receiver (not used here).
 * code_param  - Ruby numeric, converted with NUM2INT.
 *
 * Raises ArgumentError when utf8proc_codepoint_valid() rejects the value.
 */
static VALUE utf8proc_ruby_char(VALUE self, VALUE code_param)
{
  char encoded[4];
  ssize_t encoded_len;
  int codepoint = NUM2INT(code_param);

  if (!utf8proc_codepoint_valid(codepoint)) {
    rb_raise(rb_eArgError, "Invalid Unicode code point");
  }

  encoded_len = utf8proc_encode_char(codepoint, encoded);
  return rb_str_new(encoded, encoded_len);
}
/*
 * wctomb()-shaped wrapper around utf8proc_encode_char().
 *
 * s  - destination buffer, or NULL.
 * wc - wide character to encode.
 *
 * Returns 0 when s is NULL, -1 when utf8proc_codepoint_valid() rejects wc,
 * otherwise whatever utf8proc_encode_char() returns for (wc, s).
 */
int utf8proc_wctomb(char *s, wchar_t wc)
{
	if (s != NULL) {
		if (utf8proc_codepoint_valid(wc)) {
			return (utf8proc_encode_char(wc, s));
		}
		return (-1);
	}
	return (0);
}
/*
 * For every code point from 0 through 127, encode it with
 * utf8proc_encode_char() and assert that exactly one byte was reported and
 * that the byte equals the code point itself.
 */
static void test_utf8proc_encode_char ( void )
{
  uint8_t utf8_str[4];
  int32_t unicode_char = 0;
  ssize_t bytes_written;

  while ( unicode_char < 128 ) {
    bytes_written = utf8proc_encode_char ( unicode_char, utf8_str );
    rtems_test_assert ( bytes_written == 1 );
    rtems_test_assert ( utf8_str[0] == (uint8_t)unicode_char );
    ++unicode_char;
  }
}
// Return a copy of a_str with XML character references decoded.
//
// Numeric references ("&#NNN;" decimal, "&#xHHH;" hexadecimal) are replaced
// by the bytes utf8proc_encode_char() produces for the parsed value. A
// reference is only decoded when its terminating ';' is actually present;
// otherwise the text is left untouched. Afterwards the five predefined named
// entities are replaced.
//
// a_str - input text; not modified.
typestr TMarcRecord::unescape_xml(const typestr & a_str)
{
    typestr new_str = a_str;

    // Convert numeric entities
    char* p = new_str.str();
    while ((p = strchr(p, '&')) != NULL)
    {
        const char* digits = p + 1;
        if (*digits == '#')
        {
            ++digits;
            // "%n" sits after the literal ';' so it is only stored when the
            // terminator really matched.
            const char* mask = "%u;%n";
            if (*digits == 'x')
            {
                ++digits;
                mask = "%x;%n";
            }

            unsigned int value = 0;
            int consumed = -1;
            if (sscanf(digits, mask, &value, &consumed) == 1 && consumed > 0)
            {
                // [p, entity_end) is the full reference text including ';'.
                const char* entity_end = digits + consumed;
                typestr source;
                for (const char* q = p; q != entity_end; ++q)
                {
                    source.append_char(*q);
                }

                char utf8[10];
                int chars = utf8proc_encode_char((int32_t) value, (uint8_t *)utf8);
                utf8[chars] = '\0';
                // NOTE(review): p is reused after replace(), which assumes the
                // replacement (never longer than the reference) does not move
                // the underlying buffer -- confirm against typestr.
                new_str.replace(source.cstr(), utf8, p - new_str.str());
                p += chars;
                continue;
            }
        }
        ++p;
    }

    // Named entities. "&amp;" goes last so that text such as "&amp;lt;"
    // ends up as the literal "&lt;" instead of being decoded twice.
    new_str.replace("&lt;", "<");
    new_str.replace("&gt;", ">");
    new_str.replace("&apos;", "'");
    new_str.replace("&quot;", "\"");
    new_str.replace("&amp;", "&");

    return new_str;
}
string_tree_t *regex_string_tree(char *regex, size_t len) { uint8_t *char_ptr = (uint8_t *)regex; bool in_set = false; bool in_brackets = false; int32_t codepoint; int32_t last_codepoint = 0; ssize_t char_len; size_t bracket_start; size_t bracket_len; char temp_char[MAX_UTF8_CHAR_SIZE]; ssize_t temp_char_len; string_tree_t *tree = string_tree_new(); if (len == 0) { // Single token with zero-length string_tree_add_string_len(tree, regex, len); string_tree_finalize_token(tree); return tree; } uint32_array *char_set = uint32_array_new(); size_t idx = 0; int i, j; bool add_to_index = false; while (idx < len) { char_len = utf8proc_iterate(char_ptr, len, &codepoint); if (char_len <= 0) { uint32_array_destroy(char_set); string_tree_destroy(tree); return NULL; } if (!(utf8proc_codepoint_valid(codepoint))) { idx += char_len; char_ptr += char_len; continue; } add_to_index = true; if (codepoint == LSQUARE_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) { log_debug("begin set\n"); in_set = true; codepoint = BEGIN_SET_CODEPOINT; uint32_array_clear(char_set); } else if (codepoint == RSQUARE_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT && in_set) { log_debug("end set"); for (j = 0; j < char_set->n; j++) { temp_char_len = utf8proc_encode_char(char_set->a[j], (uint8_t *)temp_char); log_debug("Adding string %.*s\n", (int)temp_char_len, temp_char); string_tree_add_string_len(tree, temp_char, temp_char_len); } string_tree_finalize_token(tree); uint32_array_clear(char_set); // Add a special codepoint to the sequence to distinguish from an escaped square bracket codepoint = END_SET_CODEPOINT; in_set = false; } else if (codepoint == LCURLY_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) { in_brackets = true; bracket_start = idx + char_len; bracket_len = 0; add_to_index = false; } else if (codepoint == RCURLY_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT && in_brackets) { log_debug("Adding bracketed string: %.*s\n", (int) bracket_len, regex + bracket_start); 
string_tree_add_string_len(tree, regex + bracket_start, bracket_len); in_brackets = false; } else if ((codepoint == LPAREN_CODEPOINT || codepoint == RPAREN_CODEPOINT) && last_codepoint != BACKSLASH_CODEPOINT) { log_debug("group\n"); add_to_index = false; } else if (in_set) { log_debug("in set\n"); // Queue node, we'll add them to the trie uint32_array_push(char_set, codepoint); add_to_index = false; } else if (in_brackets) { add_to_index = false; bracket_len += char_len; } else if (codepoint == BACKSLASH_CODEPOINT && last_codepoint != BACKSLASH_CODEPOINT) { add_to_index = false; } log_debug("codepoint = %d\n", codepoint); if (add_to_index) { temp_char_len = utf8proc_encode_char(codepoint, (uint8_t *)temp_char); log_debug("char = %.*s\n", (int)temp_char_len, temp_char); string_tree_add_string_len(tree, temp_char, temp_char_len); string_tree_finalize_token(tree); } idx += char_len; char_ptr += char_len; } uint32_array_destroy(char_set); return tree; }
// Try to decode one entity whose text (without the leading '&') starts at
// src, appending the decoded bytes to ob.
//
// Numeric form:  "#" followed by decimal digits, or "#x"/"#X" followed by
// hex digits, then ';'. Between 1 and 8 digits are accepted. A value of 0,
// a value in 0xD800..0xDFFF, or a value of 0x110000 and above is replaced by
// 0xFFFD before encoding.
// Named form:    looked up with find_entity() once a ';' is seen; scanning
// stops at a space or after MAX_WORD_LENGTH bytes.
//
// Returns the number of bytes of src consumed (including the ';'), or 0 when
// nothing was decoded.
size_t houdini_unescape_ent(cmark_strbuf *ob, const uint8_t *src, size_t size)
{
  size_t pos = 0;

  if (size < 3 || src[0] != '#') {
    // Named entity.
    if (size > MAX_WORD_LENGTH) {
      size = MAX_WORD_LENGTH;
    }

    for (pos = MIN_WORD_LENGTH; pos < size; ++pos) {
      if (src[pos] == ' ') {
        return 0;
      }
      if (src[pos] != ';') {
        continue;
      }

      const struct html_ent *match = find_entity((char *)src, pos);
      if (match == NULL) {
        return 0;
      }

      // At most 4 bytes of the stored replacement are emitted.
      int utf8_len = 0;
      while (utf8_len < 4 && match->utf8[utf8_len] != '\0') {
        utf8_len++;
      }
      cmark_strbuf_put(ob, match->utf8, utf8_len);
      return pos + 1;
    }

    return 0;
  }

  // Numeric entity.
  int value = 0;
  int digit_count = 0;

  if (_isdigit(src[1])) {
    pos = 1;
    while (pos < size && _isdigit(src[pos])) {
      value = (value * 10) + (src[pos] - '0');
      if (value >= 0x110000) {
        // Keep counting digits but
        // avoid integer overflow.
        value = 0x110000;
      }
      ++pos;
    }
    digit_count = pos - 1;
  } else if (src[1] == 'x' || src[1] == 'X') {
    pos = 2;
    while (pos < size && _isxdigit(src[pos])) {
      // Maps '0'-'9', 'a'-'f' and 'A'-'F' to 0..15.
      value = (value * 16) + ((src[pos] | 32) % 39 - 9);
      if (value >= 0x110000) {
        // Keep counting digits but
        // avoid integer overflow.
        value = 0x110000;
      }
      ++pos;
    }
    digit_count = pos - 2;
  }

  if (digit_count < 1 || digit_count > 8 || pos >= size || src[pos] != ';') {
    return 0;
  }

  if (value == 0 || (value >= 0xD800 && value < 0xE000) || value >= 0x110000) {
    value = 0xFFFD;
  }
  utf8proc_encode_char(value, ob);
  return pos + 1;
}
/*
 * Emit a single code point through the renderer: pass c and the renderer's
 * buffer to utf8proc_encode_char(), then advance the renderer's column
 * counter by one.
 */
void cmark_render_code_point(cmark_renderer *renderer, uint32_t c)
{
  utf8proc_encode_char(c, renderer->buffer);
  renderer->column++;
}