/* Create a String containing the UTF-8 encoding of a single code point.
 *
 * A heap buffer large enough for a four-byte sequence plus a terminating NUL
 * is allocated, filled by StrHelp_encode_utf8_char(), and handed to a freshly
 * made STRING object.  The new object's `origin` field refers to itself.
 *
 * @param code_point The code point to encode.
 * @return the newly constructed String.
 */
String*
Str_new_from_char(int32_t code_point) {
    // Longest sequence we budget for; one extra byte holds the NUL.
    const size_t utf8_capacity = 4;
    char *encoded = (char*)MALLOCATE(utf8_capacity + 1);

    // The encoder reports how many bytes it wrote; terminate right after them.
    size_t num_bytes
        = StrHelp_encode_utf8_char(code_point, (uint8_t*)encoded);
    encoded[num_bytes] = '\0';

    String *str = (String*)Class_Make_Obj(STRING);
    str->origin = str;
    str->size   = num_bytes;
    str->ptr    = encoded;
    return str;
}
/* Exercise StrHelp_back_utf8_char() against a single encoded character:
 * backing up from the end of the sequence must land on its first byte, and
 * the function must return NULL instead of stepping before `start`.
 */
static void
test_back_utf8_char(TestBatchRunner *runner) {
    char encoded[4];

    // U+263A is a multi-byte sequence (three bytes in standard UTF-8).
    uint32_t num_bytes = StrHelp_encode_utf8_char(0x263A, encoded);
    char *past_end = encoded + num_bytes;

    // A start pointer that lies inside the sequence, one byte past its head.
    char *inside = encoded + 1;

    TEST_TRUE(runner,
              encoded == StrHelp_back_utf8_char(past_end, encoded),
              "back_utf8_char");
    TEST_TRUE(runner,
              NULL == StrHelp_back_utf8_char(past_end, inside),
              "back_utf8_char returns NULL rather than back up beyond start");
    TEST_TRUE(runner,
              NULL == StrHelp_back_utf8_char(encoded, encoded),
              "back_utf8_char returns NULL when end == start");
}
/* Encode every value from 0 through 0x10FFFF and cross-check the StrHelp
 * UTF-8 helpers against one another.  The scan stops at the first value for
 * which any check disagrees; a single PASS or FAIL is reported at the end.
 */
static void
test_utf8_round_trip(TestBatchRunner *runner) {
    int32_t code_point = 0;

    while (code_point <= 0x10FFFF) {
        char encoded[4];
        uint32_t num_bytes = StrHelp_encode_utf8_char(code_point, encoded);
        char *first = encoded;
        char *limit = first + num_bytes;

        // The length reported by encode_utf8_char() must match the length
        // the lookup table derives from the lead byte.
        if (StrHelp_UTF8_COUNT[(unsigned char)encoded[0]] != num_bytes) {
            break;
        }

        // utf8_valid() and the alternate implementation must reach the same
        // verdict (compared as booleans).
        if (!StrHelp_utf8_valid(first, num_bytes)
            != !S_utf8_valid_alt(first, num_bytes)
           ) {
            break;
        }

        // Backing up from the end must land on the first byte.
        if (first != StrHelp_back_utf8_char(limit, first)) {
            break;
        }

        // Decoding must give back the value that was encoded.
        if (StrHelp_decode_utf8_char(encoded) != code_point) {
            break;
        }

        code_point++;
    }

    // The counter only reaches 0x110000 when no check broke out early.
    if (code_point != 0x110000) {
        FAIL(runner, "Failed round trip at 0x%.1X", (unsigned)code_point);
    }
    else {
        PASS(runner, "Successfully round tripped 0 - 0x10FFFF");
    }
}
/* Return the value (0-15) of a single hexadecimal digit, or -1 if `c` is not
 * one.  Relies on the letters a-f / A-F being contiguous, which holds for the
 * ASCII-compatible text this parser validates as UTF-8.
 */
static int
S_unescape_hex_digit(char c) {
    if (c >= '0' && c <= '9') { return c - '0'; }
    if (c >= 'a' && c <= 'f') { return c - 'a' + 10; }
    if (c >= 'A' && c <= 'F') { return c - 'A' + 10; }
    return -1;
}

/* Expand the backslash escapes found in the half-open range [top, end) and
 * return the result as a new CharBuf.
 *
 * Recognized escapes are \" \\ \/ \b \f \n \r \t and \uXXXX, where XXXX must
 * be exactly four hexadecimal digits.  Code points in the surrogate range
 * (0xD800-0xDFFF) are rejected.
 *
 * @param top First byte of the escaped text.
 * @param end One past the last byte of the escaped text.
 * @return a new CharBuf, or NULL after reporting an error (through SET_ERROR
 * for a bad escape, through Err_set_error for invalid UTF-8 in the result).
 */
static CharBuf*
S_unescape_text(char *const top, char *const end) {
    // The unescaped string is never longer than the escaped string: every
    // single-character escape shrinks from two bytes to one, and a six-byte
    // \uXXXX escape is replaced by whatever StrHelp_encode_utf8_char emits,
    // which guards against sequences over 4 bytes.  Therefore we can
    // allocate once and not worry about reallocating.
    size_t cap = end - top + 1;
    char *target_buf = (char*)MALLOCATE(cap);
    size_t target_size = 0;

    for (char *text = top; text < end; text++) {
        if (*text != '\\') {
            target_buf[target_size++] = *text;
            continue;
        }

        // Process escape.  Remember where it began for error reporting.
        char *const escape = text;
        text++;
        if (text >= end) {
            // A lone backslash as the final byte has no escape character;
            // do not read past the end of the range.
            FREEMEM(target_buf);
            SET_ERROR(CB_newf("Illegal escape"), escape, end);
            return NULL;
        }

        switch (*text) {
            case '"':
                target_buf[target_size++] = '"';
                break;
            case '\\':
                target_buf[target_size++] = '\\';
                break;
            case '/':
                target_buf[target_size++] = '/';
                break;
            case 'b':
                target_buf[target_size++] = '\b';
                break;
            case 'f':
                target_buf[target_size++] = '\f';
                break;
            case 'n':
                target_buf[target_size++] = '\n';
                break;
            case 'r':
                target_buf[target_size++] = '\r';
                break;
            case 't':
                target_buf[target_size++] = '\t';
                break;
            case 'u': {
                    // Decode exactly four hex digits by hand.  strtol is
                    // too permissive here: it would accept leading
                    // whitespace, a sign, or a "0x" prefix (e.g. "\u0x1F").
                    // All four digits must also lie before `end`.
                    int      valid      = (end - text >= 5);
                    uint32_t code_point = 0;
                    for (int i = 1; valid && i <= 4; i++) {
                        int digit = S_unescape_hex_digit(text[i]);
                        if (digit < 0) {
                            valid = 0;
                        }
                        else {
                            code_point = (code_point << 4) | (uint32_t)digit;
                        }
                    }
                    if (!valid) {
                        FREEMEM(target_buf);
                        SET_ERROR(CB_newf("Invalid \\u escape"), escape, end);
                        return NULL;
                    }
                    if (code_point >= 0xD800 && code_point <= 0xDFFF) {
                        FREEMEM(target_buf);
                        SET_ERROR(CB_newf("Surrogate pairs not supported"),
                                  escape, end);
                        return NULL;
                    }

                    // Skip the four digits; the loop increment then moves
                    // past the last of them.
                    text += 4;
                    target_size += StrHelp_encode_utf8_char(
                                       code_point, target_buf + target_size);
                }
                break;
            default:
                FREEMEM(target_buf);
                SET_ERROR(CB_newf("Illegal escape"), escape, end);
                return NULL;
        }
    }

    // NULL-terminate, sanity check, then return the unescaped string.
    target_buf[target_size] = '\0';
    if (!StrHelp_utf8_valid(target_buf, target_size)) {
        FREEMEM(target_buf);
        CharBuf *mess = MAKE_MESS("Bad UTF-8 in JSON");
        Err_set_error(Err_new(mess));
        return NULL;
    }
    return CB_new_steal_from_trusted_str(target_buf, target_size, cap);
}