/**
 * Build a CharBuf around a caller-supplied buffer after verifying that its
 * first `size` bytes are valid UTF-8.
 *
 * @param ptr  Buffer handed to CB_new_steal_from_trusted_str (judging by the
 *             callee's name the buffer is adopted, not copied -- confirm).
 * @param size Number of bytes of content to validate and use.
 * @param cap  Capacity value forwarded unchanged to the trusted constructor.
 * @return Whatever CB_new_steal_from_trusted_str returns.
 *
 * DIE_INVALID_UTF8 is invoked when validation fails.
 */
CharBuf*
CB_new_steal_str(char *ptr, size_t size, size_t cap) {
    const int well_formed = !!StrHelp_utf8_valid(ptr, size);

    if (well_formed == 0) {
        DIE_INVALID_UTF8(ptr, size);
    }

    CharBuf *result = CB_new_steal_from_trusted_str(ptr, size, cap);
    return result;
}
/**
 * Concatenation entry point for UTF-8 text of unknown provenance: the bytes
 * are validated first and then passed on to Str_Cat_Trusted_Utf8.
 *
 * @param self String forwarded as the first argument of the trusted variant.
 * @param ptr  Start of the bytes to append.
 * @param size Number of bytes at `ptr`.
 * @return The result of Str_Cat_Trusted_Utf8(self, ptr, size).
 *
 * DIE_INVALID_UTF8 is invoked when the bytes are not valid UTF-8.
 */
String*
Str_Cat_Utf8_IMP(String *self, const char* ptr, size_t size) {
    if (StrHelp_utf8_valid(ptr, size)) {
        // Input checked; nothing to report.
    }
    else {
        DIE_INVALID_UTF8(ptr, size);
    }

    return Str_Cat_Trusted_Utf8(self, ptr, size);
}
/**
 * Construct a CharBuf from `size` bytes at `ptr`, validating them as UTF-8
 * before delegating to CB_new_from_trusted_utf8.
 *
 * @param ptr  Start of the source bytes.
 * @param size Number of bytes to validate and pass on.
 * @return The CharBuf produced by CB_new_from_trusted_utf8.
 *
 * DIE_INVALID_UTF8 is invoked when the input fails validation.
 */
CharBuf*
CB_new_from_utf8(const char *ptr, size_t size) {
    const int rejected = !StrHelp_utf8_valid(ptr, size);

    if (rejected) {
        DIE_INVALID_UTF8(ptr, size);
    }

    return CB_new_from_trusted_utf8(ptr, size);
}
/**
 * Read a complete term text from `instream` into the stepper's value buffer.
 *
 * The stream supplies a C32-encoded byte count followed by that many bytes
 * of text.  The value CharBuf is created on first use, grown to hold the
 * text, filled, sized, validated as UTF-8 and finally NUL-terminated.
 *
 * @param self     Stepper whose `value` member receives the text.
 * @param instream Source of the length prefix and the text bytes.
 *
 * THROWs ERR when the bytes read are not valid UTF-8.  The offset in the
 * message is the stream position after the read minus the text length.
 */
void
TextTermStepper_read_key_frame(TextTermStepper *self, InStream *instream) {
    const uint32_t  num_bytes = InStream_Read_C32(instream);
    CharBuf        *charbuf;
    char           *dest;

    // Create the value buffer lazily, then make sure it is large enough.
    if (!self->value) {
        self->value = (Obj*)CB_new(num_bytes);
    }
    charbuf = (CharBuf*)self->value;
    dest    = CB_Grow(charbuf, num_bytes);

    // Pull the text straight into the buffer and record its length.
    InStream_Read_Bytes(instream, dest, num_bytes);
    CB_Set_Size(charbuf, num_bytes);

    if (!StrHelp_utf8_valid(dest, num_bytes)) {
        THROW(ERR, "Invalid UTF-8 sequence in '%o' at byte %i64",
              InStream_Get_Filename(instream),
              InStream_Tell(instream) - num_bytes);
    }

    // Terminate so the content can also be used as a C string.
    dest[num_bytes] = '\0';
}
/**
 * Read a delta-encoded term text from `instream` and apply it to the
 * stepper's value buffer.
 *
 * The stream supplies two C32 values: the number of leading bytes kept from
 * the buffer's current content (`text_overlap`) and the number of new bytes
 * that follow (`finish_chars_len`).  The new bytes are read in directly
 * after the kept prefix, the whole result is validated as UTF-8 and
 * NUL-terminated, and the cached `string` ivar is released and cleared.
 *
 * @param self     Stepper whose ivars hold the value buffer and cached string.
 * @param instream Source of the two lengths and the new text bytes.
 *
 * THROWs ERR when the two lengths overflow 32 bits when added, or when the
 * resulting text is not valid UTF-8.
 */
void
TextTermStepper_Read_Delta_IMP(TextTermStepper *self, InStream *instream) {
    TextTermStepperIVARS *const ivars = TextTermStepper_IVARS(self);
    const uint32_t text_overlap     = InStream_Read_C32(instream);
    const uint32_t finish_chars_len = InStream_Read_C32(instream);
    const uint32_t total_text_len   = text_overlap + finish_chars_len;
    CharBuf *charbuf = (CharBuf*)ivars->value;
    char    *ptr;

    // Both lengths come from the stream.  If their sum wrapped around, the
    // buffer would be grown to the (small) wrapped size while the read below
    // still targets ptr + text_overlap, writing far outside the allocation.
    // Unsigned wrap is detectable because the sum ends up below an operand.
    if (total_text_len < text_overlap) {
        THROW(ERR, "Term text length overflow in '%o' at byte %i64",
              InStream_Get_Filename(instream), InStream_Tell(instream));
    }

    // NOTE(review): text_overlap is not compared against the buffer's
    // current size here; a larger value would leave unwritten bytes inside
    // the validated range -- confirm whether callers guarantee this.

    // Allocate space.
    ptr = CB_Grow(charbuf, total_text_len);

    // Set the value text.
    InStream_Read_Bytes(instream, ptr + text_overlap, finish_chars_len);
    CB_Set_Size(charbuf, total_text_len);
    if (!StrHelp_utf8_valid(ptr, total_text_len)) {
        THROW(ERR, "Invalid UTF-8 sequence in '%o' at byte %i64",
              InStream_Get_Filename(instream),
              InStream_Tell(instream) - finish_chars_len);
    }

    // Null-terminate.
    ptr[total_text_len] = '\0';

    // Invalidate string.
    DECREF(ivars->string);
    ivars->string = NULL;
}
/**
 * Create a String from a caller-supplied UTF-8 buffer after validating it.
 *
 * @param utf8 Buffer passed on to Str_init_steal_trusted_utf8 (the callee's
 *             name suggests the buffer is adopted rather than copied --
 *             confirm).
 * @param size Number of bytes at `utf8`.
 * @return The String returned by Str_init_steal_trusted_utf8.
 *
 * DIE_INVALID_UTF8 is invoked when the bytes are not valid UTF-8.  The
 * String object itself is only allocated after validation.
 */
String*
Str_new_steal_utf8(char *utf8, size_t size) {
    const int rejected = !StrHelp_utf8_valid(utf8, size);
    if (rejected) {
        DIE_INVALID_UTF8(utf8, size);
    }

    return Str_init_steal_trusted_utf8((String*)Class_Make_Obj(STRING),
                                       utf8, size);
}
/**
 * Check one UTF-8 validity case against both validator implementations.
 *
 * StrHelp_utf8_valid and S_utf8_valid_alt are run on the same input.  If
 * they disagree the case fails outright; otherwise the shared verdict is
 * compared with `expected`.
 *
 * @param runner      Test runner that records the outcome.
 * @param content     Bytes to validate.
 * @param size        Number of bytes at `content`.
 * @param expected    The verdict both validators should reach.
 * @param description Label used in the test output.
 */
static void
S_test_validity(TestBatchRunner *runner, const char *content, size_t size,
                bool expected, const char *description) {
    const bool primary   = StrHelp_utf8_valid(content, size);
    const bool alternate = S_utf8_valid_alt(content, size);

    if (primary == alternate) {
        TEST_TRUE(runner, primary == expected, "%s", description);
        return;
    }

    FAIL(runner, "Disagreement: %s", description);
}
/**
 * Deserialize a string from `instream` into the supplied String object.
 *
 * The stream supplies a C32-encoded byte count followed by that many bytes.
 * The bytes are read into a freshly allocated, NUL-terminated buffer,
 * validated as UTF-8 and then handed to Str_init_steal_trusted_utf8.
 *
 * @param string   String object to initialize.
 * @param instream Source of the length prefix and the text bytes.
 * @return The result of Str_init_steal_trusted_utf8.
 *
 * THROWs ERR when the length equals SIZE_MAX (so that `size + 1` cannot
 * wrap) or when the bytes are not valid UTF-8.
 */
String*
Freezer_deserialize_string(String *string, InStream *instream) {
    size_t size = InStream_Read_C32(instream);
    if (size == SIZE_MAX) {
        THROW(ERR, "Can't deserialize SIZE_MAX bytes");
    }

    // One extra byte for the terminating NUL.
    char *buf = (char*)MALLOCATE(size + 1);
    InStream_Read_Bytes(instream, buf, size);
    buf[size] = '\0';

    if (!StrHelp_utf8_valid(buf, size)) {
        // Nothing has taken ownership of the buffer yet, so release it
        // before raising the error; otherwise it would be leaked.
        FREEMEM(buf);
        THROW(ERR, "Attempt to deserialize invalid UTF-8");
    }

    return Str_init_steal_trusted_utf8(string, buf, size);
}
static CharBuf* S_parse_string(char **json_ptr, char *const limit) { // Find terminating double quote, determine whether there are any escapes. char *top = *json_ptr + 1; char *end = NULL; bool_t saw_backslash = false; for (char *text = top; text < limit; text++) { if (*text == '"') { end = text; break; } else if (*text == '\\') { saw_backslash = true; if (text + 1 < limit && text[1] == 'u') { text += 5; } else { text += 1; } } } if (!end) { SET_ERROR(CB_newf("Unterminated string"), *json_ptr, limit); return NULL; } // Advance the text buffer to just beyond the closing quote. *json_ptr = end + 1; if (saw_backslash) { return S_unescape_text(top, end); } else { // Optimize common case where there are no escapes. size_t len = end - top; if (!StrHelp_utf8_valid(top, len)) { CharBuf *mess = MAKE_MESS("Bad UTF-8 in JSON"); Err_set_error(Err_new(mess)); return NULL; } return CB_new_from_trusted_utf8(top, len); } }
/**
 * Exercise the UTF-8 helpers for every code point from 0 through 0x10FFFF.
 *
 * For each code point the encoded form is checked four ways, in order:
 * the length against StrHelp_UTF8_COUNT, StrHelp_utf8_valid against
 * S_utf8_valid_alt, StrHelp_back_utf8_char from the end of the sequence,
 * and the decode of the encoded bytes.  The loop stops at the first
 * mismatch; a single PASS or FAIL is reported at the end.
 *
 * @param runner Test runner that records the outcome.
 */
static void
test_utf8_round_trip(TestBatchRunner *runner) {
    int32_t code_point = 0;

    while (code_point <= 0x10FFFF) {
        char buffer[4];
        uint32_t size  = StrHelp_encode_utf8_char(code_point, buffer);
        char    *start = buffer;
        char    *end   = start + size;

        // The checks run in this order and stop at the first failure.
        if (size != StrHelp_UTF8_COUNT[(unsigned char)buffer[0]]
            || !!StrHelp_utf8_valid(start, size)
               != !!S_utf8_valid_alt(start, size)
            || StrHelp_back_utf8_char(end, start) != start
            || StrHelp_decode_utf8_char(buffer) != code_point
           ) {
            break;
        }

        code_point++;
    }

    // Only a full run leaves the counter one past the last code point.
    if (code_point != 0x110000) {
        FAIL(runner, "Failed round trip at 0x%.1X", (unsigned)code_point);
    }
    else {
        PASS(runner, "Successfully round tripped 0 - 0x10FFFF");
    }
}
void CB_vcatf(CharBuf *self, const char *pattern, va_list args) { size_t pattern_len = strlen(pattern); const char *pattern_start = pattern; const char *pattern_end = pattern + pattern_len; char buf[64]; for (; pattern < pattern_end; pattern++) { const char *slice_end = pattern; // Consume all characters leading up to a '%'. while (slice_end < pattern_end && *slice_end != '%') { slice_end++; } if (pattern != slice_end) { size_t size = slice_end - pattern; CB_Cat_Trusted_Str(self, pattern, size); pattern = slice_end; } if (pattern < pattern_end) { pattern++; // Move past '%'. switch (*pattern) { case '%': { CB_Cat_Trusted_Str(self, "%", 1); } break; case 'o': { Obj *obj = va_arg(args, Obj*); if (!obj) { CB_Cat_Trusted_Str(self, "[NULL]", 6); } else if (Obj_Is_A(obj, CHARBUF)) { CB_Cat(self, (CharBuf*)obj); } else { CharBuf *string = Obj_To_String(obj); CB_Cat(self, string); DECREF(string); } } break; case 'i': { int64_t val = 0; size_t size; if (pattern[1] == '8') { val = va_arg(args, int32_t); pattern++; } else if (pattern[1] == '3' && pattern[2] == '2') { val = va_arg(args, int32_t); pattern += 2; } else if (pattern[1] == '6' && pattern[2] == '4') { val = va_arg(args, int64_t); pattern += 2; } else { S_die_invalid_pattern(pattern_start); } size = sprintf(buf, "%" I64P, val); CB_Cat_Trusted_Str(self, buf, size); } break; case 'u': { uint64_t val = 0; size_t size; if (pattern[1] == '8') { val = va_arg(args, uint32_t); pattern += 1; } else if (pattern[1] == '3' && pattern[2] == '2') { val = va_arg(args, uint32_t); pattern += 2; } else if (pattern[1] == '6' && pattern[2] == '4') { val = va_arg(args, uint64_t); pattern += 2; } else { S_die_invalid_pattern(pattern_start); } size = sprintf(buf, "%" U64P, val); CB_Cat_Trusted_Str(self, buf, size); } break; case 'f': { if (pattern[1] == '6' && pattern[2] == '4') { double num = va_arg(args, double); char bigbuf[512]; size_t size = sprintf(bigbuf, "%g", num); CB_Cat_Trusted_Str(self, bigbuf, size); pattern += 2; } else { 
S_die_invalid_pattern(pattern_start); } } break; case 'x': { if (pattern[1] == '3' && pattern[2] == '2') { unsigned long val = va_arg(args, uint32_t); size_t size = sprintf(buf, "%.8lx", val); CB_Cat_Trusted_Str(self, buf, size); pattern += 2; } else { S_die_invalid_pattern(pattern_start); } } break; case 's': { char *string = va_arg(args, char*); if (string == NULL) { CB_Cat_Trusted_Str(self, "[NULL]", 6); } else { size_t size = strlen(string); if (StrHelp_utf8_valid(string, size)) { CB_Cat_Trusted_Str(self, string, size); } else { CB_Cat_Trusted_Str(self, "[INVALID UTF8]", 14); } } } break; default: { // Assume NULL-terminated pattern string, which // eliminates the need for bounds checking if '%' is // the last visible character. S_die_invalid_pattern(pattern_start); } }
/**
 * Decode the JSON escape sequences in the text between `top` and `end`.
 *
 * @param top First byte of the string content (just after the opening quote).
 * @param end One past the last byte of content (the closing quote).
 * @return A new CharBuf holding the decoded text, or NULL after recording an
 *         error for an illegal escape, a malformed or surrogate \u escape,
 *         or a result that is not valid UTF-8.
 *
 * The caller is expected to have located `end` such that every backslash is
 * followed by its escaped byte(s) inside [top, end) -- S_parse_string's scan
 * skips them when searching for the closing quote.
 */
static CharBuf*
S_unescape_text(char *const top, char *const end) {
    // The unescaped string will never be longer than the escaped string:
    // every escape shrinks, and a \u escape occupies six bytes of input
    // while carrying at most a 16-bit code point.  Therefore we can
    // allocate once and not worry about reallocating.
    size_t cap = end - top + 1;
    char *target_buf = (char*)MALLOCATE(cap);
    size_t target_size = 0;
    for (char *text = top; text < end; text++) {
        if (*text != '\\') {
            target_buf[target_size++] = *text;
        }
        else {
            // Process escape.
            text++;
            switch (*text) {
                case '"':
                    target_buf[target_size++] = '"';
                    break;
                case '\\':
                    target_buf[target_size++] = '\\';
                    break;
                case '/':
                    target_buf[target_size++] = '/';
                    break;
                case 'b':
                    target_buf[target_size++] = '\b';
                    break;
                case 'f':
                    target_buf[target_size++] = '\f';
                    break;
                case 'n':
                    target_buf[target_size++] = '\n';
                    break;
                case 'r':
                    target_buf[target_size++] = '\r';
                    break;
                case 't':
                    target_buf[target_size++] = '\t';
                    break;
                case 'u': {
                        // Decode exactly four hex digits by hand.  strtol()
                        // is too lenient for this: it skips leading
                        // whitespace and accepts a sign or a "0x" prefix, so
                        // inputs such as "\u0x1F" or "\u+123" would be taken
                        // as valid escapes.
                        long code_point = 0;
                        int  digits_ok  = 1;
                        for (int i = 1; i <= 4; i++) {
                            const char c = text[i];
                            int nibble;
                            if (c >= '0' && c <= '9') {
                                nibble = c - '0';
                            }
                            else if (c >= 'a' && c <= 'f') {
                                // Relies on a-f being contiguous (ASCII).
                                nibble = c - 'a' + 10;
                            }
                            else if (c >= 'A' && c <= 'F') {
                                nibble = c - 'A' + 10;
                            }
                            else {
                                digits_ok = 0;
                                break;
                            }
                            code_point = (code_point << 4) | nibble;
                        }
                        text += 4;
                        if (!digits_ok) {
                            FREEMEM(target_buf);
                            SET_ERROR(CB_newf("Invalid \\u escape"), text - 5, end);
                            return NULL;
                        }
                        if (code_point >= 0xD800 && code_point <= 0xDFFF) {
                            FREEMEM(target_buf);
                            SET_ERROR(CB_newf("Surrogate pairs not supported"),
                                      text - 5, end);
                            return NULL;
                        }
                        target_size
                            += StrHelp_encode_utf8_char((uint32_t)code_point,
                                                        target_buf + target_size);
                    }
                    break;
                default:
                    FREEMEM(target_buf);
                    SET_ERROR(CB_newf("Illegal escape"), text - 1, end);
                    return NULL;
            }
        }
    }

    // NULL-terminate, sanity check, then return the escaped string.
    target_buf[target_size] = '\0';
    if (!StrHelp_utf8_valid(target_buf, target_size)) {
        FREEMEM(target_buf);
        CharBuf *mess = MAKE_MESS("Bad UTF-8 in JSON");
        Err_set_error(Err_new(mess));
        return NULL;
    }
    return CB_new_steal_from_trusted_str(target_buf, target_size, cap);
}