/*
 * Append `length` bytes of `buff` to `str`, mapping every byte through
 * myhtml_string_chars_lowercase_map and preprocessing the input:
 *   - CR (0x0D) is stored as LF (0x0A); an LF directly following the CR
 *     inside this buffer is skipped, so CR LF collapses to a single LF;
 *   - NUL (0x00) is stored as the three bytes EF BF BD
 *     (UTF-8 for U+FFFD REPLACEMENT CHARACTER).
 *
 * On return the data is NUL-terminated and str->length points at the
 * terminator (the terminator itself is not counted).
 */
void myhtml_string_append_lowercase_with_preprocessing(myhtml_string_t* str, const char* buff, size_t length)
{
    MyHTML_STRING_REALLOC_IF_NEED(str, (length + 1), 32);
    
    unsigned char *data = (unsigned char*)str->data;
    const unsigned char *u_buff = (const unsigned char*)buff;
    
    for (size_t i = 0; i < length; i++)
    {
        if(u_buff[i] == 0x0D) {
            data[str->length] = 0x0A;
            
            /* swallow the LF of a CR LF pair */
            if((i + 1) < length && u_buff[(i + 1)] == 0x0A) {
                i++;
            }
        }
        else if(u_buff[i] == 0x00) {
            /* the replacement needs two more bytes than the NUL it replaces */
            MyHTML_STRING_REALLOC_IF_NEED(str, (length + 4), 32);
            
            /*
             * The macro may have moved the buffer; re-read str->data so that
             * this and all following writes never go through a stale pointer.
             */
            data = (unsigned char*)str->data;
            
            // Unicode Character 'REPLACEMENT CHARACTER' (U+FFFD)
            data[str->length] = 0xEF; str->length++;
            data[str->length] = 0xBF; str->length++;
            data[str->length] = 0xBD;
        }
        else {
            data[str->length] = myhtml_string_chars_lowercase_map[ u_buff[i] ];
        }
        
        str->length++;
    }
    
    str->data[str->length] = '\0';
}
/*
 * Copy `length` bytes from `buff` to the end of `str`, replacing every
 * NUL byte with EF BF BD (UTF-8 for U+FFFD REPLACEMENT CHARACTER).
 * All other bytes are copied unchanged. The result is NUL-terminated and
 * str->length is left pointing at the terminator.
 */
void myhtml_string_append_with_replacement_null_characters_only(myhtml_string_t* str, const char* buff, size_t length)
{
    MyHTML_STRING_REALLOC_IF_NEED(str, (length + 1), 0);
    
    unsigned char *out = (unsigned char*)str->data;
    const unsigned char *src = (const unsigned char*)buff;
    
    for (size_t pos = 0; pos < length; pos++)
    {
        const unsigned char ch = src[pos];
        
        if(ch != 0x00) {
            out[str->length++] = ch;
            continue;
        }
        
        /* grow for the two extra bytes and pick up the (possibly new) buffer */
        myhtml_string_realloc(str, (str->size + 5));
        out = (unsigned char*)str->data;
        
        // Unicode Character 'REPLACEMENT CHARACTER' (U+FFFD)
        out[str->length++] = 0xEF;
        out[str->length++] = 0xBF;
        out[str->length++] = 0xBD;
    }
    
    str->data[str->length] = '\0';
}
/*
 * Feed `length` bytes of `buff` one at a time to the decoder selected by
 * `encoding`; every completed code point (res->result) is written to the
 * end of `str` through myhtml_encoding_codepoint_to_lowercase_ascii_utf_8.
 *
 * Preprocessing applied to single-byte results:
 *   - '\r' is stored as '\n'; if the next raw input byte is '\n' it is
 *     skipped without being passed to the decoder;
 *   - 0x00 is stored as EF BF BD (U+FFFD) unless `emit_null_chars` is true.
 *
 * Returns str->length (position just past the stored '\n') when the
 * buffer ends with a '\r' -- in that case no terminator is written.
 * Otherwise writes a '\0' terminator and returns 0.
 */
size_t myhtml_string_append_lowercase_chunk_with_convert_encoding_with_preprocessing(myhtml_string_t* str, myhtml_encoding_result_t* res, const char* buff, size_t length, myhtml_encoding_t encoding, bool emit_null_chars)
{
    MyHTML_STRING_REALLOC_IF_NEED(str, (length + 1), 0);
    
    const unsigned char *raw = (const unsigned char*)buff;
    const myhtml_encoding_custom_f decode = myhtml_encoding_get_function_by_id(encoding);
    
    for (size_t pos = 0; pos < length; pos++)
    {
        /* decoder still needs more bytes (or rejected this one) */
        if(decode(raw[pos], res) != MyHTML_ENCODING_STATUS_OK) {
            continue;
        }
        
        MyHTML_STRING_REALLOC_IF_NEED(str, 5, 1);
        
        size_t written = myhtml_encoding_codepoint_to_lowercase_ascii_utf_8(res->result, &str->data[str->length]);
        
        if(written == 1)
        {
            if(str->data[str->length] == '\r') {
                str->data[str->length] = '\n';
                
                if((pos + 1) >= length) {
                    /* CR is the last byte of this chunk */
                    str->length++;
                    return str->length;
                }
                
                if(buff[(pos + 1)] == '\n') {
                    pos++;
                }
            }
            else if(str->data[str->length] == 0x00 && !emit_null_chars) {
                myhtml_string_realloc(str, (str->size + 5));
                
                // Unicode Character 'REPLACEMENT CHARACTER' (U+FFFD)
                str->data[str->length++] = (char)0xEF;
                str->data[str->length++] = (char)0xBF;
                str->data[str->length]   = (char)0xBD;
            }
        }
        
        str->length += written;
    }
    
    MyHTML_STRING_APPEND_BYTE_WITHOUT_INCREMENT('\0', str, 1);
    
    return 0;
}
///////////////////////////////////////////////////////// //// Append API //// ///////////////////////////////////////////////////////// void myhtml_string_append(myhtml_string_t* str, const char* buff, size_t length) { MyHTML_STRING_REALLOC_IF_NEED(str, (length + 1), 0); memcpy(&str->data[str->length], buff, (sizeof(char) * length)); str->length += length; str->data[str->length] = '\0'; }
/*
 * Append the single byte `sm` to `str` and keep the data NUL-terminated.
 *
 * NOTE(review): the size argument passed to the macro is
 * (str->length + 2), while other callers in this file pass a plain byte
 * count -- confirm against the macro definition which one is intended.
 */
void _myhtml_string_charef_append(myhtml_string_t* str, const char sm)
{
    MyHTML_STRING_REALLOC_IF_NEED(str, (str->length + 2), 32);
    
    str->data[str->length++] = sm;
    str->data[str->length]   = '\0';
}
/*
 * Rewind `str` to chunk->begin and write the code point stored in
 * chunk->l_data there as UTF-8, followed by a '\0' terminator.
 * str->length ends up at chunk->begin plus the number of bytes written.
 */
void _myhtml_string_append_char_references_state_end(myhtml_string_char_ref_chunk_t *chunk, myhtml_string_t* str)
{
    str->length = chunk->begin;
    
    /* room for up to 4 UTF-8 bytes plus the terminator */
    MyHTML_STRING_REALLOC_IF_NEED(str, (chunk->begin + 5), 12);
    
    char *dest = &str->data[str->length];
    size_t encoded = myhtml_encoding_codepoint_to_ascii_utf_8(chunk->l_data, dest);
    
    dest[encoded] = '\0';
    str->length += encoded;
}
/*
 * Feed `length` bytes of `buff` one at a time to the decoder selected by
 * `encoding`; every completed code point (res->result) is appended to
 * `str` as UTF-8, then a '\0' terminator is written.
 *
 * Preprocessing applied to single-byte results:
 *   - if the byte already stored just before the write position is '\r'
 *     (including one left by an earlier call) it is rewritten to '\n',
 *     and a '\n' that directly follows it is dropped (CR LF -> LF);
 *   - 0x00 is stored as EF BF BD (U+FFFD REPLACEMENT CHARACTER).
 *
 * A '\r' that is the very last stored byte is left as-is by this
 * function; it is only rewritten when a later byte arrives.
 */
void myhtml_string_append_chunk_with_convert_encoding_with_preprocessing(myhtml_string_t* str, myhtml_encoding_result_t* res, const char* buff, size_t length, myhtml_encoding_t encoding)
{
    unsigned const char* u_buff = (unsigned const char*)buff;
    myhtml_encoding_custom_f func = myhtml_encoding_get_function_by_id(encoding);
    
    for (size_t i = 0; i < length; i++)
    {
        if(func(u_buff[i], res) != MyHTML_ENCODING_STATUS_OK) {
            continue;
        }
        
        MyHTML_STRING_REALLOC_IF_NEED(str, 4, 32);
        
        size_t len = myhtml_encoding_codepoint_to_ascii_utf_8(res->result, &str->data[str->length]);
        
        if(len == 1)
        {
            // change \r\n to \n
            if(str->length > 0 && str->data[(str->length - 1)] == '\r') {
                str->data[(str->length - 1)] = '\n';
                
                /* LF of a CR LF pair: drop it, the slot is reused next round */
                if(str->data[str->length] == '\n') {
                    continue;
                }
            }
            
            /*
             * Checked for every single-byte result, including one that
             * follows a CR -- previously a NUL right after a CR skipped
             * this replacement and was stored raw.
             */
            if(str->data[str->length] == 0x00) {
                MyHTML_STRING_REALLOC_IF_NEED(str, (length + 4), 32);
                
                // Unicode Character 'REPLACEMENT CHARACTER' (U+FFFD)
                str->data[str->length] = (char)0xEF; str->length++;
                str->data[str->length] = (char)0xBF; str->length++;
                str->data[str->length] = (char)0xBD;
            }
        }
        
        str->length += len;
    }
    
    MyHTML_STRING_APPEND_BYTE_WITHOUT_INCREMENT('\0', str, 1);
}
/*
 * Pass the single byte `data` to the decoder selected by `encoding`.
 * When the decoder reports MyHTML_ENCODING_STATUS_OK, res->result is
 * appended to `str` as UTF-8 and '\0' is stored after it.
 * Any other decoder status leaves `str` untouched.
 */
void myhtml_string_append_one_with_convert_encoding(myhtml_string_t* str, myhtml_encoding_result_t* res, const char data, myhtml_encoding_t encoding)
{
    const myhtml_encoding_custom_f decode = myhtml_encoding_get_function_by_id(encoding);
    
    if(decode((unsigned const char)data, res) != MyHTML_ENCODING_STATUS_OK) {
        return;
    }
    
    MyHTML_STRING_REALLOC_IF_NEED(str, 5, 0);
    
    size_t encoded = myhtml_encoding_codepoint_to_ascii_utf_8(res->result, &str->data[str->length]);
    str->length += encoded;
    
    MyHTML_STRING_APPEND_BYTE_WITHOUT_INCREMENT_REALLOC('\0', str);
}
/*
 * Resume the entry lookup with myhtml_charef_find_by_pos, starting from
 * chunk->charef_res.curr_entry->next, and act on the outcome:
 *
 *   - lookup not finished: the bytes consumed so far are appended to
 *     `str` as ordinary text;
 *   - lookup finished: chunk->state is reset to 0. A following ';' is
 *     consumed. Without ';', and when chunk->is_attributes is set and the
 *     next byte is '=' or has an entry other than 0xff in
 *     myhtml_string_alphanumeric_character, the consumed bytes are
 *     appended as ordinary text and the function returns early.
 *     Otherwise the entry's code points are written as UTF-8 at
 *     chunk->begin (or, if the entry has none, the consumed bytes are
 *     appended as text) and chunk->charef_res.last_entry is cleared.
 *
 * Returns the updated offset into `buff`.
 */
size_t _myhtml_string_append_char_references_state_2(myhtml_string_char_ref_chunk_t *chunk, myhtml_string_t* str, const char* buff, size_t offset, size_t size)
{
    const size_t start = offset;
    
    const charef_entry_t *found = myhtml_charef_find_by_pos(chunk->charef_res.curr_entry->next, buff, &offset, size, &chunk->charef_res);
    
    if(!chunk->charef_res.is_done)
    {
        /* still in the middle of a name: keep the raw bytes */
        if(chunk->encoding == MyHTML_ENCODING_UTF_8) {
            myhtml_string_append_with_preprocessing(str, &buff[start], (offset - start));
        }
        else {
            myhtml_string_append_chunk_with_convert_encoding_with_preprocessing(str, &chunk->res, &buff[start], (offset - start), chunk->encoding);
        }
        
        return offset;
    }
    
    chunk->state = 0;
    
    if(buff[offset] == ';') {
        offset++;
    }
    else if(chunk->is_attributes &&
            (buff[offset] == '=' || myhtml_string_alphanumeric_character[ (unsigned char)buff[offset] ] != 0xff))
    {
        if(chunk->encoding == MyHTML_ENCODING_UTF_8) {
            myhtml_string_append_with_preprocessing(str, &buff[start], (offset - start));
        }
        else {
            myhtml_string_append_chunk_with_convert_encoding_with_preprocessing(str, &chunk->res, &buff[start], (offset - start), chunk->encoding);
        }
        
        return offset;
    }
    
    if(found->codepoints_len == 0)
    {
        if(chunk->encoding == MyHTML_ENCODING_UTF_8) {
            myhtml_string_append_with_preprocessing(str, &buff[start], (offset - start));
        }
        else {
            myhtml_string_append_chunk_with_convert_encoding_with_preprocessing(str, &chunk->res, &buff[start], (offset - start), chunk->encoding);
        }
    }
    else
    {
        /* overwrite the text from chunk->begin with the entry's code points */
        for (size_t idx = 0; idx < found->codepoints_len; idx++) {
            MyHTML_STRING_REALLOC_IF_NEED(str, (chunk->begin + 4), 32);
            
            chunk->begin += myhtml_encoding_codepoint_to_ascii_utf_8(found->codepoints[idx], &str->data[chunk->begin]);
        }
        
        str->length = chunk->begin;
    }
    
    chunk->charef_res.last_entry = NULL;
    
    return offset;
}
/*
 * Feed `length` bytes of `buff` one at a time to the decoder selected by
 * `encoding` and append every completed code point (res->result) to
 * `str` as UTF-8. No CR/LF or NUL preprocessing is done here.
 * A '\0' terminator is stored after the last byte.
 */
void myhtml_string_append_chunk_with_convert_encoding(myhtml_string_t* str, myhtml_encoding_result_t* res, const char* buff, size_t length, myhtml_encoding_t encoding)
{
    const unsigned char *raw = (const unsigned char*)buff;
    const myhtml_encoding_custom_f decode = myhtml_encoding_get_function_by_id(encoding);
    
    for (size_t pos = 0; pos < length; pos++)
    {
        if(decode(raw[pos], res) != MyHTML_ENCODING_STATUS_OK) {
            continue;
        }
        
        MyHTML_STRING_REALLOC_IF_NEED(str, 5, 0);
        
        size_t encoded = myhtml_encoding_codepoint_to_ascii_utf_8(res->result, &str->data[str->length]);
        str->length += encoded;
    }
    
    MyHTML_STRING_APPEND_BYTE_WITHOUT_INCREMENT('\0', str, 1);
}
/*
 * Append `length` bytes of `data` to `str`, translating each byte through
 * myhtml_string_chars_lowercase_map, and NUL-terminate the result.
 * str->length grows by `length`.
 */
void myhtml_string_append_lowercase(myhtml_string_t* str, const char* data, size_t length)
{
    MyHTML_STRING_REALLOC_IF_NEED(str, (length + 1), 0);
    
    const unsigned char *src = (const unsigned char*)data;
    unsigned char *dst = (unsigned char*)str->data;
    
    for (size_t remaining = length; remaining != 0; remaining--) {
        dst[str->length++] = myhtml_string_chars_lowercase_map[ *src++ ];
    }
    
    dst[str->length] = '\0';
}
/*
 * Append `length` bytes of `buff` to `str` with input preprocessing:
 *   - CR (0x0D) is stored as LF (0x0A); an LF directly after it in this
 *     buffer is skipped, so CR LF becomes a single LF;
 *   - NUL (0x00) is stored as EF BF BD (U+FFFD) unless `emit_null_chars`
 *     is true, in which case it is copied unchanged;
 *   - every other byte is copied unchanged.
 *
 * Returns str->length (position just past the stored LF) when the buffer
 * ends with a CR -- in that case no terminator is written.
 * Otherwise writes a '\0' terminator and returns 0.
 */
size_t myhtml_string_append_with_preprocessing(myhtml_string_t* str, const char* buff, size_t length, bool emit_null_chars)
{
    MyHTML_STRING_REALLOC_IF_NEED(str, (length + 1), 0);
    
    unsigned char *out = (unsigned char*)str->data;
    const unsigned char *src = (const unsigned char*)buff;
    
    for (size_t pos = 0; pos < length; pos++)
    {
        const unsigned char ch = src[pos];
        
        switch (ch) {
            case 0x0D: /* \r */
                out[str->length++] = 0x0A; /* \n */
                
                /* CR is the last byte: report where it landed and stop */
                if((pos + 1) >= length) {
                    return str->length;
                }
                
                if(src[(pos + 1)] == 0x0A) {
                    pos++;
                }
                break;
                
            case 0x00:
                if(emit_null_chars) {
                    out[str->length++] = ch;
                    break;
                }
                
                /* grow for the extra bytes and pick up the (possibly new) buffer */
                myhtml_string_realloc(str, (str->size + 5));
                out = (unsigned char*)str->data;
                
                // Unicode Character 'REPLACEMENT CHARACTER' (U+FFFD)
                out[str->length++] = 0xEF;
                out[str->length++] = 0xBF;
                out[str->length++] = 0xBD;
                break;
                
            default:
                out[str->length++] = ch;
                break;
        }
    }
    
    str->data[str->length] = '\0';
    
    return 0;
}
/*
 * Rewind `str` to chunk->begin and write the code point held in
 * chunk->l_data there as UTF-8, followed by a '\0' terminator.
 *
 * Before encoding, chunk->l_data is remapped in place:
 *   - values up to 0x9F are looked up in replacement_character[];
 *   - values in 0xD800..0xDFFF, and values above 0x10FFFF, become
 *     replacement_character[0].
 */
void _myhtml_string_append_char_references_state_end(myhtml_string_char_ref_chunk_t *chunk, myhtml_string_t* str)
{
    str->length = chunk->begin;
    
    /* room for up to 4 UTF-8 bytes plus the terminator */
    MyHTML_STRING_REALLOC_IF_NEED(str, (chunk->begin + 5), 12);
    
    if(chunk->l_data <= 0x9F) {
        chunk->l_data = replacement_character[chunk->l_data];
    }
    else if((chunk->l_data >= 0xD800 && chunk->l_data <= 0xDFFF) || chunk->l_data > 0x10FFFF) {
        chunk->l_data = replacement_character[0];
    }
    
    size_t encoded = myhtml_encoding_codepoint_to_ascii_utf_8(chunk->l_data, &str->data[chunk->begin]);
    
    str->length += encoded;
    str->data[str->length] = '\0';
}
/*
 * Resume the entry lookup with myhtml_charef_find_by_pos, starting from
 * chunk->entry->next, and store the entry it returns in chunk->entry.
 *
 *   - lookup finished and the entry has code points: chunk->state is
 *     reset to 0, the code points are written as UTF-8 at chunk->begin
 *     and str->length is set to the new chunk->begin;
 *   - lookup finished but the entry has no code points: chunk->state is
 *     reset to 0 and the consumed bytes are appended as text;
 *   - lookup not finished: the consumed bytes are appended as text.
 *
 * Returns the updated offset into `buff`.
 */
size_t _myhtml_string_append_char_references_state_2(myhtml_string_char_ref_chunk_t *chunk, myhtml_string_t* str, const char* buff, size_t offset, size_t size)
{
    int finished = 0;
    const size_t start = offset;
    
    chunk->entry = myhtml_charef_find_by_pos(chunk->entry->next, buff, &offset, size, &finished);
    
    if(finished) {
        chunk->state = 0;
    }
    
    if(finished && chunk->entry->codepoints_len)
    {
        for (size_t idx = 0; idx < chunk->entry->codepoints_len; idx++) {
            MyHTML_STRING_REALLOC_IF_NEED(str, (chunk->begin + 4), 32);
            
            chunk->begin += myhtml_encoding_codepoint_to_ascii_utf_8(chunk->entry->codepoints[idx], &str->data[chunk->begin]);
        }
        
        str->length = chunk->begin;
        return offset;
    }
    
    /* nothing to substitute: keep the consumed bytes as plain text */
    if(chunk->encoding == MyHTML_ENCODING_UTF_8) {
        myhtml_string_append(str, &buff[start], (offset - start));
    }
    else {
        myhtml_string_append_chunk_with_convert_encoding(str, &chunk->res, &buff[start], (offset - start), chunk->encoding);
    }
    
    return offset;
}
/*
 * Flush whatever the character-reference chunk is still holding:
 *   - state 4 or 5: delegate to
 *     _myhtml_string_append_char_references_state_end();
 *   - state 2 with a remembered chunk->charef_res.last_entry: write that
 *     entry's code points as UTF-8 at chunk->begin and set str->length
 *     to the new chunk->begin.
 *
 * Afterwards, a '\r' in the last stored byte of `str` is rewritten
 * to '\n'.
 */
void myhtml_string_append_charef_end(myhtml_string_char_ref_chunk_t *chunk, myhtml_string_t* str)
{
    switch (chunk->state) {
        case 4:
        case 5:
            _myhtml_string_append_char_references_state_end(chunk, str);
            break;
            
        case 2:
            if(chunk->charef_res.last_entry)
            {
                const charef_entry_t *last = chunk->charef_res.last_entry;
                
                for (size_t idx = 0; idx < last->codepoints_len; idx++) {
                    MyHTML_STRING_REALLOC_IF_NEED(str, (chunk->begin + 4), 32);
                    
                    chunk->begin += myhtml_encoding_codepoint_to_ascii_utf_8(last->codepoints[idx], &str->data[chunk->begin]);
                }
                
                str->length = chunk->begin;
            }
            break;
            
        default:
            break;
    }
    
    /* a CR left at the very end is turned into LF */
    if(str->length != 0 && str->data[ (str->length - 1) ] == '\r') {
        str->data[ (str->length - 1) ] = '\n';
    }
}
/*
 * Append the single byte `data` to `str`, followed by a '\0'.
 *
 * The body is three project macros whose definitions are not visible here:
 *   - MyHTML_STRING_REALLOC_IF_NEED(str, 2, 1): presumably makes sure there
 *     is room for two more bytes (the byte and the terminator);
 *   - MyHTML_STRING_APPEND_BYTE_WITHOUT_REALLOC(data, str): presumably
 *     stores `data` and advances str->length -- TODO confirm;
 *   - MyHTML_STRING_APPEND_BYTE_WITHOUT_INCREMENT_REALLOC('\0', str):
 *     presumably stores the terminator without advancing str->length.
 */
void myhtml_string_append_one(myhtml_string_t* str, const char data) { MyHTML_STRING_REALLOC_IF_NEED(str, 2, 1); MyHTML_STRING_APPEND_BYTE_WITHOUT_REALLOC(data, str); MyHTML_STRING_APPEND_BYTE_WITHOUT_INCREMENT_REALLOC('\0', str); }