// Returns how many bytes the first num_chars characters of str occupy.
//
// The width of each character is whatever codepoint_size() reports for the
// character's first byte; the bytes in between are skipped, not inspected.
// The caller must make sure str really contains at least num_chars
// characters, because no terminator or bounds check is performed here.
static size_t count_bytes_in_utf8(const uint8_t *str, size_t num_chars) {
  const uint8_t *p = str;
  // The counter has the same type as num_chars. A narrower counter (for
  // example unsigned int on an LP64 target) would wrap around before reaching
  // a bound larger than UINT_MAX and the loop would never finish.
  for (size_t i = 0; i < num_chars; i++) {
    p += codepoint_size(*p);
  }
  return (size_t)(p - str);
}
// Post-increment: moves the underlying byte iterator forward by the width
// codepoint_size() reports for the byte currently pointed at, and returns an
// iterator built from the position held before the move.
inline iterator operator++(int junk) {
    iterator previous(raw_iterator_);
    const char lead = *raw_iterator_;
    const int step = codepoint_size(lead);
    raw_iterator_ += step;
    return previous;
}
// Returns how many wchar units the first num_chars characters of str take up.
//
// Every character contributes one unit, plus one more when
// NEEDS_TWO_WCHARS() is true for the character's first byte. Characters are
// stepped over using codepoint_size() on that same first byte. The caller
// must make sure str contains at least num_chars characters; nothing here
// checks for a terminator.
static size_t count_wchars_in_utf8(const uint8_t *str, size_t num_chars) {
  size_t wchars = 0;
  // The counter has the same type as num_chars so that it cannot wrap before
  // reaching the bound (an unsigned int counter could on LP64 targets).
  for (size_t i = 0; i < num_chars; i++) {
    wchars += 1 + NEEDS_TWO_WCHARS(*str);
    str += codepoint_size(*str);
  }
  return wchars;
}
// Returns the number of characters in the NUL-terminated string str.
// The string is walked one character at a time, using codepoint_size() on
// each leading byte to find where the next character begins.
static size_t strlen_utf8(const uint8_t *str) {
  size_t num_chars = 0;
  for (const uint8_t *cursor = str; *cursor; cursor += codepoint_size(*cursor)) {
    num_chars++;
  }
  return num_chars;
}
// Returns how many characters of str are covered by num_wchars wchar units.
//
// The result starts at num_wchars (one character per unit) and is reduced by
// one for every character for which NEEDS_TWO_WCHARS() is true on its first
// byte, since such a character uses up two units instead of one. Characters
// are stepped over using codepoint_size() on that same first byte. The caller
// must make sure str is long enough; no terminator check is performed.
static size_t count_utf8_in_wchars(const uint8_t *str, size_t num_wchars) {
  size_t chars = num_wchars;
  // i counts consumed wchar units. It has the same type as num_wchars so it
  // cannot wrap before reaching the bound (an unsigned int counter could on
  // LP64 targets).
  for (size_t i = 0; i < num_wchars; i++) {
    if (NEEDS_TWO_WCHARS(*str)) {
      // This character accounts for two units: drop one from the character
      // total and consume the extra unit.
      chars--;
      i++;
    }
    str += codepoint_size(*str);
  }
  return chars;
}
// Dereference: reads the bytes of the character at the current position into
// a zero-filled scratch buffer and hands them to octets_to_codepoint().
// The number of bytes read is whatever codepoint_size() reports for the first
// byte. The iterator itself is not advanced; a copy is used for reading.
inline const codepoint operator*() const {
    char octets[max_codepoint_size];
    memset(octets, 0, max_codepoint_size);

    enc_buffer_type::const_iterator cursor = raw_iterator_;
    octets[0] = *cursor;
    ++cursor;

    const difference_type size = codepoint_size(octets[0]);
    int filled = 1;
    while (filled < size) {
        octets[filled] = *cursor;
        ++cursor;
        ++filled;
    }

    return octets_to_codepoint(octets, size);
}
// Validates the NUL-terminated UTF-8 string str and measures it.
// Returns the length of the string in bytes (terminator excluded) when every
// character is well formed, or -1 as soon as a problem is found: either
// codepoint_size() reports SIZE_MAX for a leading byte, or one of the bytes
// following a leading byte is not a continuation byte.
static ssize_t bytelen_and_check_utf8(const uint8_t *str) {
  const uint8_t *cursor = str;
  while (*cursor != '\0') {
    const size_t width = codepoint_size(*cursor);
    if (width == SIZE_MAX) return -1;
    cursor++;

    // Every byte after the leading one must match the bit pattern 10xxxxxx.
    // A NUL that cuts a character short fails this check too.
    for (size_t remaining = width - 1; remaining > 0; remaining--) {
      if ((*cursor & 0xc0) != 0x80) return -1;
      cursor++;
    }
  }

#ifdef DEBUG
  size_t num = cursor - str;
  assert(num == strlen((char *)str));
#endif

  return cursor - str;
}
// Insert the given utf8 string into the rope at the position described by iter.
//
// r    - the rope whose num_bytes / num_chars totals are updated.
// e    - the node the iterator currently points into.
// iter - iter->s[0].skip_size is read as the insertion offset (in characters)
//        into e; iter is also handed on to update_offset_list() and insert_at().
// str  - NUL-terminated string to insert; it is validated first.
//
// Returns ROPE_INVALID_UTF8 (and changes nothing) if bytelen_and_check_utf8()
// rejects str, otherwise ROPE_OK.
//
// Three cases are handled:
//  1. The new bytes fit in e: they are spliced into e->str in place.
//  2. They do not fit in e, the insertion point is at the very end of e, and
//     the next node has room: they are spliced in at the start of that node.
//  3. Otherwise: the tail of e after the insertion point is logically removed,
//     the new data is added through insert_at() in chunks of at most
//     ROPE_NODE_STR_SIZE bytes, and the tail is then re-added the same way.
static ROPE_RESULT rope_insert_at_iter(rope *r, rope_node *e, rope_iter *iter, const uint8_t *str) {
  // iter->s[0].skip_size contains how far (in characters) into the current
  // element to skip. Figure out how much that is in bytes.
  size_t offset_bytes = 0;
  // The insertion offset into the destination node.
  size_t offset = iter->s[0].skip_size;
  if (offset) {
    assert(offset <= e->nexts[0].skip_size);
    offset_bytes = count_bytes_in_utf8(e->str, offset);
  }

  // We might be able to insert the new data into the current node, depending on
  // how big it is. We'll count the bytes, and also check that it's valid utf8.
  ssize_t num_inserted_bytes = bytelen_and_check_utf8(str);
  if (num_inserted_bytes == -1) return ROPE_INVALID_UTF8;

  // Can we insert into the current node?
  bool insert_here = e->num_bytes + num_inserted_bytes <= ROPE_NODE_STR_SIZE;

  // Can we insert into the subsequent node?
  rope_node *next = NULL;
  if (!insert_here && offset_bytes == e->num_bytes) {
    next = e->nexts[0].node;
    // We can insert into the subsequent node if:
    // - We can't insert into the current node
    // - There _is_ a next node to insert into
    // - The insert would be at the start of the next node
    // - There's room in the next node
    if (next && next->num_bytes + num_inserted_bytes <= ROPE_NODE_STR_SIZE) {
      // Retarget the insertion at the very start of the next node.
      offset = offset_bytes = 0;
      for (int i = 0; i < next->height; i++) {
        iter->s[i].node = next;
        // tree offset nodes will not be used.
      }
      e = next;
      insert_here = true;
    }
  }

  if (insert_here) {
    // First move the current bytes later on in the string. memmove is used
    // because the source and destination ranges lie in the same buffer.
    if (offset_bytes < e->num_bytes) {
      memmove(&e->str[offset_bytes + num_inserted_bytes],
              &e->str[offset_bytes],
              e->num_bytes - offset_bytes);
    }

    // Then copy in the string bytes
    memcpy(&e->str[offset_bytes], str, num_inserted_bytes);
    e->num_bytes += num_inserted_bytes;

    r->num_bytes += num_inserted_bytes;
    size_t num_inserted_chars = strlen_utf8(str);
    r->num_chars += num_inserted_chars;

    // .... aaaand update all the offset amounts.
#if ROPE_WCHAR
    size_t num_inserted_wchars = count_wchars_in_utf8(str, num_inserted_chars);
    update_offset_list(r, iter, num_inserted_chars, num_inserted_wchars);
#else
    update_offset_list(r, iter, num_inserted_chars);
#endif
  } else {
    // There isn't room. We'll need to add at least one new node to the rope.

    // If we're not at the end of the current node, we'll need to remove
    // the end of the current node's data and reinsert it later.
    // num_end_chars is only assigned (and only read) when num_end_bytes != 0.
    size_t num_end_chars, num_end_bytes = e->num_bytes - offset_bytes;
    if (num_end_bytes) {
      // We'll pretend like the characters have been deleted from the node, while
      // leaving the bytes themselves there (for later).
      e->num_bytes = offset_bytes;
      num_end_chars = e->nexts[0].skip_size - offset;
      // The counts are negated to take the tail's size back out of the offsets.
#if ROPE_WCHAR
      size_t num_end_wchars = count_wchars_in_utf8(&e->str[offset_bytes], num_end_chars);
      update_offset_list(r, iter, -num_end_chars, -num_end_wchars);
#else
      update_offset_list(r, iter, -num_end_chars);
#endif

      r->num_chars -= num_end_chars;
      r->num_bytes -= num_end_bytes;
    }

    // Now we insert new nodes containing the new character data. The data must be
    // broken into pieces with a maximum size of ROPE_NODE_STR_SIZE. Node boundaries
    // must not occur in the middle of a utf8 codepoint.
    size_t str_offset = 0;
    while (str_offset < num_inserted_bytes) {
      size_t new_node_bytes = 0;
      size_t new_node_chars = 0;

      // Take whole characters until the next one would overflow the chunk.
      while (str_offset + new_node_bytes < num_inserted_bytes) {
        size_t cs = codepoint_size(str[str_offset + new_node_bytes]);
        if (cs + new_node_bytes > ROPE_NODE_STR_SIZE) {
          break;
        } else {
          new_node_bytes += cs;
          new_node_chars++;
        }
      }

      insert_at(r, iter, &str[str_offset], new_node_bytes, new_node_chars);
      str_offset += new_node_bytes;
    }

    // Finally re-add the tail that was cut off the original node; its bytes
    // are still sitting in e->str past the shortened length.
    if (num_end_bytes) {
      insert_at(r, iter, &e->str[offset_bytes], num_end_bytes, num_end_chars);
    }
  }

  return ROPE_OK;
}
// Insert the given utf8 string into the rope at the specified position.
//
// r   - the rope to modify; must not be NULL (asserted).
// pos - insertion position in characters; clamped to r->num_chars.
// str - NUL-terminated string to insert; must not be NULL (asserted).
//
// NOTE(review): the byte length is taken with strlen() and no UTF-8 validation
// is performed in this function, so str is presumably expected to be valid
// already - confirm against callers.
//
// Three cases are handled:
//  1. The new bytes fit in the node found for pos: they are spliced in place.
//  2. They do not fit there (or no node was found), the insertion point is at
//     the end of that node, and the following node has room: they are spliced
//     in at the start of the following node.
//  3. Otherwise: the tail of the current node after the insertion point is
//     logically removed, the new data is added through insert_at() in chunks
//     of at most ROPE_NODE_STR_SIZE bytes, and the tail is then re-added.
void rope_insert(rope *r, size_t pos, const uint8_t *str) {
  assert(r);
  assert(str);
#ifdef DEBUG
  _rope_check(r);
#endif
  pos = MIN(pos, r->num_chars);

  // There's a good chance we'll have to rewrite a bunch of next pointers and a bunch
  // of offsets. This variable will store pointers to the elements which need to
  // be changed.
  rope_node *nodes[UINT8_MAX];
  size_t tree_offsets[UINT8_MAX];

  // This is the number of characters to skip in the current node.
  size_t offset;

  // First we need to search for the node where we'll insert the string.
  // e may come back NULL; every later use of e checks for that.
  rope_node *e = go_to_node(r, pos, &offset, nodes, tree_offsets);

  // offset contains how far (in characters) into the current element to skip.
  // Figure out how much that is in bytes.
  size_t offset_bytes = 0;
  if (e && offset) {
    // NOTE(review): this compares a character count with a byte count. It holds
    // whenever each character is at least one byte, but it is a loose bound.
    assert(offset <= e->num_bytes);
    offset_bytes = count_bytes_in_chars(e->str, offset);
  }

  // Maybe we can insert the characters into the current node?
  size_t num_inserted_bytes = strlen((char *)str);

  // Can we insert into the current node?
  bool insert_here = e && e->num_bytes + num_inserted_bytes <= ROPE_NODE_STR_SIZE;

  // Can we insert into the subsequent node?
  bool insert_next = false;
  rope_node *next = NULL;
  if (!insert_here) {
    // With no current node, the candidate is the rope's first node (if the
    // rope has any characters at all).
    next = e ? e->nexts[0].node : (r->num_chars ? r->heads[0].node : NULL);
    // We can insert into the subsequent node if:
    // - We can't insert into the current node
    // - There _is_ a next node to insert into
    // - The insert would be at the start of the next node
    // - There's room in the next node
    insert_next = next
        && (e == NULL || offset_bytes == e->num_bytes)
        && next->num_bytes + num_inserted_bytes <= ROPE_NODE_STR_SIZE;
  }

  if (insert_here || insert_next) {
    if (insert_next) {
      // Retarget the insertion at the very start of the next node.
      offset = offset_bytes = 0;
      for (int i = 0; i < next->height; i++) {
        nodes[i] = next;
        // tree offset nodes not used.
      }
      e = next;
    }

    // First move the current bytes later on in the string. memmove is used
    // because the source and destination ranges lie in the same buffer.
    if (offset_bytes < e->num_bytes) {
      memmove(&e->str[offset_bytes + num_inserted_bytes],
              &e->str[offset_bytes],
              e->num_bytes - offset_bytes);
    }

    // Then copy in the string bytes
    memcpy(&e->str[offset_bytes], str, num_inserted_bytes);
    e->num_bytes += num_inserted_bytes;

    r->num_bytes += num_inserted_bytes;
    size_t num_inserted_chars = strlen_utf8(str);
    r->num_chars += num_inserted_chars;

    // .... aaaand update all the offset amounts.
    update_offset_list(r, nodes, num_inserted_chars);
  } else {
    // There isn't room. We'll need to add at least one new node to the rope.

    // If we're not at the end of the current node, we'll need to remove
    // the end of the current node's data and reinsert it later.
    // num_end_chars is only assigned (and only read) when num_end_bytes != 0.
    size_t num_end_bytes = 0, num_end_chars;
    if (e) {
      num_end_bytes = e->num_bytes - offset_bytes;
      // Shorten the node; the tail bytes stay in e->str for re-insertion below.
      e->num_bytes = offset_bytes;
      if (num_end_bytes) {
        // Count out characters.
        num_end_chars = e->nexts[0].skip_size - offset;
        // The count is negated to take the tail's size back out of the offsets.
        update_offset_list(r, nodes, -num_end_chars);

        r->num_chars -= num_end_chars;
        r->num_bytes -= num_end_bytes;
      }
    }

    // Now, we insert new node[s] containing the data. The data must
    // be broken into pieces with a maximum size of ROPE_NODE_STR_SIZE.
    // Node boundaries do not occur in the middle of a utf8 codepoint.
    size_t str_offset = 0;
    while (str_offset < num_inserted_bytes) {
      size_t new_node_bytes = 0;
      size_t new_node_chars = 0;

      // Take whole characters until the next one would overflow the chunk.
      while (str_offset + new_node_bytes < num_inserted_bytes) {
        size_t cs = codepoint_size(str[str_offset + new_node_bytes]);
        if (cs + new_node_bytes > ROPE_NODE_STR_SIZE) {
          break;
        } else {
          new_node_bytes += cs;
          new_node_chars++;
        }
      }

      insert_at(r, pos, &str[str_offset], new_node_bytes, new_node_chars, nodes, tree_offsets);
      // Advance the insertion position past the chunk just added.
      pos += new_node_chars;
      str_offset += new_node_bytes;
    }

    // Finally re-add the tail that was cut off the original node.
    if (num_end_bytes) {
      insert_at(r, pos, &e->str[offset_bytes], num_end_bytes, num_end_chars, nodes, tree_offsets);
    }
  }

#ifdef DEBUG
  _rope_check(r);
#endif
}
// Pre-increment: moves the underlying byte iterator forward by the width
// codepoint_size() reports for the byte currently pointed at, then returns
// this iterator (by value, as declared).
inline iterator operator++() {
    const difference_type step = codepoint_size(*raw_iterator_);
    raw_iterator_ += step;
    return *this;
}