Exemple #1
0
// Returns the number of bytes occupied by the first num_chars codepoints of
// the UTF-8 string str.
//
// The length of every codepoint is taken from codepoint_size() applied to its
// lead byte. No terminator or validity check is made here, so the caller must
// guarantee that str really holds at least num_chars codepoints.
static size_t count_bytes_in_utf8(const uint8_t *str, size_t num_chars) {
  const uint8_t *p = str;
  // The counter has the same type as num_chars: an unsigned int counter would
  // wrap around before reaching a num_chars larger than UINT_MAX and the loop
  // would never terminate.
  for (size_t i = 0; i < num_chars; i++) {
    p += codepoint_size(*p);
  }
  return (size_t)(p - str);
}
Exemple #2
0
 // Postfix increment: steps past the codepoint at the current position and
 // returns an iterator that still refers to the position held before the step.
 inline iterator operator++(int junk) {
   // Snapshot of the position before advancing; this is what gets returned.
   iterator previous(raw_iterator_);
   // The lead byte determines how many bytes the current codepoint spans.
   const char lead = *raw_iterator_;
   const int step = codepoint_size(lead);
   raw_iterator_ += step;
   return previous;
 }
Exemple #3
0
// Returns how many wide-char units are needed to represent the first
// num_chars codepoints of the UTF-8 string str.
//
// Every codepoint contributes one unit, plus one more when its lead byte
// satisfies NEEDS_TWO_WCHARS. The caller must guarantee that str holds at
// least num_chars codepoints; no terminator check is made here.
static size_t count_wchars_in_utf8(const uint8_t *str, size_t num_chars) {
  size_t wchars = 0;
  // The counter has the same type as num_chars: an unsigned int counter would
  // wrap around before reaching a num_chars larger than UINT_MAX and the loop
  // would never terminate.
  for (size_t i = 0; i < num_chars; i++) {
    wchars += 1 + NEEDS_TWO_WCHARS(*str);
    str += codepoint_size(*str);
  }
  return wchars;
}
Exemple #4
0
// Returns the number of codepoints in the NUL-terminated UTF-8 string str.
// The walk hops from lead byte to lead byte using codepoint_size() and stops
// at the first zero byte it lands on.
static size_t strlen_utf8(const uint8_t *str) {
  size_t num_chars = 0;
  for (const uint8_t *cursor = str; *cursor; cursor += codepoint_size(*cursor)) {
    num_chars++;
  }
  return num_chars;
}
Exemple #5
0
// Returns how many codepoints of the UTF-8 string str are covered by its
// first num_wchars wide-char units.
//
// The count starts at one codepoint per unit. For every codepoint whose lead
// byte satisfies NEEDS_TWO_WCHARS, one is taken off the count and one extra
// unit is consumed. The caller must guarantee that str holds enough
// codepoints; no terminator check is made here.
static size_t count_utf8_in_wchars(const uint8_t *str, size_t num_wchars) {
  size_t chars = num_wchars;
  // The counter has the same type as num_wchars: an unsigned int counter would
  // wrap around before reaching a num_wchars larger than UINT_MAX and the loop
  // would never terminate.
  for (size_t i = 0; i < num_wchars; i++) {
    if (NEEDS_TWO_WCHARS(*str)) {
      // This codepoint uses two units: skip the second one.
      chars--;
      i++;
    }
    str += codepoint_size(*str);
  }
  return chars;
}
Exemple #6
0
      // Decodes and returns the codepoint at the current position. The
      // iterator itself is not moved; the bytes are read through a copy.
      inline const codepoint operator*() const {
        // Scratch buffer for the encoded bytes, zero-filled so that any slot
        // beyond the sequence length stays 0.
        char octets[max_codepoint_size];
        memset(octets, 0, sizeof(octets));

        // The lead byte tells how long the whole sequence is.
        enc_buffer_type::const_iterator reader = raw_iterator_;
        octets[0] = *reader++;
        difference_type size = codepoint_size(octets[0]);

        // Fetch the remaining bytes of the sequence, if any.
        int filled = 1;
        while (filled < size) {
          octets[filled] = *reader++;
          ++filled;
        }
        return octets_to_codepoint(octets, size);
      }
Exemple #7
0
// Validates a NUL-terminated UTF-8 string and measures it at the same time.
//
// Returns the length of str in bytes (terminator excluded) when every lead
// byte is accepted by codepoint_size() and is followed by the right number of
// continuation bytes. Returns -1 otherwise.
static ssize_t bytelen_and_check_utf8(const uint8_t *str) {
  const uint8_t *cursor = str;
  while (*cursor != '\0') {
    // codepoint_size() reports SIZE_MAX for a byte that cannot start a sequence.
    size_t remaining = codepoint_size(*cursor);
    if (remaining == SIZE_MAX) return -1;

    // Step over the lead byte, then verify each trailing byte.
    cursor++;
    for (remaining--; remaining > 0; remaining--) {
      // Continuation bytes have the binary form 10xxxxxx. The terminating
      // NUL does not match, so a truncated sequence is rejected here as well.
      if ((*cursor & 0xc0) != 0x80) return -1;
      cursor++;
    }
  }

#ifdef DEBUG
  // Sanity check: the walk must have stopped exactly on the terminator.
  size_t walked = cursor - str;
  assert(walked == strlen((char *)str));
#endif

  return cursor - str;
}
Exemple #8
0
// Insert the given utf8 string into the rope at the position described by iter.
//
//   r    - the rope being modified; its num_bytes / num_chars totals are updated.
//   e    - the node the insert position falls in.
//   iter - iterator for the insert position; iter->s[0].skip_size is the offset
//          (in characters) into e.
//   str  - NUL-terminated string to insert. It is checked with
//          bytelen_and_check_utf8() before anything is modified.
//
// Returns ROPE_INVALID_UTF8 if str fails that check, ROPE_OK otherwise.
//
// Three strategies are tried in order: insert into e if the new bytes fit,
// insert at the start of the following node if the position is at the very
// end of e and the bytes fit there, and otherwise split e at the insert
// position and add new node(s) through insert_at().
static ROPE_RESULT rope_insert_at_iter(rope *r, rope_node *e, rope_iter *iter, const uint8_t *str) {
  // iter.offset contains how far (in characters) into the current element to skip.
  // Figure out how much that is in bytes.
  size_t offset_bytes = 0;
  // The insertion offset into the destination node.
  size_t offset = iter->s[0].skip_size;
  if (offset) {
    assert(offset <= e->nexts[0].skip_size);
    offset_bytes = count_bytes_in_utf8(e->str, offset);
  }

  // We might be able to insert the new data into the current node, depending on
  // how big it is. We'll count the bytes, and also check that it's valid utf8.
  ssize_t num_inserted_bytes = bytelen_and_check_utf8(str);
  if (num_inserted_bytes == -1) return ROPE_INVALID_UTF8;

  // Can we insert into the current node?
  bool insert_here = e->num_bytes + num_inserted_bytes <= ROPE_NODE_STR_SIZE;

  // Can we insert into the subsequent node?
  rope_node *next = NULL;
  if (!insert_here && offset_bytes == e->num_bytes) {
    next = e->nexts[0].node;
    // We can insert into the subsequent node if:
    // - We can't insert into the current node
    // - There _is_ a next node to insert into
    // - The insert would be at the start of the next node
    // - There's room in the next node
    if (next && next->num_bytes + num_inserted_bytes <= ROPE_NODE_STR_SIZE) {
      // Retarget the insert to offset 0 of the next node.
      offset = offset_bytes = 0;
      // Point the iterator at next on every level next takes part in.
      for (int i = 0; i < next->height; i++) {
        iter->s[i].node = next;
        // tree offset nodes will not be used.
      }
      e = next;

      insert_here = true;
    }
  }

  if (insert_here) {
    // First move the current bytes later on in the string.
    // (memmove because source and destination overlap inside e->str.)
    if (offset_bytes < e->num_bytes) {
      memmove(&e->str[offset_bytes + num_inserted_bytes],
              &e->str[offset_bytes],
              e->num_bytes - offset_bytes);
    }

    // Then copy in the string bytes
    memcpy(&e->str[offset_bytes], str, num_inserted_bytes);
    e->num_bytes += num_inserted_bytes;

    r->num_bytes += num_inserted_bytes;
    size_t num_inserted_chars = strlen_utf8(str);
    r->num_chars += num_inserted_chars;

    // .... aaaand update all the offset amounts.
#if ROPE_WCHAR
    size_t num_inserted_wchars = count_wchars_in_utf8(str, num_inserted_chars);
    update_offset_list(r, iter, num_inserted_chars, num_inserted_wchars);
#else
    update_offset_list(r, iter, num_inserted_chars);
#endif

  } else {
    // There isn't room. We'll need to add at least one new node to the rope.

    // If we're not at the end of the current node, we'll need to remove
    // the end of the current node's data and reinsert it later.
    // NOTE: num_end_chars is only assigned when num_end_bytes is non-zero; its
    // only later read (the final insert_at below) is guarded by the same test.
    size_t num_end_chars, num_end_bytes = e->num_bytes - offset_bytes;
    if (num_end_bytes) {
      // We'll pretend like the characters have been deleted from the node, while leaving
      // the bytes themselves there (for later).
      e->num_bytes = offset_bytes;
      num_end_chars = e->nexts[0].skip_size - offset;
      // NOTE(review): the counts below are unsigned size_t values being negated;
      // presumably update_offset_list() treats the wrapped value as a negative
      // delta - confirm against its definition.
#if ROPE_WCHAR
      size_t num_end_wchars = count_wchars_in_utf8(&e->str[offset_bytes], num_end_chars);
      update_offset_list(r, iter, -num_end_chars, -num_end_wchars);
#else
      update_offset_list(r, iter, -num_end_chars);
#endif

      r->num_chars -= num_end_chars;
      r->num_bytes -= num_end_bytes;
    }

    // Now we insert new nodes containing the new character data. The data must be broken into
    // pieces with a maximum size of ROPE_NODE_STR_SIZE. Node boundaries must not occur in the
    // middle of a utf8 codepoint.
    size_t str_offset = 0;
    while (str_offset < num_inserted_bytes) {
      size_t new_node_bytes = 0;
      size_t new_node_chars = 0;

      // Take whole codepoints for this piece until the next one would not fit
      // or the input is used up.
      while (str_offset + new_node_bytes < num_inserted_bytes) {
        size_t cs = codepoint_size(str[str_offset + new_node_bytes]);
        if (cs + new_node_bytes > ROPE_NODE_STR_SIZE) {
          break;
        } else {
          new_node_bytes += cs;
          new_node_chars++;
        }
      }

      insert_at(r, iter, &str[str_offset], new_node_bytes, new_node_chars);
      str_offset += new_node_bytes;
    }

    // Finally put back the tail that was cut off e above; its bytes are still
    // sitting in e->str starting at offset_bytes.
    if (num_end_bytes) {
      insert_at(r, iter, &e->str[offset_bytes], num_end_bytes, num_end_chars);
    }
  }

  return ROPE_OK;
}
Exemple #9
0
// Insert the given utf8 string into the rope at the specified position.
//
//   r   - the rope to modify (must not be NULL).
//   pos - insert position in characters; values past the end are clamped to
//         r->num_chars.
//   str - NUL-terminated string to insert (must not be NULL). Its byte length
//         is taken with strlen(); this function does not validate the utf8.
//
// Three strategies are tried in order: insert into the node found for pos if
// the new bytes fit, insert at the start of the following node if they fit
// there, and otherwise split the current node at the insert position and add
// new node(s) through insert_at().
void rope_insert(rope *r, size_t pos, const uint8_t *str) {
  assert(r);
  assert(str);
#ifdef DEBUG
  _rope_check(r);
#endif
  pos = MIN(pos, r->num_chars);

  // There's a good chance we'll have to rewrite a bunch of next pointers and a bunch
  // of offsets. This variable will store pointers to the elements which need to
  // be changed.
  rope_node *nodes[UINT8_MAX];
  size_t tree_offsets[UINT8_MAX];

  // This is the number of characters to skip in the current node.
  size_t offset;
  
  // First we need to search for the node where we'll insert the string.
  // NOTE(review): e may come back NULL (every use below is guarded);
  // presumably that means pos lies before the first node - confirm against
  // go_to_node().
  rope_node *e = go_to_node(r, pos, &offset, nodes, tree_offsets);
  
  // offset contains how far (in characters) into the current element to skip.
  // Figure out how much that is in bytes.
  size_t offset_bytes = 0;
  if (e && offset) {
    assert(offset <= e->num_bytes);
    offset_bytes = count_bytes_in_chars(e->str, offset);
  }
  
  // Maybe we can insert the characters into the current node?
  size_t num_inserted_bytes = strlen((char *)str);

  // Can we insert into the current node?
  bool insert_here = e && e->num_bytes + num_inserted_bytes <= ROPE_NODE_STR_SIZE;
  
  // Can we insert into the subsequent node?
  bool insert_next = false;
  rope_node *next = NULL;
  if (!insert_here) {
    // Without a current node, the candidate is the rope's first node (if the
    // rope is non-empty).
    next = e ? e->nexts[0].node : (r->num_chars ? r->heads[0].node : NULL);
    // We can insert into the subsequent node if:
    // - We can't insert into the current node
    // - There _is_ a next node to insert into
    // - The insert would be at the start of the next node
    // - There's room in the next node
    insert_next = next
        && (e == NULL || offset_bytes == e->num_bytes)
        && next->num_bytes + num_inserted_bytes <= ROPE_NODE_STR_SIZE;
  }
  
  if (insert_here || insert_next) {
    if (insert_next) {
      // Retarget the insert to offset 0 of the next node.
      offset = offset_bytes = 0;
      for (int i = 0; i < next->height; i++) {
        nodes[i] = next;
        // tree offset nodes not used.
      }
      e = next;
    }
    
    // First move the current bytes later on in the string.
    // (memmove because source and destination overlap inside e->str.)
    if (offset_bytes < e->num_bytes) {
      memmove(&e->str[offset_bytes + num_inserted_bytes],
              &e->str[offset_bytes],
              e->num_bytes - offset_bytes);
    }
    
    // Then copy in the string bytes
    memcpy(&e->str[offset_bytes], str, num_inserted_bytes);
    e->num_bytes += num_inserted_bytes;
    
    r->num_bytes += num_inserted_bytes;
    size_t num_inserted_chars = strlen_utf8(str);
    r->num_chars += num_inserted_chars;
    
    // .... aaaand update all the offset amounts.
    update_offset_list(r, nodes, num_inserted_chars);
  } else {
    // There isn't room. We'll need to add at least one new node to the rope.
    
    // If we're not at the end of the current node, we'll need to remove
    // the end of the current node's data and reinsert it later.
    // NOTE: num_end_chars is only assigned when num_end_bytes is non-zero; its
    // only later read (the final insert_at below) is guarded by the same test.
    size_t num_end_bytes = 0, num_end_chars;
    if (e) {
      num_end_bytes = e->num_bytes - offset_bytes;
      // Truncate the node at the insert position. The cut-off bytes stay in
      // e->str and are handed to insert_at() at the end of this function.
      e->num_bytes = offset_bytes;
      if (num_end_bytes) {
        // Count out characters.
        num_end_chars = e->nexts[0].skip_size - offset;
        // NOTE(review): num_end_chars is an unsigned size_t being negated;
        // presumably update_offset_list() treats the wrapped value as a
        // negative delta - confirm against its definition.
        update_offset_list(r, nodes, -num_end_chars);
        
        r->num_chars -= num_end_chars;
        r->num_bytes -= num_end_bytes;
      }
    }
    
    // Now, we insert new node[s] containing the data. The data must
    // be broken into pieces with a maximum size of ROPE_NODE_STR_SIZE.
    // Node boundaries do not occur in the middle of a utf8 codepoint.
    size_t str_offset = 0;
    while (str_offset < num_inserted_bytes) {
      size_t new_node_bytes = 0;
      size_t new_node_chars = 0;
      
      // Take whole codepoints for this piece until the next one would not fit
      // or the input is used up.
      while (str_offset + new_node_bytes < num_inserted_bytes) {
        size_t cs = codepoint_size(str[str_offset + new_node_bytes]);
        if (cs + new_node_bytes > ROPE_NODE_STR_SIZE) {
          break;
        } else {
          new_node_bytes += cs;
          new_node_chars++;
        }
      }
      
      insert_at(r, pos, &str[str_offset], new_node_bytes, new_node_chars, nodes, tree_offsets);
      // The next piece goes right after the characters just inserted.
      pos += new_node_chars;
      str_offset += new_node_bytes;
    }
    
    // Finally put back the tail that was cut off e above.
    if (num_end_bytes) {
      insert_at(r, pos, &e->str[offset_bytes], num_end_bytes, num_end_chars, nodes, tree_offsets);
    }
  }
  
#ifdef DEBUG
  _rope_check(r);
#endif
}
Exemple #10
0
 // Prefix increment: moves past the codepoint at the current position, using
 // the byte count that codepoint_size() reports for its lead byte, and
 // returns this iterator by value after the move.
 inline iterator operator++() {
   raw_iterator_ += static_cast<difference_type>(codepoint_size(*raw_iterator_));
   return *this;
 }