示例#1
0
// Decodes one UTF-8 character starting at iter->_start (which must point at
// the first byte of a character).  On return, iter->_current holds the
// resulting code point (-1 once the input is exhausted, kUtf8ReplacementChar
// for malformed or truncated input) and iter->_width holds the number of bytes
// it occupies.  Problems are reported through add_error().
static void read_char(Utf8Iterator* iter) {
  const char* const begin = iter->_start;
  const char* const end = iter->_end;

  // Nothing left to read: signal EOF with a zero-width character.
  if (begin >= end) {
    iter->_width = 0;
    iter->_current = -1;
    return;
  }

  uint32_t decoded = 0;
  uint32_t decoder_state = UTF8_ACCEPT;
  for (const char* cursor = begin; cursor < end; ++cursor) {
    decode(&decoder_state, &decoded, (uint32_t)(unsigned char) *cursor);

    if (decoder_state == UTF8_REJECT) {
      // A bad lead byte is swallowed on its own.  A bad continuation byte is
      // left in the stream, so only the bytes before it are consumed.
      iter->_width = (cursor == begin) ? 1 : cursor - begin;
      iter->_current = kUtf8ReplacementChar;
      add_error(iter, GUMBO_ERR_UTF8_INVALID);
      return;
    }
    if (decoder_state != UTF8_ACCEPT) {
      // Mid-sequence: keep feeding bytes to the decoder.
      continue;
    }

    // A complete code point has been decoded.
    iter->_width = cursor - begin + 1;

    // HTML5 input preprocessing turns "\r\n" and a lone "\r" into "\n".  Both
    // characters are 7-bit, so peeking at the next raw byte (with a bounds
    // check) is enough; no second code point needs to be decoded.
    // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream
    if (decoded == '\r') {
      assert(iter->_width == 1);
      const char* following = cursor + 1;
      if (following < end && *following == '\n') {
        // Step over the carriage return as though it were never there...
        iter->_start += 1;
        // ...but keep the byte offset truthful for tools that do not know
        // about HTML5's newline normalization.
        iter->_pos.offset += 1;
      }
      decoded = '\n';
    }

    if (utf8_is_invalid_code_point(decoded)) {
      add_error(iter, GUMBO_ERR_UTF8_INVALID);
      decoded = kUtf8ReplacementChar;
    }
    iter->_current = decoded;
    return;
  }

  // The buffer ended in the middle of a multi-byte sequence.  Consume whatever
  // is left, emit a replacement character and report the truncation; the next
  // call will see an empty range and produce EOF.
  iter->_width = end - begin;
  iter->_current = kUtf8ReplacementChar;
  add_error(iter, GUMBO_ERR_UTF8_TRUNCATED);
}
示例#2
0
// Reads the next UTF-8 character in the iter.
// This assumes that iter->_start points to the beginning of the character.
// When this function returns, iter->_width holds the number of bytes consumed
// and iter->_current holds the decoded code point: kUtf8ReplacementChar for
// malformed or truncated input, -1 (with width 0) when no input remains.
// Decode problems are reported through add_error().
static void read_char(Utf8Iterator* iter) {
    // Never dereference past the end of the buffer: with nothing left to
    // consume, emit an EOF and set width = 0.
    if (iter->_start >= iter->_end) {
        iter->_current = -1;
        iter->_width = 0;
        return;
    }

    unsigned char c = (unsigned char) *iter->_start;
    unsigned char mask = '\0';
    bool is_bad_char = false;

    // Classify the lead byte: it determines how many bytes the sequence claims
    // to span and which of its low bits carry payload.
    if (c < 0x80) {
        // Valid one-byte sequence.
        iter->_width = 1;
        mask = 0xFF;
    } else if (c < 0xC0) {
        // Continuation character not following a multibyte sequence.
        // The HTML5 spec here says to consume the byte and output a replacement
        // character.
        iter->_width = 1;
        is_bad_char = true;
    } else if (c < 0xE0) {
        iter->_width = 2;
        mask = 0x1F;                // 00011111 in binary.
        if (c < 0xC2) {
            // Overlong encoding; error according to UTF8/HTML5 spec.
            is_bad_char = true;
        }
    } else if (c < 0xF0) {
        iter->_width = 3;
        mask = 0xF;                 // 00001111 in binary.
    } else if (c < 0xF5) {
        iter->_width = 4;
        mask = 0x7;                 // 00000111 in binary.
    } else if (c < 0xF8) {
        // The following cases are all errors, but we need to handle them
        // separately so that we consume the proper number of bytes from the
        // input stream before replacing them with the replacement char.  The
        // HTML5 spec specifies that we should consume the shorter of the length
        // specified by the first byte or the run leading up to the first
        // non-continuation character.
        // NOTE(review): 0xF5-0xF7 carry the 4-byte lead bit pattern yet are
        // consumed as 5-byte runs here (and the next two ranges are likewise
        // one longer than their bit pattern suggests).  Left unchanged; confirm
        // against the intended behavior.
        iter->_width = 5;
        is_bad_char = true;
    } else if (c < 0xFC) {
        iter->_width = 6;
        is_bad_char = true;
    } else if (c < 0xFE) {
        iter->_width = 7;
        is_bad_char = true;
    } else {
        iter->_width = 1;
        is_bad_char = true;
    }

    // Check to make sure we have enough bytes left in the iter to read all that
    // we want.  If not, we report truncation, mark this as a bad character, and
    // adjust the current width so that it consumes the rest of the iter.  The
    // comparison is done on lengths rather than on `_start + _width`, which
    // could form a pointer beyond one-past-the-end of the buffer.
    uint64_t code_point = c & mask;
    if (iter->_width > iter->_end - iter->_start) {
        iter->_width = (int)(iter->_end - iter->_start);
        add_error(iter, GUMBO_ERR_UTF8_TRUNCATED);
        is_bad_char = true;
    }

    // Now we decode continuation bytes, shift them appropriately, and build up
    // the appropriate code point.
    assert(iter->_width < 8);
    for (int i = 1; i < iter->_width; ++i) {
        c = (unsigned char) iter->_start[i];
        if (c < 0x80 || c > 0xBF) {
            // Per HTML5 spec, we don't include the invalid continuation char in
            // the run that we consume here.
            iter->_width = i;
            is_bad_char = true;
            break;
        }
        // Continuation bytes contribute their low six bits.
        code_point = (code_point << 6) | (c & 0x3F);
    }
    if (code_point > 0x10FFFF) {
        is_bad_char = true;
    }

    // Reject overlong 3- and 4-byte encodings, i.e. code points that would fit
    // in a shorter sequence (the 2-byte case is caught on the lead byte above).
    // Accepting them would let characters such as '<' slip past byte-level
    // filters in disguise.
    if ((iter->_width == 3 && code_point < 0x800) ||
        (iter->_width == 4 && code_point < 0x10000)) {
        is_bad_char = true;
    }

    // If we had a decode error, set the current code point to the replacement
    // character and record that a decode error occurred.  Ditto if we have a
    // code point that is explicitly on the list of characters prohibited by the
    // HTML5 spec, such as control characters.
    if (is_bad_char || utf8_is_invalid_code_point((int)code_point)) {
        add_error(iter, GUMBO_ERR_UTF8_INVALID);
        code_point = kUtf8ReplacementChar;
    }

    // This is the special handling for carriage returns that is mandated by the
    // HTML5 spec.  Since we're looking for particular 7-bit literal characters,
    // we operate in terms of chars and only need a check for iter overrun,
    // instead of having to read in a full next code point.
    // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream
    if (code_point == '\r') {
        const char* next = iter->_start + iter->_width;
        if (next < iter->_end && *next == '\n') {
            // Advance the iter, as if the carriage return didn't exist.
            ++iter->_start;
            // Preserve the true offset, since other tools that look at it may
            // be unaware of HTML5's rules for converting \r into \n.
            ++iter->_pos.offset;
        }
        code_point = '\n';
    }

    // At this point the code point is either a valid character or the
    // replacement character, so we store it, and we're done.
    iter->_current = (int)code_point;
}