Beispiel #1
0
	void Stream::StreamInUtf32() const
	{
		static int indexes[2][4] = {
			{3, 2, 1, 0},
			{0, 1, 2, 3}
		};

		unsigned long ch = 0;
		unsigned char bytes[4];
		int* pIndexes = (m_charSet == utf32be) ? indexes[1] : indexes[0];

		bytes[0] = GetNextByte();
		bytes[1] = GetNextByte();
		bytes[2] = GetNextByte();
		bytes[3] = GetNextByte();
		if (!m_input.good())
		{
			return;
		}

		for (int i = 0; i < 4; ++i)
		{
			ch <<= 8;
			ch |= bytes[pIndexes[i]];
		}

		QueueUnicodeCodepoint(m_readahead, ch);
	}
Beispiel #2
0
void Stream::StreamInUtf16() const {
  unsigned long ch = 0;
  unsigned char bytes[2];
  int nBigEnd = (m_charSet == utf16be) ? 0 : 1;

  bytes[0] = GetNextByte();
  bytes[1] = GetNextByte();
  if (!m_input.good()) {
    return;
  }
  ch = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) |
       static_cast<unsigned long>(bytes[1 ^ nBigEnd]);

  if (ch >= 0xDC00 && ch < 0xE000) {
    // Trailing (low) surrogate...ugh, wrong order
    QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
    return;
  } else if (ch >= 0xD800 && ch < 0xDC00) {
    // ch is a leading (high) surrogate

    // Four byte UTF-8 code point

    // Read the trailing (low) surrogate
    for (;;) {
      bytes[0] = GetNextByte();
      bytes[1] = GetNextByte();
      if (!m_input.good()) {
        QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);
        return;
      }
      unsigned long chLow = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) |
                            static_cast<unsigned long>(bytes[1 ^ nBigEnd]);
      if (chLow < 0xDC00 || chLow >= 0xE000) {
        // Trouble...not a low surrogate.  Dump a REPLACEMENT CHARACTER into the
        // stream.
        QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER);

        // Deal with the next UTF-16 unit
        if (chLow < 0xD800 || chLow >= 0xE000) {
          // Easiest case: queue the codepoint and return
          QueueUnicodeCodepoint(m_readahead, ch);
          return;
        } else {
          // Start the loop over with the new high surrogate
          ch = chLow;
          continue;
        }
      }

      // Select the payload bits from the high surrogate
      ch &= 0x3FF;
      ch <<= 10;

      // Include bits from low surrogate
      ch |= (chLow & 0x3FF);

      // Add the surrogacy offset
      ch += 0x10000;
      break;
    }
  }

  QueueUnicodeCodepoint(m_readahead, ch);
}