void Stream::StreamInUtf32() const { static int indexes[2][4] = { {3, 2, 1, 0}, {0, 1, 2, 3} }; unsigned long ch = 0; unsigned char bytes[4]; int* pIndexes = (m_charSet == utf32be) ? indexes[1] : indexes[0]; bytes[0] = GetNextByte(); bytes[1] = GetNextByte(); bytes[2] = GetNextByte(); bytes[3] = GetNextByte(); if (!m_input.good()) { return; } for (int i = 0; i < 4; ++i) { ch <<= 8; ch |= bytes[pIndexes[i]]; } QueueUnicodeCodepoint(m_readahead, ch); }
void Stream::StreamInUtf16() const { unsigned long ch = 0; unsigned char bytes[2]; int nBigEnd = (m_charSet == utf16be) ? 0 : 1; bytes[0] = GetNextByte(); bytes[1] = GetNextByte(); if (!m_input.good()) { return; } ch = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) | static_cast<unsigned long>(bytes[1 ^ nBigEnd]); if (ch >= 0xDC00 && ch < 0xE000) { // Trailing (low) surrogate...ugh, wrong order QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER); return; } else if (ch >= 0xD800 && ch < 0xDC00) { // ch is a leading (high) surrogate // Four byte UTF-8 code point // Read the trailing (low) surrogate for (;;) { bytes[0] = GetNextByte(); bytes[1] = GetNextByte(); if (!m_input.good()) { QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER); return; } unsigned long chLow = (static_cast<unsigned long>(bytes[nBigEnd]) << 8) | static_cast<unsigned long>(bytes[1 ^ nBigEnd]); if (chLow < 0xDC00 || chLow >= 0xE000) { // Trouble...not a low surrogate. Dump a REPLACEMENT CHARACTER into the // stream. QueueUnicodeCodepoint(m_readahead, CP_REPLACEMENT_CHARACTER); // Deal with the next UTF-16 unit if (chLow < 0xD800 || chLow >= 0xE000) { // Easiest case: queue the codepoint and return QueueUnicodeCodepoint(m_readahead, ch); return; } else { // Start the loop over with the new high surrogate ch = chLow; continue; } } // Select the payload bits from the high surrogate ch &= 0x3FF; ch <<= 10; // Include bits from low surrogate ch |= (chLow & 0x3FF); // Add the surrogacy offset ch += 0x10000; break; } } QueueUnicodeCodepoint(m_readahead, ch); }