static CFIndex __CFToCanonicalUnicodeCheapEightBitWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) { CFIndex processedByteLen = 0; CFIndex theUsedCharLen = 0; UTF32Char charBuffer[MAX_DECOMPOSED_LENGTH]; CFIndex usedLen; UniChar character; bool isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false); while ((processedByteLen < numBytes) && (!maxCharLen || (theUsedCharLen < maxCharLen))) { if (!((CFStringEncodingCheapEightBitToUnicodeProc)((const _CFEncodingConverter*)converter)->definition->toUnicode)(flags, bytes[processedByteLen], &character)) break; if (CFUniCharIsDecomposableCharacter(character, isHFSPlus)) { CFIndex idx; usedLen = CFUniCharDecomposeCharacter(character, charBuffer, MAX_DECOMPOSED_LENGTH); *usedCharLen = theUsedCharLen; for (idx = 0;idx < usedLen;idx++) { if (charBuffer[idx] > 0xFFFF) { // Non-BMP if (theUsedCharLen + 2 > maxCharLen) return processedByteLen; theUsedCharLen += 2; if (maxCharLen) { charBuffer[idx] = charBuffer[idx] - 0x10000; *(characters++) = (UniChar)(charBuffer[idx] >> 10) + 0xD800UL; *(characters++) = (UniChar)(charBuffer[idx] & 0x3FF) + 0xDC00UL; } } else { if (theUsedCharLen + 1 > maxCharLen) return processedByteLen; ++theUsedCharLen; *(characters++) = charBuffer[idx]; } } } else { if (maxCharLen) *(characters++) = character;
/*
 * Default lossy fallback: produce an approximate byte substitute for the
 * UTF-16 unit at characters[0] (plus characters[1] when the two form a
 * surrogate pair).
 *
 *   characters   input UTF-16 units; characters[0] is always read, and
 *                characters[1] is read only when numChars > 1 and
 *                characters[0] is a high surrogate.
 *   numChars     number of units available at `characters`.
 *   bytes        destination buffer; only written when maxByteLen is non-zero.
 *   maxByteLen   capacity of `bytes`; 0 means "measure only, write nothing".
 *   usedByteLen  out: number of bytes produced (or that would be produced
 *                when maxByteLen is 0).
 *
 * Returns the number of UTF-16 units consumed (1 or 2).
 *
 * Substitutions, in order of precedence:
 *   - values below 0xA0: the value minus 0x80
 *     (NOTE(review): presumably callers only pass values >= 0x80 here; a
 *     smaller value would wrap in the uint8_t cast -- confirm against callers)
 *   - 0xA0..0xFF: delegated to __CFToASCIILatin1Fallback
 *   - surrogates: '?', consuming both units of a well-formed pair
 *   - members of the whitespace set: ' '
 *   - other members of the whitespace-and-newline set: ASCIINewLine
 *   - U+2026 (ellipsis): "..." when at least three bytes fit (or when
 *     measuring); otherwise a single '?'
 *   - decomposable characters: the first code point of the decomposition if
 *     it is below 0x80, otherwise the fallback of that first code point
 *   - anything else: '?'
 */
static CFIndex __CFDefaultToBytesFallbackProc(const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
    CFIndex processCharLen = 1, filledBytesLen = 1;
    uint8_t byte = '?';

    if (*characters < 0xA0) { // 0x80 to 0x9F maps to ASCII C0 range
        byte = (uint8_t)(*characters - 0x80);
    } else if (*characters < 0x100) {
        *usedByteLen = __CFToASCIILatin1Fallback(*characters, bytes, maxByteLen);
        return 1;
    } else if (*characters >= kSurrogateHighStart && *characters <= kSurrogateLowEnd) {
        // Consume two units only for a real pair: a HIGH surrogate (strictly
        // below kSurrogateLowStart) followed by a low surrogate. A leading low
        // surrogate is unpaired and is replaced on its own.
        processCharLen = (numChars > 1 && *characters < kSurrogateLowStart && *(characters + 1) >= kSurrogateLowStart && *(characters + 1) <= kSurrogateLowEnd ? 2 : 1);
    } else if (CFUniCharIsMemberOf(*characters, kCFUniCharWhitespaceCharacterSet)) {
        byte = ' ';
    } else if (CFUniCharIsMemberOf(*characters, kCFUniCharWhitespaceAndNewlineCharacterSet)) {
        // Plain whitespace was handled by the previous branch, so only the
        // remaining members of this set reach here.
        byte = ASCIINewLine;
    } else if (*characters == 0x2026) { // ellipsis
        if (0 == maxByteLen) {
            // Measuring only: report the three-byte expansion.
            filledBytesLen = 3;
        } else if (maxByteLen > 2) {
            memset(bytes, '.', 3);
            *usedByteLen = 3;
            return processCharLen;
        }
        // With room for only one or two bytes, fall through to a single '?'.
    } else if (CFUniCharIsMemberOf(*characters, kCFUniCharDecomposableCharacterSet)) {
        UTF32Char decomposed[MAX_DECOMPOSED_LENGTH];

        (void)CFUniCharDecomposeCharacter(*characters, decomposed, MAX_DECOMPOSED_LENGTH);
        if (*decomposed < 0x80) {
            byte = (uint8_t)(*decomposed);
        } else {
            // Retry with the first code point of the decomposition, narrowed
            // to a single UTF-16 unit.
            UTF16Char theChar = *decomposed;

            return __CFDefaultToBytesFallbackProc(&theChar, 1, bytes, maxByteLen, usedByteLen);
        }
    }

    if (maxByteLen) *bytes = byte;
    *usedByteLen = filledBytesLen;
    return processCharLen;
}
/*
 * Compute how many UTF-16 units decoding `numBytes` bytes of UTF-8 at
 * `source` would produce, without writing any output.
 *
 *   flags     conversion flags; the ones consulted here are
 *             kCFStringEncodingUseHFSPlusCanonical, kCFStringEncodingUseCanonical,
 *             kCFStringEncodingLenientUTF8Conversion and
 *             kCFStringEncodingAllowLossyConversion.
 *   source    UTF-8 input bytes.
 *   numBytes  number of bytes available at `source`.
 *
 * Returns the UTF-16 unit count. Scanning stops early at a truncated
 * sequence, at an illegal sequence that is not being replaced, and -- in
 * strict (non-HFS+) mode -- at an encoded surrogate or a value above
 * kMaximumUTF16, which are the same stop conditions __CFFromUTF8 applies.
 */
static CFIndex __CFFromUTF8Len(uint32_t flags, const uint8_t *source, CFIndex numBytes) {
    uint16_t extraBytesToRead;
    CFIndex theUsedCharLen = 0;
    uint32_t ch;
    bool isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false);
    bool needsToDecompose = (flags & kCFStringEncodingUseCanonical || isHFSPlus ? true : false);
    bool strictUTF8 = (flags & kCFStringEncodingLenientUTF8Conversion ? false : true);
    UTF32Char decomposed[MAX_DECOMPOSED_LENGTH];
    CFIndex decompLength;
    bool isStrict = !isHFSPlus;

    while (numBytes) {
        extraBytesToRead = trailingBytesForUTF8[*source];

        // Account for the lead byte, then make sure every trailing byte is present.
        if (extraBytesToRead > --numBytes) break;
        numBytes -= extraBytesToRead;

        /* Do this check whether lenient or strict */
        // We need to allow 0xA9 (copyright in MacRoman and Unicode) not to break existing apps
        // Will use a flag passed in from upper layers to switch restriction mode for this case in the next release
        if ((extraBytesToRead > 3) || (strictUTF8 && !__CFIsLegalUTF8(source, extraBytesToRead + 1))) {
            if ((*source == 0xA9) || (flags & kCFStringEncodingAllowLossyConversion)) {
                // Skip just the offending lead byte and count one unit for it;
                // the trailing bytes are handed back to be rescanned.
                numBytes += extraBytesToRead;
                ++source;
                ++theUsedCharLen;
                continue;
            } else {
                break;
            }
        }

        ch = 0;
        /*
         * The cases all fall through. See "Note A" below.
         */
        switch (extraBytesToRead) {
            case 3: ch += *source++; ch <<= 6; /* fallthrough */
            case 2: ch += *source++; ch <<= 6; /* fallthrough */
            case 1: ch += *source++; ch <<= 6; /* fallthrough */
            case 0: ch += *source++;
        }
        ch -= offsetsFromUTF8[extraBytesToRead];

        if (ch <= kMaximumUCS2) {
            if (isStrict && (ch >= kSurrogateHighStart && ch <= kSurrogateLowEnd)) {
                break;
            }
            if (needsToDecompose && CFUniCharIsDecomposableCharacter(ch, isHFSPlus)) {
                decompLength = CFUniCharDecomposeCharacter(ch, decomposed, MAX_DECOMPOSED_LENGTH);
                theUsedCharLen += decompLength;
            } else {
                ++theUsedCharLen;
            }
        } else if (ch > kMaximumUTF16) {
            // __CFFromUTF8 stops at an out-of-range value in strict mode, so
            // stop counting here too; otherwise the reported length would
            // exceed what the conversion actually produces.
            if (isStrict) break;
            ++theUsedCharLen;
        } else {
            if (needsToDecompose && CFUniCharIsDecomposableCharacter(ch, isHFSPlus)) {
                decompLength = CFUniCharDecomposeCharacter(ch, decomposed, MAX_DECOMPOSED_LENGTH);
                // Each decomposed code point takes one unit, or two if non-BMP.
                while (--decompLength >= 0) theUsedCharLen += (decomposed[decompLength] < 0x10000 ? 1 : 2);
            } else {
                theUsedCharLen += 2;
            }
        }
    }

    return theUsedCharLen;
}
/*
 * Decode UTF-8 bytes into UTF-16 units.
 *
 *   flags        conversion flags; the ones consulted here are
 *                kCFStringEncodingUseHFSPlusCanonical, kCFStringEncodingUseCanonical,
 *                kCFStringEncodingLenientUTF8Conversion and
 *                kCFStringEncodingAllowLossyConversion.
 *   bytes        UTF-8 input.
 *   numBytes     number of bytes available at `bytes`.
 *   characters   destination for UTF-16 units; only written when maxCharLen
 *                is non-zero.
 *   maxCharLen   capacity of `characters` in UTF-16 units; 0 means
 *                "measure only, write nothing".
 *   usedCharLen  optional out: number of UTF-16 units produced (or counted).
 *
 * Returns the number of input bytes consumed. Conversion stops early at a
 * truncated sequence, at an illegal sequence that is not being replaced, in
 * strict (non-HFS+) mode at an encoded surrogate or a value above
 * kMaximumUTF16, and when the destination is full.
 */
static CFIndex __CFFromUTF8(uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
    const uint8_t *source = bytes;
    uint16_t extraBytesToRead;
    CFIndex theUsedCharLen = 0;
    uint32_t ch;
    bool isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false);
    bool needsToDecompose = (flags & kCFStringEncodingUseCanonical || isHFSPlus ? true : false);
    bool strictUTF8 = (flags & kCFStringEncodingLenientUTF8Conversion ? false : true);
    UTF32Char decomposed[MAX_DECOMPOSED_LENGTH];
    CFIndex decompLength;
    bool isStrict = !isHFSPlus;

    while (numBytes && (!maxCharLen || (theUsedCharLen < maxCharLen))) {
        extraBytesToRead = trailingBytesForUTF8[*source];

        // Account for the lead byte, then make sure every trailing byte is present.
        if (extraBytesToRead > --numBytes) break;
        numBytes -= extraBytesToRead;

        /* Do this check whether lenient or strict */
        // We need to allow 0xA9 (copyright in MacRoman and Unicode) not to break existing apps
        // Will use a flag passed in from upper layers to switch restriction mode for this case in the next release
        if ((extraBytesToRead > 3) || (strictUTF8 && !__CFIsLegalUTF8(source, extraBytesToRead + 1))) {
            if ((*source == 0xA9) || (flags & kCFStringEncodingAllowLossyConversion)) {
                // Replace just the offending lead byte; the trailing bytes are
                // handed back to be rescanned.
                numBytes += extraBytesToRead;
                ++source;
                if (maxCharLen) *(characters++) = (UTF16Char)kReplacementCharacter;
                ++theUsedCharLen;
                continue;
            } else {
                break;
            }
        }

        ch = 0;
        /*
         * The cases all fall through. See "Note A" below.
         */
        switch (extraBytesToRead) {
            case 3: ch += *source++; ch <<= 6; /* fallthrough */
            case 2: ch += *source++; ch <<= 6; /* fallthrough */
            case 1: ch += *source++; ch <<= 6; /* fallthrough */
            case 0: ch += *source++;
        }
        ch -= offsetsFromUTF8[extraBytesToRead];

        if (ch <= kMaximumUCS2) {
            if (isStrict && (ch >= kSurrogateHighStart && ch <= kSurrogateLowEnd)) {
                // Rejected: un-consume the sequence so the return value points at it.
                source -= (extraBytesToRead + 1);
                break;
            }
            if (needsToDecompose && CFUniCharIsDecomposableCharacter(ch, isHFSPlus)) {
                decompLength = CFUniCharDecomposeCharacter(ch, decomposed, MAX_DECOMPOSED_LENGTH);

                if (maxCharLen) {
                    // NOTE(review): on failure `source` stays past this sequence.
                    // Whether it should be rewound depends on what the helper
                    // leaves in `characters`/`theUsedCharLen` when it fails,
                    // which is not visible here -- confirm.
                    if (!CFUniCharFillDestinationBuffer(decomposed, decompLength, (void **)&characters, maxCharLen, &theUsedCharLen, kCFUniCharUTF16Format)) break;
                } else {
                    theUsedCharLen += decompLength;
                }
            } else {
                if (maxCharLen) *(characters++) = (UTF16Char)ch;
                ++theUsedCharLen;
            }
        } else if (ch > kMaximumUTF16) {
            if (isStrict) {
                // Rejected: un-consume the sequence so the return value points at it.
                source -= (extraBytesToRead + 1);
                break;
            }
            if (maxCharLen) *(characters++) = (UTF16Char)kReplacementCharacter;
            ++theUsedCharLen;
        } else {
            if (needsToDecompose && CFUniCharIsDecomposableCharacter(ch, isHFSPlus)) {
                decompLength = CFUniCharDecomposeCharacter(ch, decomposed, MAX_DECOMPOSED_LENGTH);

                if (maxCharLen) {
                    // NOTE(review): same caveat as the BMP decomposition path above.
                    if (!CFUniCharFillDestinationBuffer(decomposed, decompLength, (void **)&characters, maxCharLen, &theUsedCharLen, kCFUniCharUTF16Format)) break;
                } else {
                    // Each decomposed code point takes one unit, or two if non-BMP.
                    while (--decompLength >= 0) theUsedCharLen += (decomposed[decompLength] < 0x10000 ? 1 : 2);
                }
            } else {
                if (maxCharLen) {
                    if ((theUsedCharLen + 2) > maxCharLen) {
                        // No room for the surrogate pair and nothing was written:
                        // un-consume the sequence so the returned byte count does
                        // not claim a character that was never emitted.
                        source -= (extraBytesToRead + 1);
                        break;
                    }
                    ch -= halfBase;
                    *(characters++) = (ch >> halfShift) + kSurrogateHighStart;
                    *(characters++) = (ch & halfMask) + kSurrogateLowStart;
                }
                theUsedCharLen += 2;
            }
        }
    }

    if (usedCharLen) *usedCharLen = theUsedCharLen;

    return source - bytes;
}