C++ (Cpp) CFUniCharDecomposeCharacter Examples

Example #1

0

Show file

File: CFStringEncodingConverter.c Project: AlexShiLucky/swift-corelibs-foundation

static CFIndex __CFToCanonicalUnicodeCheapEightBitWrapper(const void *converter, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
    CFIndex processedByteLen = 0;
    CFIndex theUsedCharLen = 0;
    UTF32Char charBuffer[MAX_DECOMPOSED_LENGTH];
    CFIndex usedLen;
    UniChar character;
    bool isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false);

    while ((processedByteLen < numBytes) && (!maxCharLen || (theUsedCharLen < maxCharLen))) {
        if (!((CFStringEncodingCheapEightBitToUnicodeProc)((const _CFEncodingConverter*)converter)->definition->toUnicode)(flags, bytes[processedByteLen], &character)) break;

        if (CFUniCharIsDecomposableCharacter(character, isHFSPlus)) {
            CFIndex idx;

            usedLen = CFUniCharDecomposeCharacter(character, charBuffer, MAX_DECOMPOSED_LENGTH);
            *usedCharLen = theUsedCharLen;

            for (idx = 0;idx < usedLen;idx++) {
                if (charBuffer[idx] > 0xFFFF) { // Non-BMP
                    if (theUsedCharLen + 2 > maxCharLen)  return processedByteLen;
                    theUsedCharLen += 2;
                    if (maxCharLen) {
                        charBuffer[idx] = charBuffer[idx] - 0x10000;
                        *(characters++) = (UniChar)(charBuffer[idx] >> 10) + 0xD800UL;
                        *(characters++) = (UniChar)(charBuffer[idx] & 0x3FF) + 0xDC00UL;
                    }
                } else {
                    if (theUsedCharLen + 1 > maxCharLen)  return processedByteLen;
                    ++theUsedCharLen;
                    *(characters++) = charBuffer[idx];
                }
            }
        } else {
            if (maxCharLen) *(characters++) = character;

Example #2

0

Show file

File: CFStringEncodingConverter.c Project: AlexShiLucky/swift-corelibs-foundation

/*
 * Picks a substitute byte sequence for the UTF-16 unit at *characters.
 *
 * characters   - input UTF-16 units; only the first unit (and, for a
 *                surrogate pair, the second) is examined.
 * numChars     - number of units available at characters.
 * bytes        - output buffer; written only when maxByteLen is non-zero.
 * maxByteLen   - capacity of bytes; 0 means "only report the length".
 * usedByteLen  - receives the number of bytes produced (or that would be
 *                produced when maxByteLen is 0).
 *
 * Returns the number of UTF-16 units consumed (1, or 2 for a surrogate pair).
 * The default substitute is '?'.
 */
static CFIndex __CFDefaultToBytesFallbackProc(const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
    CFIndex processCharLen = 1, filledBytesLen = 1;
    uint8_t byte = '?';

    if (*characters < 0xA0) { // 0x80 to 0x9F maps to ASCII C0 range
        byte = (uint8_t)(*characters - 0x80);
    } else if (*characters < 0x100) {
        *usedByteLen = __CFToASCIILatin1Fallback(*characters, bytes, maxByteLen);
        return 1;
    } else if (*characters >= kSurrogateHighStart && *characters <= kSurrogateLowEnd) {
        // Consume two units only for a well-formed pair: a leading unit that is
        // strictly below kSurrogateLowStart followed by a unit in the low range.
        // (A leading unit equal to kSurrogateLowStart is itself a low surrogate
        // and must not be treated as the first half of a pair.)
        processCharLen = (numChars > 1 && *characters < kSurrogateLowStart && *(characters + 1) >= kSurrogateLowStart && *(characters + 1) <= kSurrogateLowEnd ? 2 : 1);
    } else if (CFUniCharIsMemberOf(*characters, kCFUniCharWhitespaceCharacterSet)) {
        byte = ' ';
    } else if (CFUniCharIsMemberOf(*characters, kCFUniCharWhitespaceAndNewlineCharacterSet)) {
        // Not in the plain whitespace set (checked above) but in the
        // whitespace-and-newline set.
        byte = ASCIINewLine;
    } else if (*characters == 0x2026) { // ellipsis
        if (0 == maxByteLen) {
            // Length-only query: report the three dots without writing.
            filledBytesLen = 3;
        } else if (maxByteLen > 2) {
            memset(bytes, '.', 3);
            *usedByteLen = 3;
            return processCharLen;
        }
        // With room for only 1 or 2 bytes, fall through and emit a single '?'.
    } else if (CFUniCharIsMemberOf(*characters, kCFUniCharDecomposableCharacterSet)) {
        UTF32Char decomposed[MAX_DECOMPOSED_LENGTH];

        // Only the first element of the decomposition is used as the substitute.
        (void)CFUniCharDecomposeCharacter(*characters, decomposed, MAX_DECOMPOSED_LENGTH);
        if (*decomposed < 0x80) {
            byte = (uint8_t)(*decomposed);
        } else {
            // Retry the fallback on the first decomposed element (narrowed to 16 bits).
            UTF16Char theChar = *decomposed;

            return __CFDefaultToBytesFallbackProc(&theChar, 1, bytes, maxByteLen, usedByteLen);
        }
    }
    
    if (maxByteLen) *bytes = byte;
    *usedByteLen = filledBytesLen;
    return processCharLen;
}

Example #3

0

Show file

File: CFBuiltinConverters.c Project: reporter123/corefoundation

/*
 * Counts how many UTF-16 units the UTF-8 input at source (numBytes long)
 * accounts for under the given conversion flags, without producing output.
 * Counting stops early at a truncated sequence, at an illegal sequence that
 * is not tolerated, or at an encoded surrogate when HFS+ canonical form is
 * not requested. Returns the number of units counted up to that point.
 */
static CFIndex __CFFromUTF8Len(uint32_t flags, const uint8_t *source, CFIndex numBytes) {
    const bool hfsPlusForm = ((flags & kCFStringEncodingUseHFSPlusCanonical) ? true : false);
    const bool decomposeOutput = (((flags & kCFStringEncodingUseCanonical) || hfsPlusForm) ? true : false);
    const bool enforceLegalUTF8 = ((flags & kCFStringEncodingLenientUTF8Conversion) ? false : true);
    const bool rejectSurrogates = !hfsPlusForm;
    UTF32Char pieces[MAX_DECOMPOSED_LENGTH];
    CFIndex unitCount = 0;

    while (numBytes) {
        uint16_t trailing = trailingBytesForUTF8[*source];
        uint16_t step;
        uint32_t scalar = 0;
        CFIndex pieceCount;
        CFIndex k;

        // Account for the lead byte, then make sure the trailing bytes exist.
        --numBytes;
        if (trailing > numBytes) break;
        numBytes -= trailing;

        /* This check applies whether lenient or strict */
        // 0xA9 (copyright in MacRoman and Unicode) is tolerated so existing apps keep working
        if ((trailing > 3) || (enforceLegalUTF8 && !__CFIsLegalUTF8(source, trailing + 1))) {
            if ((*source != 0xA9) && !(flags & kCFStringEncodingAllowLossyConversion)) break;

            // Skip just the offending lead byte and count one unit for it.
            numBytes += trailing;
            ++source;
            ++unitCount;
            continue;
        }

        // Accumulate the lead byte and each trailing byte, six bits at a time,
        // then strip the encoding marker bits in one subtraction.
        for (step = 0; step < trailing; step++) {
            scalar += *source++;
            scalar <<= 6;
        }
        scalar += *source++;
        scalar -= offsetsFromUTF8[trailing];

        if (scalar <= kMaximumUCS2) {
            if (rejectSurrogates && (scalar >= kSurrogateHighStart) && (scalar <= kSurrogateLowEnd)) break;

            if (decomposeOutput && CFUniCharIsDecomposableCharacter(scalar, hfsPlusForm)) {
                // Each decomposed element is counted as one unit here.
                pieceCount = CFUniCharDecomposeCharacter(scalar, pieces, MAX_DECOMPOSED_LENGTH);
                unitCount += pieceCount;
            } else {
                ++unitCount;
            }
            continue;
        }

        if (scalar > kMaximumUTF16) {
            // Beyond the UTF-16 range: counted as a single unit.
            ++unitCount;
            continue;
        }

        if (decomposeOutput && CFUniCharIsDecomposableCharacter(scalar, hfsPlusForm)) {
            // Elements at or above 0x10000 need a surrogate pair (two units).
            pieceCount = CFUniCharDecomposeCharacter(scalar, pieces, MAX_DECOMPOSED_LENGTH);
            for (k = 0; k < pieceCount; k++) {
                unitCount += (pieces[k] < 0x10000 ? 1 : 2);
            }
        } else {
            unitCount += 2;
        }
    }

    return unitCount;
}

Example #4

0

Show file

File: CFBuiltinConverters.c Project: reporter123/corefoundation

/*
 * Converts UTF-8 bytes to UTF-16 units.
 *
 * flags        - conversion flags (HFS+/canonical decomposition, lenient
 *                UTF-8, lossy conversion) as tested below.
 * bytes        - UTF-8 input, numBytes long.
 * characters   - output buffer; written only when maxCharLen is non-zero.
 * maxCharLen   - capacity of characters in UTF-16 units; 0 means "count
 *                only", in which case nothing is stored.
 * usedCharLen  - if non-NULL, receives the number of units produced
 *                (or counted).
 *
 * Returns the number of input bytes consumed. Whenever conversion stops
 * early -- invalid input or not enough room for the next character -- the
 * bytes of the character that could not be emitted are NOT counted, so a
 * caller can resume from the returned offset without losing that character.
 */
static CFIndex __CFFromUTF8(uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
    const uint8_t *source = bytes;
    uint16_t extraBytesToRead;
    CFIndex theUsedCharLen = 0;
    uint32_t ch;
    bool isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false);
    bool needsToDecompose = (flags & kCFStringEncodingUseCanonical || isHFSPlus ? true : false);
    bool strictUTF8 = (flags & kCFStringEncodingLenientUTF8Conversion ? false : true);
    UTF32Char decomposed[MAX_DECOMPOSED_LENGTH];
    CFIndex decompLength;
    bool isStrict = !isHFSPlus;

    while (numBytes && (!maxCharLen || (theUsedCharLen < maxCharLen))) {
        extraBytesToRead = trailingBytesForUTF8[*source];

        // Stop on a sequence truncated by the end of the input.
        if (extraBytesToRead > --numBytes) break;
        numBytes -= extraBytesToRead;

        /* Do this check whether lenient or strict */
        // We need to allow 0xA9 (copyright in MacRoman and Unicode) not to break existing apps
        // Will use a flag passed in from upper layers to switch restriction mode for this case in the next release
        if ((extraBytesToRead > 3) || (strictUTF8 && !__CFIsLegalUTF8(source, extraBytesToRead + 1))) {
            if ((*source == 0xA9) || (flags & kCFStringEncodingAllowLossyConversion)) {
                // Skip only the offending lead byte and substitute the replacement character.
                numBytes += extraBytesToRead;
                ++source;
                if (maxCharLen) *(characters++) = (UTF16Char)kReplacementCharacter;
                ++theUsedCharLen;
                continue;
            } else {
                break;
            }
        }

        ch = 0;
        /*
         * The cases all fall through. See "Note A" below.
         */
        switch (extraBytesToRead) {
            case 3:	ch += *source++; ch <<= 6;
            case 2:	ch += *source++; ch <<= 6;
            case 1:	ch += *source++; ch <<= 6;
            case 0:	ch += *source++;
        }
        ch -= offsetsFromUTF8[extraBytesToRead];

        if (ch <= kMaximumUCS2) {
            if (isStrict && (ch >= kSurrogateHighStart && ch <= kSurrogateLowEnd)) {
                // Encoded surrogate: un-consume it and stop.
                source -= (extraBytesToRead + 1);
                break;
            }
            if (needsToDecompose && CFUniCharIsDecomposableCharacter(ch, isHFSPlus)) {
                decompLength = CFUniCharDecomposeCharacter(ch, decomposed, MAX_DECOMPOSED_LENGTH);

                if (maxCharLen) {
                    CFIndex usedBeforeFill = theUsedCharLen;

                    if (!CFUniCharFillDestinationBuffer(decomposed, decompLength, (void **)&characters, maxCharLen, &theUsedCharLen, kCFUniCharUTF16Format)) {
                        // The decomposition did not fit: report neither its
                        // output nor its input bytes as processed.
                        theUsedCharLen = usedBeforeFill;
                        source -= (extraBytesToRead + 1);
                        break;
                    }
                } else {
                    theUsedCharLen += decompLength;
                }
            } else {
                if (maxCharLen) *(characters++) = (UTF16Char)ch;
                ++theUsedCharLen;
            }
        } else if (ch > kMaximumUTF16) {
            if (isStrict) {
                source -= (extraBytesToRead + 1);
                break;
            }
            if (maxCharLen) *(characters++) = (UTF16Char)kReplacementCharacter;
            ++theUsedCharLen;
        } else {
            if (needsToDecompose && CFUniCharIsDecomposableCharacter(ch, isHFSPlus)) {
                decompLength = CFUniCharDecomposeCharacter(ch, decomposed, MAX_DECOMPOSED_LENGTH);

                if (maxCharLen) {
                    CFIndex usedBeforeFill = theUsedCharLen;

                    if (!CFUniCharFillDestinationBuffer(decomposed, decompLength, (void **)&characters, maxCharLen, &theUsedCharLen, kCFUniCharUTF16Format)) {
                        // Same as above: leave the character unconsumed.
                        theUsedCharLen = usedBeforeFill;
                        source -= (extraBytesToRead + 1);
                        break;
                    }
                } else {
                    // Count-only: elements at or above 0x10000 take two units.
                    while (--decompLength >= 0) theUsedCharLen += (decomposed[decompLength] < 0x10000 ? 1 : 2);
                }
            } else {
                if (maxCharLen) {
                    if ((theUsedCharLen + 2) > maxCharLen) {
                        // No room for both halves of the surrogate pair:
                        // un-consume the character so it is not silently dropped.
                        source -= (extraBytesToRead + 1);
                        break;
                    }
                    ch -= halfBase;
                    *(characters++) = (ch >> halfShift) + kSurrogateHighStart;
                    *(characters++) = (ch & halfMask) + kSurrogateLowStart;
                }
                theUsedCharLen += 2;
            }
        }
    }

    if (usedCharLen) *usedCharLen = theUsedCharLen;

    return source - bytes;
}