UniChar CFStringEncodingPrecomposeLatinCharacter(const UniChar *character, CFIndex numChars, CFIndex *usedChars) {
    if (numChars > 0) {
        UTF32Char ch = *(character++), nextCh, composedChar;
        CFIndex usedCharLen = 1;

        if (CFUniCharIsSurrogateHighCharacter(ch) || CFUniCharIsSurrogateLowCharacter(ch)) {
            if (usedChars) (*usedChars) = usedCharLen;
            return ch;
        }

        while (usedCharLen < numChars) {
            nextCh = *(character++);

            if (CFUniCharIsSurrogateHighCharacter(nextCh) || CFUniCharIsSurrogateLowCharacter(nextCh)) break;

            if (CFUniCharIsMemberOf(nextCh, kCFUniCharNonBaseCharacterSet) && ((composedChar = CFUniCharPrecomposeCharacter(ch, nextCh)) != 0xFFFD)) {
                if (composedChar > 0xFFFF) { // Non-base
                    break;
                } else {
                    ch = composedChar;
                }
            } else {
                break;
            }
            ++usedCharLen;
        }
        if (usedChars) (*usedChars) = usedCharLen;
        if (usedCharLen > 1) return ch;
    }
    return 0xFFFD;
}
Esempio n. 2
0
/*
 [4]  NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | CombiningChar | Extender
 [5]  Name ::= (Letter | '_' | ':') (NameChar)*
 [7]  Nmtoken ::= (NameChar)+
 [84] Letter ::= BaseChar | Ideographic

 We don't do this quite right; we rely on the Unicode charsets to do this analysis.  While
 the productions in the XML spec are based on the Unicode character sets, the definitions
 differ slightly to avoid those areas where the Unicode standard is still being resolved.
 At any rate, I'd lay money that using the Unicode charsets, we will be more correct than
 the vast majority of parsers out there.

 Letter == kCFUniCharLetterCharacterSet
 Digit == kCFUniCharDecimalDigitCharacterSet
 CombiningChar == kCFUniCharNonBaseCharacterSet
 Extender - complex, and not represented by a uniform character set.
 */
CF_PRIVATE Boolean _inputStreamScanXMLName(_CFXMLInputStream *stream, Boolean isNMToken, CFStringRef *str) {
    UniChar ch;
    Boolean success = true;
    stream->parserMark = dropMark(stream);
    if (!isNMToken) {
        // Only difference between an NMToken and a Name is Names have a stricter condition on the first character
        if (!getCharacter(stream, &ch, false)) {
            success = false;
        } else if (!CFUniCharIsMemberOf(ch, kCFUniCharLetterCharacterSet) && ch != '_' && ch != ':') {
            success = false;
        } else {
            getCharacter(stream, &ch, true);
        }
    }
    if (success) {
        while (getCharacter(stream, &ch, true)) {
            if (!CFUniCharIsMemberOf(ch, kCFUniCharLetterCharacterSet) && !CFUniCharIsMemberOf(ch, kCFUniCharDecimalDigitCharacterSet)  && ch != '.' && ch != '-' && ch != '_' && ch != ':' && !CFUniCharIsMemberOf(ch, kCFUniCharNonBaseCharacterSet)) {
                _inputStreamReturnCharacter(stream, ch);
                break;
            }
        }
        if (NULL == stream->currentChar || stream->currentChar == stream->parserMark) {
            success = false; // Must have processed at least one character
        }
    }
    if (success) {
        if (str) {
            if (!stream->nameSet) {
                stream->nameSet = CFSetCreateMutable(stream->allocator, 0, &kCFTypeSetCallBacks);
                stream->tempString = CFStringCreateMutableWithExternalCharactersNoCopy(stream->allocator, NULL, 0, 0, kCFAllocatorNull);
            }
            CFStringSetExternalCharactersNoCopy(stream->tempString, stream->parserMark, stream->currentChar-stream->parserMark, stream->currentChar-stream->parserMark);
            if (!CFSetGetValueIfPresent(stream->nameSet, stream->tempString, (const void **)str)) {
                *str = (CFStringRef)CFStringCreateCopy(stream->allocator, stream->tempString);
                CFSetAddValue(stream->nameSet, *str);
                CFRelease(*str);
            }
        }
    } else {
        restoreToMark(stream, stream->parserMark);
    }
    stream->parserMark = NULL;
    return success;
}
static CFIndex __CFDefaultToBytesFallbackProc(const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
    CFIndex processCharLen = 1, filledBytesLen = 1;
    uint8_t byte = '?';

    if (*characters < 0xA0) { // 0x80 to 0x9F maps to ASCII C0 range
        byte = (uint8_t)(*characters - 0x80);
    } else if (*characters < 0x100) {
        *usedByteLen = __CFToASCIILatin1Fallback(*characters, bytes, maxByteLen);
        return 1;
    } else if (*characters >= kSurrogateHighStart && *characters <= kSurrogateLowEnd) {
        processCharLen = (numChars > 1 && *characters <= kSurrogateLowStart && *(characters + 1) >= kSurrogateLowStart && *(characters + 1) <= kSurrogateLowEnd ? 2 : 1);
    } else if (CFUniCharIsMemberOf(*characters, kCFUniCharWhitespaceCharacterSet)) {
        byte = ' ';
    } else if (CFUniCharIsMemberOf(*characters, kCFUniCharWhitespaceAndNewlineCharacterSet)) {
        byte = ASCIINewLine;
    } else if (*characters == 0x2026) { // ellipsis
        if (0 == maxByteLen) {
            filledBytesLen = 3;
        } else if (maxByteLen > 2) {
            memset(bytes, '.', 3);
            *usedByteLen = 3;
            return processCharLen;
        }
    } else if (CFUniCharIsMemberOf(*characters, kCFUniCharDecomposableCharacterSet)) {
        UTF32Char decomposed[MAX_DECOMPOSED_LENGTH];

        (void)CFUniCharDecomposeCharacter(*characters, decomposed, MAX_DECOMPOSED_LENGTH);
        if (*decomposed < 0x80) {
            byte = (uint8_t)(*decomposed);
        } else {
            UTF16Char theChar = *decomposed;

            return __CFDefaultToBytesFallbackProc(&theChar, 1, bytes, maxByteLen, usedByteLen);
        }
    }
    
    if (maxByteLen) *bytes = byte;
    *usedByteLen = filledBytesLen;
    return processCharLen;
}