/*
 * Tries to fold a base character and the combining marks that follow it into
 * a single precomposed UTF-16 code unit.
 *
 * character - UTF-16 code units; the first one is treated as the base.
 * numChars  - number of code units readable from `character`.
 * usedChars - optional out-parameter. Receives the number of code units
 *             accepted (the base plus every mark that was composed into it),
 *             or 0 when numChars <= 0.
 *
 * Returns:
 *   - the composed character when at least one following code unit was
 *     combined into the base;
 *   - the first code unit unchanged when it is a surrogate (*usedChars == 1);
 *   - 0xFFFD otherwise (nothing to read, or nothing could be composed).
 */
UniChar CFStringEncodingPrecomposeLatinCharacter(const UniChar *character, CFIndex numChars, CFIndex *usedChars) {
    UTF32Char ch, nextCh, composedChar;
    CFIndex usedCharLen = 1;

    if (numChars <= 0) {
        // Always define the out-parameter so callers never read an
        // uninitialised length on empty input.
        if (usedChars) (*usedChars) = 0;
        return 0xFFFD;
    }

    ch = *(character++);

    // Surrogates are never composed; hand the code unit back untouched.
    if (CFUniCharIsSurrogateHighCharacter(ch) || CFUniCharIsSurrogateLowCharacter(ch)) {
        if (usedChars) (*usedChars) = usedCharLen;
        return ch;
    }

    while (usedCharLen < numChars) {
        nextCh = *(character++);

        if (CFUniCharIsSurrogateHighCharacter(nextCh) || CFUniCharIsSurrogateLowCharacter(nextCh)) break;

        // Only non-base (combining) characters can be folded into the base.
        if (!CFUniCharIsMemberOf(nextCh, kCFUniCharNonBaseCharacterSet)) break;

        composedChar = CFUniCharPrecomposeCharacter(ch, nextCh);
        if (composedChar == 0xFFFD) break; // no composition for this pair
        if (composedChar > 0xFFFF) break;  // result would not fit in a single UniChar

        ch = composedChar;
        ++usedCharLen;
    }

    if (usedChars) (*usedChars) = usedCharLen;
    if (usedCharLen > 1) return ch;

    return 0xFFFD;
}
/*
 Productions from the XML specification that this scanner approximates:

   [4]  NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | CombiningChar | Extender
   [5]  Name     ::= (Letter | '_' | ':') (NameChar)*
   [7]  Nmtoken  ::= (NameChar)+
   [84] Letter   ::= BaseChar | Ideographic

 The implementation is not an exact match: it relies on the Unicode character
 sets for classification. The XML productions are derived from those sets but
 differ slightly, to steer clear of areas where the Unicode standard was still
 being resolved. Even so, classifying with the Unicode sets is expected to be
 more correct than what most parsers do.

   Letter        == kCFUniCharLetterCharacterSet
   Digit         == kCFUniCharDecimalDigitCharacterSet
   CombiningChar == kCFUniCharNonBaseCharacterSet
   Extender      -- complex, and not represented by a uniform character set
                    (no Extender test is performed below).

 Scans a Name (isNMToken == false) or an Nmtoken (isNMToken == true) from
 `stream`. On success, and when `str` is non-NULL, *str is set to the entry of
 stream->nameSet equal to the scanned text, inserting a copy first if the set
 does not contain it yet. The freshly created copy is released after being
 added to the set, so no extra reference is handed to the caller. On failure
 the stream is restored to the mark taken on entry. Returns whether a name
 was scanned.
*/
CF_PRIVATE Boolean _inputStreamScanXMLName(_CFXMLInputStream *stream, Boolean isNMToken, CFStringRef *str) {
    UniChar ch;
    Boolean success = true;

    stream->parserMark = dropMark(stream);

    if (!isNMToken) {
        // A Name is an Nmtoken with a stricter rule for its first character:
        // fetch it without consuming (third argument false), validate it, and
        // only then consume it.
        Boolean validStart = getCharacter(stream, &ch, false)
            && (CFUniCharIsMemberOf(ch, kCFUniCharLetterCharacterSet) || ch == '_' || ch == ':');

        if (validStart) {
            getCharacter(stream, &ch, true);
        } else {
            success = false;
        }
    }

    if (success) {
        // Consume characters for as long as they qualify as NameChar.
        while (getCharacter(stream, &ch, true)) {
            if (CFUniCharIsMemberOf(ch, kCFUniCharLetterCharacterSet)
                || CFUniCharIsMemberOf(ch, kCFUniCharDecimalDigitCharacterSet)
                || ch == '.' || ch == '-' || ch == '_' || ch == ':'
                || CFUniCharIsMemberOf(ch, kCFUniCharNonBaseCharacterSet)) {
                continue;
            }
            // First character that is not part of the name: give it back and stop.
            _inputStreamReturnCharacter(stream, ch);
            break;
        }

        // At least one character must have been consumed past the mark.
        if (NULL == stream->currentChar || stream->currentChar == stream->parserMark) {
            success = false;
        }
    }

    if (!success) {
        restoreToMark(stream, stream->parserMark);
    } else if (str) {
        const CFIndex nameLength = stream->currentChar - stream->parserMark;

        // Lazily create the uniquing set and the scratch string used for lookups.
        if (!stream->nameSet) {
            stream->nameSet = CFSetCreateMutable(stream->allocator, 0, &kCFTypeSetCallBacks);
            stream->tempString = CFStringCreateMutableWithExternalCharactersNoCopy(stream->allocator, NULL, 0, 0, kCFAllocatorNull);
        }

        // Point the scratch string at the scanned characters (no copy) and
        // look the name up; only allocate a real string on a miss.
        CFStringSetExternalCharactersNoCopy(stream->tempString, stream->parserMark, nameLength, nameLength);
        if (!CFSetGetValueIfPresent(stream->nameSet, stream->tempString, (const void **)str)) {
            *str = (CFStringRef)CFStringCreateCopy(stream->allocator, stream->tempString);
            CFSetAddValue(stream->nameSet, *str);
            CFRelease(*str);
        }
    }

    stream->parserMark = NULL;
    return success;
}
/*
 * Default fallback that produces substitute byte(s) for the character at
 * `characters`.
 *
 * characters  - input UTF-16 code units; the first one is converted, and the
 *               second is inspected only to detect a surrogate pair.
 * numChars    - number of code units available at `characters`.
 * bytes       - output buffer; not written when maxByteLen is 0 (the Latin-1
 *               branch delegates that decision to __CFToASCIILatin1Fallback).
 * maxByteLen  - capacity of `bytes`; 0 means only the length is reported.
 * usedByteLen - out-parameter, dereferenced unconditionally (must not be
 *               NULL); receives the number of bytes produced/required.
 *
 * Returns the number of input code units consumed: 2 for a well-formed
 * surrogate pair, otherwise 1.
 *
 * Substitutions, in order of precedence:
 *   - below 0xA0:            character - 0x80
 *   - 0xA0..0xFF:            whatever __CFToASCIILatin1Fallback produces
 *   - surrogates:            '?'
 *   - whitespace:            ' '
 *   - other newline-set:     ASCIINewLine
 *   - U+2026 (ellipsis):     "..." when it fits (or when only measuring)
 *   - decomposable:          first decomposed character if below 0x80,
 *                            otherwise the fallback of that character
 *   - anything else:         '?'
 */
static CFIndex __CFDefaultToBytesFallbackProc(const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
    CFIndex processCharLen = 1, filledBytesLen = 1;
    uint8_t byte = '?';

    if (*characters < 0xA0) { // 0x80 to 0x9F maps to ASCII C0 range
        // NOTE(review): a value below 0x80 would wrap in this subtraction;
        // presumably callers never pass plain ASCII here -- confirm.
        byte = (uint8_t)(*characters - 0x80);
    } else if (*characters < 0x100) {
        *usedByteLen = __CFToASCIILatin1Fallback(*characters, bytes, maxByteLen);
        return 1;
    } else if (*characters >= kSurrogateHighStart && *characters <= kSurrogateLowEnd) {
        // A pair is a HIGH surrogate followed by a LOW surrogate. The lead
        // unit must therefore be strictly below kSurrogateLowStart: that
        // value is itself the first low surrogate (it is the lower bound of
        // the trail-unit test), so accepting it would swallow two unpaired
        // low surrogates as if they were one character.
        processCharLen = (numChars > 1 && *characters < kSurrogateLowStart && *(characters + 1) >= kSurrogateLowStart && *(characters + 1) <= kSurrogateLowEnd ? 2 : 1);
    } else if (CFUniCharIsMemberOf(*characters, kCFUniCharWhitespaceCharacterSet)) {
        byte = ' ';
    } else if (CFUniCharIsMemberOf(*characters, kCFUniCharWhitespaceAndNewlineCharacterSet)) {
        // Whitespace proper was handled above, so what remains is a newline.
        byte = ASCIINewLine;
    } else if (*characters == 0x2026) { // ellipsis
        if (0 == maxByteLen) {
            // Measuring only: report the three-dot form.
            filledBytesLen = 3;
        } else if (maxByteLen > 2) {
            memset(bytes, '.', 3);
            *usedByteLen = 3;
            return processCharLen;
        }
        // With room for only 1 or 2 bytes, fall through and emit a single '?'.
    } else if (CFUniCharIsMemberOf(*characters, kCFUniCharDecomposableCharacterSet)) {
        UTF32Char decomposed[MAX_DECOMPOSED_LENGTH];

        (void)CFUniCharDecomposeCharacter(*characters, decomposed, MAX_DECOMPOSED_LENGTH);
        if (*decomposed < 0x80) {
            // The first decomposed character is plain ASCII: use it directly.
            byte = (uint8_t)(*decomposed);
        } else {
            // Otherwise retry the fallback on the first decomposed character alone.
            UTF16Char theChar = *decomposed;

            return __CFDefaultToBytesFallbackProc(&theChar, 1, bytes, maxByteLen, usedByteLen);
        }
    }

    if (maxByteLen) *bytes = byte;
    *usedByteLen = filledBytesLen;
    return processCharLen;
}