/**
 * Reads the next code point from the UTF-8 text u8 starting at pos, stores it
 * in c, advances pos past the bytes consumed, and returns the 32-bit value
 * looked up for that code point.
 *
 * One-, two- and three-byte sequences are decoded inline and looked up
 * directly in the trie. Everything else (four-byte leads, truncated or
 * otherwise rejected sequences) is handed to utf8_nextCharSafeBody() and the
 * result is looked up through data->getCE32().
 *
 * This is a hand-inlined combination of U8_NEXT_OR_FFFD() and
 * UTRIE2_U8_NEXT32().
 *
 * @param c Output: the code point read, or U_SENTINEL when pos == length.
 * @return The looked-up 32-bit value, or Collation::FALLBACK_CE32 when
 *         pos == length. The UErrorCode parameter is unused.
 */
uint32_t
UTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode & /*errorCode*/) {
    if(pos == length) {
        // Nothing left to read.
        c = U_SENTINEL;
        return Collation::FALLBACK_CE32;
    }

    c = u8[pos++];
    if(c < 0xc0) {
        // Single byte: ASCII 00..7F; trail bytes 80..BF map to error values.
        return trie->data32[c];
    }

    if(c < 0xe0) {
        // Two-byte lead: exactly one trail byte in 80..BF must follow.
        if(pos != length) {
            const uint8_t trail = static_cast<uint8_t>(u8[pos] - 0x80);
            if(trail <= 0x3f) {
                // U+0080..U+07FF; 00..7F map to error values.
                // The lookup uses the raw lead byte, so do it before c is overwritten.
                const uint32_t ce32 =
                    trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + trail];
                c = ((c & 0x1f) << 6) | trail;
                ++pos;
                return ce32;
            }
        }
    } else if(c <= 0xef) {
        // Three-byte lead: two more bytes must be available. A negative
        // length skips the bounds test (NOTE(review): presumably this means
        // NUL-terminated input -- confirm against the class documentation).
        if((pos + 1) < length || length < 0) {
            const uint8_t trail1 = static_cast<uint8_t>(u8[pos] - 0x80);
            // For lead byte E0 the first trail byte must be at least A0.
            if(trail1 <= 0x3f && (c != 0xe0 || trail1 >= 0x20)) {
                // Only look at the second trail byte once the first one is accepted.
                const uint8_t trail2 = static_cast<uint8_t>(u8[pos + 1] - 0x80);
                if(trail2 <= 0x3f) {
                    // U+0800..U+FFFF; caller maps surrogates to error values.
                    c = (UChar)((c << 12) | (trail1 << 6) | trail2);
                    pos += 2;
                    return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c);
                }
            }
        }
    }

    // Slow path: function call for supplementary code points and error cases.
    // c still holds the lead byte here. Illegal byte sequences yield U+FFFD.
    c = utf8_nextCharSafeBody(u8, &pos, length, c, -3);
    return data->getCE32(c);
}
uint32_t FCDUTF8CollationIterator::handleNextCE32(UChar32 &c, UErrorCode &errorCode) { for(;;) { if(state == CHECK_FWD) { // Combination of UTF8CollationIterator::handleNextCE32() with FCD check fastpath. if(pos == length) { c = U_SENTINEL; return Collation::FALLBACK_CE32; } c = u8[pos++]; if(c < 0xc0) { // ASCII 00..7F; trail bytes 80..BF map to error values. return trie->data32[c]; } uint8_t t1, t2; if(c < 0xe0 && pos != length && (t1 = (u8[pos] - 0x80)) <= 0x3f) { // U+0080..U+07FF; 00..7F map to error values. uint32_t ce32 = trie->data32[trie->index[(UTRIE2_UTF8_2B_INDEX_2_OFFSET - 0xc0) + c] + t1]; c = ((c & 0x1f) << 6) | t1; ++pos; if(CollationFCD::hasTccc(c) && pos != length && nextHasLccc()) { pos -= 2; } else { return ce32; } } else if(c <= 0xef && ((pos + 1) < length || length < 0) && (t1 = (u8[pos] - 0x80)) <= 0x3f && (c != 0xe0 || t1 >= 0x20) && (t2 = (u8[pos + 1] - 0x80)) <= 0x3f ) { // U+0800..U+FFFF; caller maps surrogates to error values. c = (UChar)((c << 12) | (t1 << 6) | t2); pos += 2; if(CollationFCD::hasTccc(c) && (CollationFCD::maybeTibetanCompositeVowel(c) || (pos != length && nextHasLccc()))) { pos -= 3; } else { break; // return CE32(BMP) } } else { // Function call for supplementary code points and error cases. // Illegal byte sequences yield U+FFFD. c = utf8_nextCharSafeBody(u8, &pos, length, c, -3); if(c == 0xfffd) { return Collation::FFFD_CE32; } else { U_ASSERT(c > 0xffff); if(CollationFCD::hasTccc(U16_LEAD(c)) && pos != length && nextHasLccc()) { pos -= 4; } else { return data->getCE32FromSupplementary(c); } } } if(!nextSegment(errorCode)) { c = U_SENTINEL; return Collation::FALLBACK_CE32; } continue; } else if(state == IN_FCD_SEGMENT && pos != limit) { return UTF8CollationIterator::handleNextCE32(c, errorCode); } else if(state == IN_NORMALIZED && pos != normalized.length()) { c = normalized[pos++]; break; } else { switchToForward(); } } return UTRIE2_GET32_FROM_U16_SINGLE_LEAD(trie, c); }
/**
 * Converts a UTF-8 byte string to UTF-16 code units.
 *
 * The source is decoded into dest until either the source or the destination
 * capacity is exhausted. Any remaining source is then decoded only to count
 * the code units it would need, so *pDestLength always receives the full
 * required length even when dest is too small (or NULL with capacity 0).
 *
 * @param dest         Output buffer; may be NULL only when destCapacity is 0.
 * @param destCapacity Number of UChars available in dest; must be >= 0.
 * @param pDestLength  If not NULL, receives the number of UChars required for
 *                     the whole conversion. Not written on failure.
 * @param src          UTF-8 input; must not be NULL.
 * @param srcLength    Number of bytes in src, or -1 if src is NUL-terminated.
 * @param pErrorCode   In/out error code. Nothing is done if it is NULL or
 *                     already indicates a failure. Set to
 *                     U_ILLEGAL_ARGUMENT_ERROR for bad arguments and to
 *                     U_INVALID_CHAR_FOUND when utf8_nextCharSafeBody()
 *                     reports a negative result for a byte sequence.
 * @return dest on success, NULL on any of the failures above.
 */
U_CAPI UChar* U_EXPORT2
u_strFromUTF8(UChar *dest,
              int32_t destCapacity,
              int32_t *pDestLength,
              const char* src,
              int32_t srcLength,
              UErrorCode *pErrorCode){
    const uint8_t *bytes = (const uint8_t *)src;   /* keep the input const */
    UChar *out = dest;
    UChar *outLimit;
    UChar32 cp = 0;
    int32_t srcIndex = 0;
    int32_t reqLength = 0;

    /* args check */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
        return NULL;
    }
    if(src==NULL || srcLength < -1 || destCapacity < 0 || (dest==NULL && destCapacity > 0)){
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    /* Only form the end pointer once dest/destCapacity are known to be sane. */
    outLimit = dest + destCapacity;

    if(srcLength == -1){
        srcLength = (int32_t)uprv_strlen(src);
    }

    /* Phase 1: decode and store while there is room in dest. */
    while(srcIndex < srcLength && out < outLimit){
        cp = bytes[srcIndex++];
        if(cp > 0x7f){
            /* multi-byte sequence: let the library decode and validate it */
            cp = utf8_nextCharSafeBody(bytes, &srcIndex, srcLength, cp, -1);
            if(cp < 0){
                *pErrorCode = U_INVALID_CHAR_FOUND;
                return NULL;
            }
        }
        if(cp <= 0xFFFF){
            *out++ = (UChar)cp;
            continue;
        }
        /* supplementary code point: surrogate pair */
        *out++ = UTF16_LEAD(cp);
        if(out >= outLimit){
            /* the trail surrogate does not fit: count it and stop writing */
            reqLength++;
            break;
        }
        *out++ = UTF16_TRAIL(cp);
    }

    /* Phase 2: do not fill the dest buffer, just count the UChars needed. */
    while(srcIndex < srcLength){
        cp = bytes[srcIndex++];
        if(cp <= 0x7f){
            reqLength++;
        }else{
            cp = utf8_nextCharSafeBody(bytes, &srcIndex, srcLength, cp, -1);
            if(cp < 0){
                *pErrorCode = U_INVALID_CHAR_FOUND;
                return NULL;
            }
            reqLength += UTF_CHAR_LENGTH(cp);
        }
    }

    /* total = units only counted + units actually written */
    reqLength += (int32_t)(out - dest);

    if(pDestLength){
        *pDestLength = reqLength;
    }

    /* Terminate the buffer */
    u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);

    return dest;
}