/**
 * Length, in UTF-16 code units, of a NUL-terminated string encoded with
 * Java's modified UTF-8 (the "strlen" of the would-be UTF-16 conversion).
 *
 * Counting rules:
 *  - every byte that is not a continuation byte (10xxxxxx) starts a
 *    character and contributes one unit;
 *  - a lead byte whose sequence length is four contributes a second unit,
 *    because the character becomes a surrogate pair;
 *  - continuation bytes contribute nothing, unless more of them appear
 *    than the preceding lead byte announced, in which case every surplus
 *    byte contributes one unit (the original note says strcpy8to16 emits
 *    a UTF16_REPLACEMENT_CHAR for each of those).
 *
 * @param utf8Str NUL-terminated input; must not be null.
 * @return number of UTF-16 code units needed to represent the string.
 */
extern size_t strlen8to16 (const char* utf8Str)
{
    size_t utf16Units = 0;
    /* Continuation bytes still owed by the most recent lead byte. */
    int pendingContinuations = 0;

    for (const char* cursor = utf8Str; *cursor != '\0'; ++cursor) {
        const int byte = *cursor;
        const bool isContinuation = ((byte & 0xc0) == 0x80);

        if (!isContinuation) {
            /* Lead byte (0xxxxxxx or 11xxxxxx): always one unit. */
            ++utf16Units;
            pendingContinuations = UTF8_SEQ_LENGTH(byte) - 1;

            /* Three continuations announced => surrogate pair => one more. */
            if (pendingContinuations == 3) {
                ++utf16Units;
            }
            continue;
        }

        /* Continuation byte: only the unexpected ones are counted. */
        --pendingContinuations;
        if (pendingContinuations < 0) {
            ++utf16Units;
        }
    }

    return utf16Units;
}
/**
 * Count the UTF-8 sequences stored in the first `size` bytes of `buf`.
 *
 * The buffer is walked sequence by sequence, using the length that
 * UTF8_SEQ_LENGTH reports for the current position.
 *
 * @param buf  start of the UTF-8 data; a NULL pointer yields 0.
 * @param size number of bytes to examine (narrowed to int for the walk).
 * @return the number of sequences walked; -1 when the last sequence walked
 *         extends beyond `size` bytes. Walking stops early, returning the
 *         count so far, as soon as a sequence length of 0 is reported.
 */
int UString::_utf_string_len(const char* buf, size_t size)
{
    if (buf == NULL) {
        return 0;
    }

    const int byteLimit = static_cast<int>(size);
    int bytesConsumed = 0;
    int sequenceCount = 0;

    for (const char* cursor = buf; bytesConsumed < byteLimit; ) {
        const int seqLen = UTF8_SEQ_LENGTH(cursor);
        if (seqLen == 0) {
            // Unrecognised byte: give up here, no resynchronisation attempted.
            break;
        }
        cursor += seqLen;
        bytesConsumed += seqLen;
        ++sequenceCount;
    }

    // Overshooting the limit means the final sequence was cut off by `size`.
    return (bytesConsumed > byteLimit) ? -1 : sequenceCount;
}
/* * Retrieve the next UTF-32 character from a UTF-8 string. * * Stops at inner \0's * * Returns UTF16_REPLACEMENT_CHAR if an invalid sequence is encountered * * Advances "*pUtf8Ptr" to the start of the next character. */ static inline uint32_t getUtf32FromUtf8(const char** pUtf8Ptr) { uint32_t ret; int seq_len; int i; /* Mask for leader byte for lengths 1, 2, 3, and 4 respectively*/ static const char leaderMask[4] = {0xff, 0x1f, 0x0f, 0x07}; /* Bytes that start with bits "10" are not leading characters. */ if (((**pUtf8Ptr) & 0xc0) == 0x80) { (*pUtf8Ptr)++; return UTF16_REPLACEMENT_CHAR; } /* note we tolerate invalid leader 11111xxx here */ seq_len = UTF8_SEQ_LENGTH(**pUtf8Ptr); ret = (**pUtf8Ptr) & leaderMask [seq_len - 1]; if (**pUtf8Ptr == '\0') return ret; (*pUtf8Ptr)++; for (i = 1; i < seq_len ; i++, (*pUtf8Ptr)++) { if ((**pUtf8Ptr) == '\0') return UTF16_REPLACEMENT_CHAR; if (((**pUtf8Ptr) & 0xc0) != 0x80) return UTF16_REPLACEMENT_CHAR; UTF8_SHIFT_AND_MASK(ret, **pUtf8Ptr); } return ret; }
/**
 * Extract the characters in [startIndex, endIndex) as a new UString.
 *
 * Indices count UTF-8 sequences (as measured by UTF8_SEQ_LENGTH), not
 * bytes. `endIndex` is clamped to m_strLen. An empty or inverted range
 * (startIndex >= endIndex), or a range whose bounds cannot both be located
 * inside the buffer, yields a default-constructed UString.
 *
 * @param startIndex index of the first character to include.
 * @param endIndex   index one past the last character to include.
 * @return the selected slice, built with init() from the matching bytes.
 */
UString UString::substr(size_t startIndex, size_t endIndex) const
{
    UString sRet;

    if (endIndex > m_strLen) {
        endIndex = m_strLen;
    }

    // Nothing to extract. Rejecting inverted ranges up front also keeps
    // "endIndex - 1" below from wrapping around when endIndex is 0.
    if (startIndex >= endIndex) {
        return sRet;
    }

    const size_t bufLen = m_mallocBufferLen;
    const size_t lastIndex = endIndex - 1;   // index of the final character wanted
    const char* str = (const char *)m_mallocBuffer;

    size_t n = 0;        // byte offset of the current character
    size_t index = 0;    // character index of the current character
    size_t start = 0;    // byte offset where the slice begins
    size_t end = 0;      // byte offset one past the slice
    bool haveStart = false;
    bool haveEnd = false;

    while (n < bufLen) {
        int len = UTF8_SEQ_LENGTH(str);

        if (index == startIndex) {
            start = n;
            haveStart = true;
        }
        if (index == lastIndex) {
            end = n + len;
            // A truncated trailing sequence must not push the slice
            // beyond the bytes that are actually stored.
            if (end > bufLen) {
                end = bufLen;
            }
            haveEnd = true;
            break;
        }

        n += len;
        str += len;
        index++;
    }

    if (haveStart && haveEnd) {
        // Length is passed as int, matching the argument type the
        // original call handed to init().
        sRet.init((const char*)(m_mallocBuffer + start), (int)(end - start));
    }

    return sRet;
}