Пример #1
0
/**
 * Length counter for strings in Java's modified UTF-8 encoding.
 *
 * Walks the NUL-terminated input and returns how many UTF-16 code units
 * are needed to hold it (the terminator itself is not counted).
 */
extern size_t strlen8to16 (const char* utf8Str)
{
    size_t utf16Units = 0;
    int pendingTrail = 0;   /* continuation bytes the last lead byte still expects */
    const char* cursor;

    for (cursor = utf8Str; *cursor != '\0'; cursor++) {
        const int byte = *cursor;

        if ((byte & 0xc0) != 0x80) {
            /* Anything not of the form 10xxxxxx is a lead byte and always
             * contributes one unit. */
            pendingTrail = UTF8_SEQ_LENGTH(byte) - 1;

            /* A lead byte announcing three continuation bytes contributes a
             * second unit (surrogate pair). */
            utf16Units += (pendingTrail == 3) ? 2 : 1;
            continue;
        }

        /* Continuation byte (10xxxxxx): free while the current sequence
         * still expects it. Surplus ones are counted individually, since the
         * companion copy routine is expected to emit a replacement character
         * for each of them. */
        pendingTrail--;
        if (pendingTrail < 0) {
            utf16Units++;
        }
    }

    return utf16Units;
}
Пример #2
0
	/**
	 * Counts the UTF-8 sequences found in the first `size` bytes of `buf`.
	 *
	 * @param buf  start of the byte buffer; a null pointer yields 0.
	 * @param size number of bytes to examine (converted to int internally).
	 * @return the number of sequences counted; the partial count gathered so
	 *         far if a sequence length of 0 is reported; or -1 when the last
	 *         sequence would extend past `size` bytes.
	 */
	int UString::_utf_string_len(const char* buf, size_t size)
	{
		if (NULL == buf) {
			return 0;
		}

		const int byteLimit = (int)size;
		int consumed = 0;    // bytes covered by the sequences seen so far
		int charCount = 0;   // sequences seen so far

		while (consumed < byteLimit) {
			const int seqLen = UTF8_SEQ_LENGTH(buf);
			if (seqLen == 0) {
				// The length lookup could not classify this byte; stop
				// scanning and report what has been counted up to here.
				break;
			}

			++charCount;
			buf += seqLen;
			consumed += seqLen;
		}

		// Stepping beyond the limit means the final sequence was cut short.
		return (consumed > byteLimit) ? -1 : charCount;
	}
Пример #3
0
/*
 * Retrieve the next UTF-32 character from a UTF-8 string.
 *
 * pUtf8Ptr: address of the read cursor; "*pUtf8Ptr" must point at a
 *           NUL-terminated byte sequence.
 *
 * Returns the decoded value, or:
 *  - 0 when the cursor is already on the terminating '\0' (the cursor is
 *    NOT advanced, so decoding stops at inner \0's);
 *  - UTF16_REPLACEMENT_CHAR for a stray continuation byte (10xxxxxx) in
 *    lead position; that single byte is consumed;
 *  - UTF16_REPLACEMENT_CHAR when a multi-byte sequence is cut short by '\0'
 *    or by a byte that is not a continuation byte; the cursor is left ON
 *    the offending byte so the next call re-examines it.
 *
 * On success "*pUtf8Ptr" is advanced to the start of the next character.
 */
static inline uint32_t getUtf32FromUtf8(const char** pUtf8Ptr)
{
    uint32_t ret;
    int seq_len;
    int i;

    /* Mask for leader byte for lengths 1, 2, 3, and 4 respectively.
     * Declared unsigned: 0xff does not fit in a signed char, which makes
     * the brace initializer a narrowing conversion (ill-formed in C++11 and
     * later, implementation-defined in C). */
    static const unsigned char leaderMask[4] = {0xff, 0x1f, 0x0f, 0x07};

    /* Bytes that start with bits "10" are not leading characters. */
    if (((**pUtf8Ptr) & 0xc0) == 0x80) {
        (*pUtf8Ptr)++;
        return UTF16_REPLACEMENT_CHAR;
    }

    /* note we tolerate invalid leader 11111xxx here */
    seq_len = UTF8_SEQ_LENGTH(**pUtf8Ptr);

    /* Keep only the payload bits of the lead byte. */
    ret = (**pUtf8Ptr) & leaderMask [seq_len - 1];

    /* End of string: report 0 and leave the cursor on the terminator. */
    if (**pUtf8Ptr == '\0') return ret;

    (*pUtf8Ptr)++;
    for (i = 1; i < seq_len ; i++, (*pUtf8Ptr)++) {
        /* Returning from inside the loop skips the increment, so the cursor
         * stays on the byte that broke the sequence. */
        if ((**pUtf8Ptr) == '\0') return UTF16_REPLACEMENT_CHAR;
        if (((**pUtf8Ptr) & 0xc0) != 0x80) return UTF16_REPLACEMENT_CHAR;

        /* Fold this continuation byte into the accumulated value
         * (presumably shift by 6 and OR in the low 6 bits - see the macro). */
        UTF8_SHIFT_AND_MASK(ret, **pUtf8Ptr);
    }

    return ret;
}
Пример #4
0
	/**
	 * Returns the characters in the half-open range [startIndex, endIndex).
	 *
	 * Indices count UTF-8 sequences (characters), not bytes. `endIndex` is
	 * clamped to m_strLen. An empty or inverted range, a range that starts
	 * beyond the available characters, or a buffer whose sequence lengths
	 * cannot be determined all yield a default-constructed UString.
	 *
	 * @param startIndex index of the first character to include.
	 * @param endIndex   index one past the last character to include.
	 * @return the selected characters as a new UString.
	 */
	UString UString::substr(size_t startIndex, size_t endIndex) const
	{
		if (endIndex > m_strLen) {
			endIndex = m_strLen;
		}

		UString sRet;

		// Nothing to select. Checking with >= (rather than subtracting the
		// unsigned indices) also rejects inverted ranges without scanning.
		if (startIndex >= endIndex) {
			return sRet;
		}

		const char* str = (const char *)m_mallocBuffer;
		const size_t lastIndex = endIndex - 1;  // cannot wrap: endIndex > startIndex
		size_t offset = 0;   // byte offset of the character at `index`
		size_t index = 0;    // character position being examined
		size_t start = 0;    // byte offset where the selection begins
		size_t end = 0;      // byte offset one past the end of the selection
		bool haveStart = false;
		bool haveEnd = false;

		while (offset < m_mallocBufferLen) {
			const int len = UTF8_SEQ_LENGTH(str);
			if (len <= 0) {
				// Unclassifiable byte: without this bail-out the cursor would
				// never advance and the loop would spin forever.
				break;
			}

			if (index == startIndex) {
				start = offset;
				haveStart = true;
			}
			if (index == lastIndex) {
				end = offset + len;
				haveEnd = true;
				break;
			}

			offset += len;
			str += len;
			++index;
		}

		if (haveStart && haveEnd) {
			// A truncated trailing sequence may claim more bytes than the
			// buffer holds; never hand init() a range past the buffer end.
			if (end > m_mallocBufferLen) {
				end = m_mallocBufferLen;
			}
			sRet.init( (const char*)(m_mallocBuffer+start), (int)(end-start));
		}

		return sRet;
	}