//-------------------------------------------------------------------------------------------------- ssize_t le_utf8_NumChars ( const char* string ///< [IN] Pointer to the string. ) { uint_fast8_t i; int_fast8_t numBytes = 0; size_t strIndex = 0; size_t numChars = 0; // Check parameters. if (string == NULL) { return 0; } while (string[strIndex] != '\0') { numBytes = NumBytesInChar(string[strIndex]); if (numBytes < 0) { return LE_FORMAT_ERROR; } // Go through the bytes in this character to make sure all bytes are formatted correctly. for (i = 1; i < numBytes; i++) { if ( !IS_CONTINUATION_BYTE(string[++strIndex]) ) { return LE_FORMAT_ERROR; } } // This character is correct. numChars++; // Move on. strIndex++; } return numChars; }
//--------------------------------------------------------------------------------------------------
/**
 * Checks that every character in a NUL-terminated string has a correctly formatted byte layout.
 *
 * @return
 *      - true if every lead byte is accepted by NumBytesInChar() and is followed by the expected
 *        number of bytes that pass the IS_CONTINUATION_BYTE() test (an empty string is accepted).
 *      - false if string is NULL, if NumBytesInChar() reports a negative length for a lead byte,
 *        or if an expected continuation byte fails the test.
 */
//--------------------------------------------------------------------------------------------------
bool le_utf8_IsFormatCorrect
(
    const char* string      ///< [IN] The string.
)
{
    size_t strIndex = 0;

    // Check parameters.
    if (string == NULL)
    {
        return false;
    }

    while (string[strIndex] != '\0')
    {
        // Length of the character that starts here, as reported by the lead byte.
        int8_t numBytes = NumBytesInChar(string[strIndex]);

        if (numBytes < 0)
        {
            return false;
        }

        // Go through the remaining bytes of this character to make sure they are all formatted
        // correctly.
        for (int i = 1; i < numBytes; i++)
        {
            // Advance in a separate statement: IS_CONTINUATION_BYTE is a macro, so its argument
            // must not carry a side effect that could be evaluated more than once.
            strIndex++;

            if ( !IS_CONTINUATION_BYTE(string[strIndex]) )
            {
                return false;
            }
        }

        // Move on to the lead byte of the next character.
        strIndex++;
    }

    return true;
}
/**
 * Decode the UTF-8 sequence that starts at buf[*iindex].
 *
 * On success the decoded code point is returned and *iindex is moved to the
 * first byte after the sequence. Sequences of one to six bytes are accepted.
 *
 * Returns 0, leaving *iindex untouched, when the byte at *iindex is the null
 * terminator (there is no next character).
 *
 * When the sequence is malformed (unrecognised lead byte, a missing or
 * rejected continuation byte, or a multi-byte sequence that decodes to 0)
 * the return value is ERROR_REPLACEMENT_BASE OR'ed with the original lead
 * byte, and *iindex is advanced by exactly one byte.
 *
 * Note: overlong forms and some other broken cases are not currently
 * detected; only a multi-byte result of 0 is refused.
 */
EAPI Eina_Unicode
evas_common_encoding_utf8_get_next(const char *buf, int *iindex)
{
   const int start = *iindex;
   int pos = start;
   int trailing = 0;   /* continuation bytes announced by the lead byte */
   int lead_mask = 0;  /* payload bits carried by the lead byte */
   unsigned char byte;

   byte = buf[pos++];

   /* Null terminator: nothing left to decode. */
   if (byte == 0) return 0;

   if ((byte & 0x80) == 0)
     {
        /* 1 byte (7bit) - 0xxxxxxx */
        *iindex = pos;
        return byte;
     }

   /* Classify the lead byte of a multi-byte sequence. */
   if ((byte & 0xe0) == 0xc0)
     {
        /* 2 byte (11bit) - 110xxxxx 10xxxxxx */
        trailing = 1;
        lead_mask = 0x1f;
     }
   else if ((byte & 0xf0) == 0xe0)
     {
        /* 3 byte (16bit) - 1110xxxx 10xxxxxx 10xxxxxx */
        trailing = 2;
        lead_mask = 0x0f;
     }
   else if ((byte & 0xf8) == 0xf0)
     {
        /* 4 byte (21bit) - 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        trailing = 3;
        lead_mask = 0x07;
     }
   else if ((byte & 0xfc) == 0xf8)
     {
        /* 5 byte (26bit) - 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
        trailing = 4;
        lead_mask = 0x03;
     }
   else if ((byte & 0xfe) == 0xfc)
     {
        /* 6 byte (31bit) - 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
        trailing = 5;
        lead_mask = 0x01;
     }

   if (trailing > 0)
     {
        int shift = 6 * trailing;
        int valid = 1;
        Eina_Unicode code = (byte & lead_mask) << shift;

        /* Fold in six payload bits per continuation byte, high bits first. */
        while (shift > 0)
          {
             shift -= 6;
             byte = buf[pos++];
             if ((byte == 0) || IS_INVALID_BYTE(byte) ||
                 !IS_CONTINUATION_BYTE(byte))
               {
                  valid = 0;
                  break;
               }
             code |= (byte & 0x3f) << shift;
          }

        /* A multi-byte sequence that decodes to 0 is treated as an error. */
        if (valid && code)
          {
             *iindex = pos;
             return code;
          }
     }

   /* Error: substitute a replacement value whose low 8 bits are the
    * offending lead byte, and skip that single byte only. */
   byte = buf[start];
   *iindex = start + 1;
   return ERROR_REPLACEMENT_BASE | byte;
}