size_t utf8totitle(const char* input, size_t inputSize, char* target, size_t targetSize, int32_t* errors) { CaseMappingState state; size_t bytes_written = 0; /* Validate parameters */ UTF8_VALIDATE_PARAMETERS_CHAR(char, bytes_written); /* Initialize case mapping */ if (!casemapping_initialize( &state, input, inputSize, target, targetSize, TitlecaseIndex1Ptr, TitlecaseIndex2Ptr, TitlecaseDataPtr)) { UTF8_SET_ERROR(NONE); return bytes_written; } /* Execute case mapping as long as input remains */ while (state.src_size > 0) { size_t result = casemapping_execute(&state); if (!result) { UTF8_SET_ERROR(NOT_ENOUGH_SPACE); return bytes_written; } /* The first letter of every word should be titlecase, the rest lowercase */ if (state.property_data == TitlecaseDataPtr) { if ((state.last_general_category & GeneralCategory_Letter) != 0) { state.property_index1 = LowercaseIndex1Ptr; state.property_index2 = LowercaseIndex2Ptr; state.property_data = LowercaseDataPtr; } } else if ( (state.last_general_category & GeneralCategory_Letter) == 0) { state.property_index1 = TitlecaseIndex1Ptr; state.property_index2 = TitlecaseIndex2Ptr; state.property_data = TitlecaseDataPtr; } bytes_written += result; } UTF8_SET_ERROR(NONE); return bytes_written; }
/*
 * Decode UTF-8 text into an array of unicode_t values.
 *
 * input/inputSize    source text and its size in bytes
 * target/targetSize  destination buffer and its size in bytes; when target
 *                    is 0 nothing is stored and only the size is tallied
 * errors             optional out-parameter written through UTF8_SET_ERROR
 *
 * Returns sizeof(unicode_t) multiplied by the number of code points handled.
 * If the destination cannot hold the next value the error is set to
 * NOT_ENOUGH_SPACE and the tally so far is returned.
 */
size_t utf8toutf32(const char* input, size_t inputSize, unicode_t* target, size_t targetSize, int32_t* errors)
{
    const char* cursor;
    size_t remaining;
    unicode_t* out;
    size_t out_room;
    size_t total = 0;

    /* Argument checking is delegated to this macro (defined elsewhere) */
    UTF8_VALIDATE_PARAMETERS(char, unicode_t, total);

    cursor = input;
    remaining = inputSize;
    out = target;
    out_room = targetSize;

    while (remaining > 0) {
        unicode_t cp;
        uint8_t consumed;

        consumed = codepoint_read(cursor, remaining, &cp);

        if (out != 0) {
            /* Storing is only attempted when a destination was supplied */
            if (out_room < sizeof(unicode_t)) {
                UTF8_SET_ERROR(NOT_ENOUGH_SPACE);
                return total;
            }

            *out = cp;
            ++out;
            out_room -= sizeof(unicode_t);
        }

        total += sizeof(unicode_t);

        /* Step past the bytes codepoint_read() reported as consumed */
        cursor += consumed;
        remaining -= consumed;
    }

    UTF8_SET_ERROR(NONE);
    return total;
}
size_t utf8tolower(const char* input, size_t inputSize, char* target, size_t targetSize, int32_t* errors) { CaseMappingState state; size_t bytes_written = 0; /* Validate parameters */ UTF8_VALIDATE_PARAMETERS_CHAR(char, bytes_written); /* Initialize case mapping */ if (!casemapping_initialize( &state, input, inputSize, target, targetSize, LowercaseIndex1Ptr, LowercaseIndex2Ptr, LowercaseDataPtr)) { UTF8_SET_ERROR(NONE); return bytes_written; } /* Execute case mapping as long as input remains */ while (state.src_size > 0) { size_t result = casemapping_execute(&state); if (!result) { UTF8_SET_ERROR(NOT_ENOUGH_SPACE); return bytes_written; } bytes_written += result; } UTF8_SET_ERROR(NONE); return bytes_written; }
size_t utf8tolower(const char* input, size_t inputSize, char* target, size_t targetSize, size_t locale, int32_t* errors) { CaseMappingState state; /* Validate parameters */ UTF8_VALIDATE_PARAMETERS_CHAR(char, 0); /* Initialize case mapping */ if (!casemapping_initialize( &state, input, inputSize, target, targetSize, LowercaseIndex1Ptr, LowercaseIndex2Ptr, LowercaseDataPtr, QuickCheckCaseMapped_Lowercase, locale, errors)) { return state.total_bytes_needed; } /* Execute case mapping as long as input remains */ while (state.src_size > 0) { size_t converted; if ((converted = casemapping_execute(&state, errors)) == 0) { return state.total_bytes_needed; } state.total_bytes_needed += converted; } UTF8_SET_ERROR(NONE); return state.total_bytes_needed; }
size_t utf16toutf8(const utf16_t* input, size_t inputSize, char* target, size_t targetSize, int32_t* errors) { const utf16_t* src; size_t src_size; char* dst; size_t dst_size; size_t bytes_written = 0; /* Validate parameters */ UTF8_VALIDATE_PARAMETERS_CHAR(utf16_t, bytes_written); UTF8_SET_ERROR(NONE); /* Setup cursors */ src = input; src_size = inputSize; dst = target; dst_size = targetSize; /* Loop over input */ while (src_size > 0) { unicode_t codepoint; uint8_t encoded_size; if (src_size < sizeof(utf16_t)) { /* Not enough data */ goto invaliddata; } codepoint = (unicode_t)*src; if (codepoint >= SURROGATE_HIGH_START && codepoint <= SURROGATE_LOW_END) { /* Decode surrogate pair */ if (codepoint > SURROGATE_HIGH_END) { /* Missing high surrogate codepoint */ codepoint = REPLACEMENT_CHARACTER; UTF8_SET_ERROR(INVALID_DATA); } else if ( src_size < 2 * sizeof(utf16_t)) { /* Not enough data */ goto invaliddata; } else { /* Read low surrogate codepoint */ if (src[1] < SURROGATE_LOW_START || src[1] > SURROGATE_LOW_END) { /* Missing low surrogate codepoint */ codepoint = REPLACEMENT_CHARACTER; UTF8_SET_ERROR(INVALID_DATA); } else { /* Decode codepoint from surrogate pair */ codepoint = (MAX_BASIC_MULTILINGUAL_PLANE + 1) + (src[1] - SURROGATE_LOW_START) + ((src[0] - SURROGATE_HIGH_START) << 10); src++; src_size -= sizeof(utf16_t); } } } encoded_size = codepoint_write(codepoint, &dst, &dst_size); if (encoded_size == 0) { UTF8_SET_ERROR(NOT_ENOUGH_SPACE); return bytes_written; } bytes_written += encoded_size; src++; src_size -= sizeof(utf16_t); } return bytes_written; invaliddata: if (dst != 0) { if (dst_size < REPLACEMENT_CHARACTER_STRING_LENGTH) { UTF8_SET_ERROR(NOT_ENOUGH_SPACE); return bytes_written; } /* Write replacement codepoint to output */ memcpy(dst, REPLACEMENT_CHARACTER_STRING, REPLACEMENT_CHARACTER_STRING_LENGTH); } UTF8_SET_ERROR(INVALID_DATA); return bytes_written + REPLACEMENT_CHARACTER_STRING_LENGTH; }
size_t utf8casefold(const char* input, size_t inputSize, char* target, size_t targetSize, size_t locale, int32_t* errors) { CaseMappingState state; /* Validate parameters */ UTF8_VALIDATE_PARAMETERS_CHAR(char, 0); /* Initialize case mapping */ if (!casemapping_initialize( &state, input, inputSize, target, targetSize, CaseFoldingIndex1Ptr, CaseFoldingIndex2Ptr, CaseFoldingDataPtr, QuickCheckCaseMapped_Casefolded, locale, errors)) { return state.total_bytes_needed; } if (state.locale == UTF8_LOCALE_TURKISH_AND_AZERI_LATIN) { /* Exceptional behavior for Turkish and Azerbaijani (Latin) locales */ while (state.src_size > 0) { const char* resolved = 0; uint8_t bytes_needed = 0; /* Read next code point */ if (!(state.last_code_point_size = codepoint_read(state.src, state.src_size, &state.last_code_point))) { goto invaliddata; } /* Move source cursor */ if (state.src_size >= state.last_code_point_size) { state.src += state.last_code_point_size; state.src_size -= state.last_code_point_size; } else { state.src_size = 0; } /* Resolve case folding */ if ((PROPERTY_GET_CM(state.last_code_point) & QuickCheckCaseMapped_Casefolded) != 0) { if (state.last_code_point == CP_LATIN_CAPITAL_LETTER_I) { resolved = "\xC4\xB1"; bytes_needed = 2; } else if ( state.last_code_point == CP_LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) { resolved = "i"; bytes_needed = 1; } else { resolved = database_querydecomposition(state.last_code_point, state.property_index1, state.property_index2, state.property_data, &bytes_needed); } } /* Write to output */ if (resolved != 0) { /* Write resolved string to output */ if (state.dst != 0) { if (state.dst_size < bytes_needed) { goto outofspace; } memcpy(state.dst, resolved, bytes_needed); state.dst += bytes_needed; state.dst_size -= bytes_needed; } } else { /* Write code point unchanged to output */ if (!(bytes_needed = codepoint_write(state.last_code_point, &state.dst, &state.dst_size))) { goto outofspace; } } state.total_bytes_needed += bytes_needed; } } else { /* 
Execute case mapping as long as input remains */ while (state.src_size > 0) { const char* resolved = 0; uint8_t bytes_needed = 0; /* Read next code point */ if (!(state.last_code_point_size = codepoint_read(state.src, state.src_size, &state.last_code_point))) { goto invaliddata; } /* Move source cursor */ if (state.src_size >= state.last_code_point_size) { state.src += state.last_code_point_size; state.src_size -= state.last_code_point_size; } else { state.src_size = 0; } /* Resolve case folding */ if ((PROPERTY_GET_CM(state.last_code_point) & QuickCheckCaseMapped_Casefolded) != 0) { resolved = database_querydecomposition(state.last_code_point, state.property_index1, state.property_index2, state.property_data, &bytes_needed); } if (resolved != 0) { /* Write resolved string to output */ if (state.dst != 0) { if (state.dst_size < bytes_needed) { goto outofspace; } memcpy(state.dst, resolved, bytes_needed); state.dst += bytes_needed; state.dst_size -= bytes_needed; } } else { /* Write code point unchanged to output */ if (!(bytes_needed = codepoint_write(state.last_code_point, &state.dst, &state.dst_size))) { goto outofspace; } } state.total_bytes_needed += bytes_needed; } } UTF8_SET_ERROR(NONE); return state.total_bytes_needed; invaliddata: UTF8_SET_ERROR(INVALID_DATA); return state.total_bytes_needed; outofspace: UTF8_SET_ERROR(NOT_ENOUGH_SPACE); return state.total_bytes_needed; }
size_t utf8totitle(const char* input, size_t inputSize, char* target, size_t targetSize, size_t locale, int32_t* errors) { CaseMappingState state; /* Validate parameters */ UTF8_VALIDATE_PARAMETERS_CHAR(char, 0); /* Initialize case mapping */ if (!casemapping_initialize( &state, input, inputSize, target, targetSize, TitlecaseIndex1Ptr, TitlecaseIndex2Ptr, TitlecaseDataPtr, QuickCheckCaseMapped_Titlecase, locale, errors)) { return state.total_bytes_needed; } /* Execute case mapping as long as input remains */ while (state.src_size > 0) { size_t converted; if ((converted = casemapping_execute(&state, errors)) == 0) { return state.total_bytes_needed; } /* The first letter of every word should be titlecase, the rest should be converted to lowercase. */ if (state.last_canonical_combining_class == CCC_NOT_REORDERED) { if (state.property_data == TitlecaseDataPtr) { if ((state.last_general_category & UTF8_CATEGORY_LETTER) != 0) { state.property_index1 = LowercaseIndex1Ptr; state.property_index2 = LowercaseIndex2Ptr; state.property_data = LowercaseDataPtr; state.quickcheck_flags = QuickCheckCaseMapped_Lowercase; } } else if ( (state.last_general_category & UTF8_CATEGORY_LETTER) == 0) { state.property_index1 = TitlecaseIndex1Ptr; state.property_index2 = TitlecaseIndex2Ptr; state.property_data = TitlecaseDataPtr; state.quickcheck_flags = QuickCheckCaseMapped_Titlecase; } } state.total_bytes_needed += converted; } UTF8_SET_ERROR(NONE); return state.total_bytes_needed; }
/*
 * Re-encode UTF-8 text as UTF-16.
 *
 * input/inputSize    source text and its size in bytes
 * target/targetSize  destination buffer and its size in bytes; when target
 *                    is 0 nothing is stored and only the size is tallied
 * errors             optional out-parameter written through UTF8_SET_ERROR
 *
 * Code points up to MAX_BASIC_MULTILINGUAL_PLANE occupy one utf16_t; larger
 * ones are split into a high/low surrogate pair.
 *
 * Returns the number of bytes accounted for. If the destination cannot hold
 * the next code point the error is set to NOT_ENOUGH_SPACE and the tally so
 * far is returned.
 */
size_t utf8toutf16(const char* input, size_t inputSize, utf16_t* target, size_t targetSize, int32_t* errors)
{
    const char* cursor;
    size_t remaining;
    utf16_t* out;
    size_t out_room;
    size_t total = 0;

    /* Argument checking is delegated to this macro (defined elsewhere) */
    UTF8_VALIDATE_PARAMETERS(char, utf16_t, total);

    cursor = input;
    remaining = inputSize;
    out = target;
    out_room = targetSize;

    while (remaining > 0) {
        unicode_t cp;
        uint8_t consumed;
        size_t needed;

        consumed = codepoint_read(cursor, remaining, &cp);

        /* One unit inside the BMP, two (a surrogate pair) beyond it */
        if (cp <= MAX_BASIC_MULTILINGUAL_PLANE) {
            needed = sizeof(utf16_t);
        } else {
            needed = 2 * sizeof(utf16_t);
        }

        if (out != 0) {
            if (out_room < needed) {
                UTF8_SET_ERROR(NOT_ENOUGH_SPACE);
                return total;
            }

            if (cp <= MAX_BASIC_MULTILINGUAL_PLANE) {
                *out++ = (utf16_t)cp;
            } else {
                /* Rebase to zero, then split into the upper and lower 10 bits */
                cp -= (MAX_BASIC_MULTILINGUAL_PLANE + 1);
                *out++ = SURROGATE_HIGH_START + (cp >> 10);
                *out++ = SURROGATE_LOW_START + (cp & 0x03FF);
            }

            out_room -= needed;
        }

        total += needed;

        /* Step past the bytes codepoint_read() reported as consumed */
        cursor += consumed;
        remaining -= consumed;
    }

    UTF8_SET_ERROR(NONE);
    return total;
}
size_t utf8normalize(const char* input, size_t inputSize, char* target, size_t targetSize, size_t flags, int32_t* errors) { char* dst = target; size_t dst_size = targetSize; StreamState stream[4]; DecomposeState decompose_state; ComposeState compose_state; uint8_t compatibility = (flags & UTF8_NORMALIZE_COMPATIBILITY) != 0; StreamState* stream_output; uint8_t finished = 0; size_t bytes_written = 0; /* Decomposition uses the following process: input --> stream[0] --> (decompose) --> stream[1] --> (accumulate) --> stream[2] --> output The accumulation step is necessary in order to prevent buffer overflow attacks. Composition adds another stream buffer: input --> stream[0] --> (decompose) --> stream[1] --> (accumulate) --> stream[2] --> (compose) --> stream[3] --> output Although four streaming buffers may seem excessive, they are necessary for preventing allocations on the heap. */ /* Check for valid flags */ if ((flags & (UTF8_NORMALIZE_DECOMPOSE | UTF8_NORMALIZE_COMPOSE)) == 0) { UTF8_SET_ERROR(INVALID_FLAG); return bytes_written; } /* Validate parameters */ UTF8_VALIDATE_PARAMETERS_CHAR(char, bytes_written); /* Initialize decomposition */ memset(stream, 0, sizeof(stream)); if (!stream_initialize(&stream[0], input, inputSize) || !decompose_initialize(&decompose_state, &stream[0], &stream[1], compatibility)) { UTF8_SET_ERROR(INVALID_DATA); return bytes_written; } stream_output = &stream[2]; if ((flags & UTF8_NORMALIZE_COMPOSE) != 0) { /* Initialize composition */ if (!compose_initialize(&compose_state, &stream[2], &stream[3], compatibility)) { UTF8_SET_ERROR(INVALID_DATA); return bytes_written; } stream_output = &stream[3]; } do { uint8_t write = 0; /* Accumulate decomposed input in next stream */ if (stream[1].current > 0) { unicode_t* src_codepoint = stream[1].codepoint; unicode_t* dst_codepoint = stream[2].codepoint + stream[2].filled; uint8_t* src_qc = stream[1].quick_check; uint8_t* dst_qc = stream[2].quick_check + stream[2].filled; uint8_t* src_ccc = 
stream[1].canonical_combining_class; uint8_t* dst_ccc = stream[2].canonical_combining_class + stream[2].filled; if ((flags & UTF8_NORMALIZE_COMPOSE) != 0) { uint8_t i; /* Update stream properties to use composition values */ for (i = 0; i < stream[1].current; ++i) { *dst_qc++ = PROPERTY_GET(compose_state.qc_index, compose_state.qc_data, *src_codepoint); *dst_ccc++ = *src_ccc++; *dst_codepoint++ = *src_codepoint++; } } else { /* Copy directly */ memcpy(dst_codepoint, src_codepoint, stream[1].current * sizeof(unicode_t)); memcpy(dst_qc, src_qc, stream[1].current * sizeof(uint8_t)); memcpy(dst_ccc, src_ccc, stream[1].current * sizeof(uint8_t)); } stream[2].current += stream[1].current; stream[2].filled += stream[1].current; } /* Decompose input sequence into next stream */ finished = !decompose_execute(&decompose_state); if (!finished) { /* Output current stream it it could overflow accumulation buffer */ write = (stream[1].current + stream[2].filled) >= STREAM_SAFE_MAX; } /* Reorder potentially unordered decomposed stream */ if (!stream[1].stable) { stream_reorder(&stream[1]); } /* Write stream to output when overflowing or when accumulation buffer is empty*/ if (write || finished) { uint8_t i; /* Compose accumulation buffer */ if ((flags & UTF8_NORMALIZE_COMPOSE) != 0 && !compose_execute(&compose_state)) { break; } /* Write to output buffer */ for (i = 0; i < stream_output->current; ++i) { uint8_t encoded_size = codepoint_write(stream_output->codepoint[i], &dst, &dst_size); if (encoded_size == 0) { UTF8_SET_ERROR(NOT_ENOUGH_SPACE); return bytes_written; } bytes_written += encoded_size; } /* Reset accumulation buffer */ stream[2].current = 0; stream[2].filled = 0; } } while (!finished); UTF8_SET_ERROR(NONE); return bytes_written; }