Unicode * Unicode_AllocList(char **srcList, // IN: list of strings ssize_t length, // IN: list StringEncoding encoding) // IN: { Unicode *dstList = NULL; ssize_t i; ASSERT(srcList != NULL); encoding = Unicode_ResolveEncoding(encoding); if (length < 0) { length = 0; while (srcList[length] != NULL) { length++; } /* Include the sentinel element. */ length++; } dstList = Util_SafeMalloc(length * sizeof *dstList); for (i = 0; i < length; i++) { dstList[i] = Unicode_Alloc(srcList[i], encoding); } return dstList; }
void * UnicodeGetAllocBytesInternal(ConstUnicode ustr, // IN StringEncoding encoding, // IN ssize_t lengthInBytes, // IN size_t *retLength) // OUT: optional { const char *utf8Str = ustr; char *result = NULL; ASSERT(ustr != NULL); encoding = Unicode_ResolveEncoding(encoding); if (lengthInBytes == -1) { lengthInBytes = Unicode_LengthInBytes(ustr, STRING_ENCODING_UTF8); } switch (encoding) { case STRING_ENCODING_US_ASCII: if (!UnicodeSanityCheck(utf8Str, lengthInBytes, encoding)) { break; } // fall through case STRING_ENCODING_UTF8: result = Util_SafeMalloc(lengthInBytes + 1); memcpy(result, utf8Str, lengthInBytes + 1); if (retLength != NULL) { *retLength = lengthInBytes; } break; case STRING_ENCODING_UTF16_LE: if (!CodeSet_Utf8ToUtf16le(utf8Str, lengthInBytes, &result, retLength)) { // input should be valid UTF-8, no conversion error possible ASSERT_MEM_ALLOC(FALSE); } break; default: if (!CodeSet_GenericToGeneric("UTF-8", utf8Str, lengthInBytes, Unicode_EncodingEnumToName(encoding), CSGTG_NORMAL, &result, retLength)) { // XXX can't distinguish error cause ASSERT(result == NULL); } } return result; }
char * Unicode_EscapeBuffer(const void *buffer, // IN ssize_t lengthInBytes, // IN StringEncoding encoding) // IN { encoding = Unicode_ResolveEncoding(encoding); if (lengthInBytes == -1) { lengthInBytes = Unicode_LengthInBytes(buffer, encoding); } /* * The buffer could have NULs or 8-bit values inside. Escape it. */ return Escape_DoString("\\x", NonPrintableBytesToEscape, buffer, lengthInBytes, NULL); }
Unicode Unicode_AllocWithLength(const void *buffer, // IN: ssize_t lengthInBytes, // IN: StringEncoding encoding) // IN: { Unicode result; ASSERT(lengthInBytes >= 0 || lengthInBytes == -1); if (buffer == NULL) { ASSERT(lengthInBytes <= 0); return NULL; } encoding = Unicode_ResolveEncoding(encoding); if (lengthInBytes == -1) { lengthInBytes = Unicode_LengthInBytes(buffer, encoding); } result = UnicodeAllocInternal(buffer, lengthInBytes, encoding, FALSE); if (result == NULL) { char *escapedBuffer = Unicode_EscapeBuffer(buffer, lengthInBytes, encoding); /* * Log and panic on failure. */ Log("%s: Couldn't convert invalid buffer [%s] from %s to Unicode.\n", __FUNCTION__, escapedBuffer ? escapedBuffer : "(couldn't escape bytes)", Unicode_EncodingEnumToName(encoding)); free(escapedBuffer); PANIC(); } return result; }
ssize_t Unicode_LengthInBytes(const void *buffer, // IN StringEncoding encoding) // IN { ssize_t len; encoding = Unicode_ResolveEncoding(encoding); switch (encoding) { case STRING_ENCODING_UTF32_LE: case STRING_ENCODING_UTF32_BE: case STRING_ENCODING_UTF32_XE: { const int32 *p; for (p = buffer; *p != 0; p++) { } len = (const char *) p - (const char *) buffer; break; } case STRING_ENCODING_UTF16_LE: case STRING_ENCODING_UTF16_BE: case STRING_ENCODING_UTF16_XE: { const utf16_t *p; for (p = buffer; *p != 0; p++) { } len = (const char *) p - (const char *) buffer; break; } default: // XXX assume 8-bit encoding with no embedded null len = strlen(buffer); } return len; }
Bool Unicode_IsBufferValid(const void *buffer, // IN ssize_t lengthInBytes, // IN StringEncoding encoding) // IN { if (buffer == NULL) { ASSERT(lengthInBytes <= 0); return TRUE; } encoding = Unicode_ResolveEncoding(encoding); if (encoding == STRING_ENCODING_US_ASCII) { return UnicodeSanityCheck(buffer, lengthInBytes, encoding); } if (lengthInBytes == -1) { lengthInBytes = Unicode_LengthInBytes(buffer, encoding); } return CodeSet_Validate(buffer, lengthInBytes, Unicode_EncodingEnumToName(encoding)); }
Bool Unicode_CopyBytes(void *destBuffer, // OUT ConstUnicode srcBuffer, // IN size_t maxLengthInBytes, // IN size_t *retLength, // OUT StringEncoding encoding) // IN { const char *utf8Str = (const char *)srcBuffer; Bool success = FALSE; size_t copyBytes = 0; encoding = Unicode_ResolveEncoding(encoding); switch (encoding) { case STRING_ENCODING_US_ASCII: if (!UnicodeSanityCheck(utf8Str, -1, encoding)) { break; } // fall through case STRING_ENCODING_UTF8: { size_t len = strlen(utf8Str); copyBytes = MIN(len, maxLengthInBytes - 1); memcpy(destBuffer, utf8Str, copyBytes); /* * If we truncated, force a null termination in a UTF-8 safe * manner. */ if (copyBytes >= len) { success = TRUE; } else { if (encoding == STRING_ENCODING_UTF8) { copyBytes = CodeSet_Utf8FindCodePointBoundary(destBuffer, copyBytes); } } ((char*)destBuffer)[copyBytes] = '\0'; } break; case STRING_ENCODING_UTF16_LE: { char *utf16Buf; size_t utf16BufLen; if (!CodeSet_Utf8ToUtf16le(utf8Str, strlen(utf8Str), &utf16Buf, &utf16BufLen)) { // input should be valid UTF-8, no conversion error possible ASSERT_MEM_ALLOC(FALSE); break; } copyBytes = MIN(utf16BufLen, maxLengthInBytes - 2); memcpy(destBuffer, utf16Buf, copyBytes); copyBytes = CodeSet_Utf16FindCodePointBoundary(destBuffer, copyBytes); ((utf16_t*)destBuffer)[copyBytes / 2] = 0; free(utf16Buf); if (copyBytes >= utf16BufLen) { success = TRUE; } break; } default: { char *currentBuf; size_t currentBufSize; if (!CodeSet_GenericToGeneric("UTF-8", utf8Str, strlen(utf8Str), Unicode_EncodingEnumToName(encoding), CSGTG_NORMAL, ¤tBuf, ¤tBufSize)) { // XXX can't distinguish error cause break; } copyBytes = MIN(currentBufSize, maxLengthInBytes - 1); memcpy(destBuffer, currentBuf, copyBytes); free(currentBuf); /* * XXX this isn't quite correct, we still need to truncate on * a code point boundary, based on the current encoding type, * rather than just null terminate blindly. */ ((char*)destBuffer)[copyBytes] = 0; if (copyBytes >= currentBufSize) { success = TRUE; } } break; } if (retLength) { *retLength = copyBytes; } return success; }
size_t Unicode_BytesRequired(ConstUnicode str, // IN StringEncoding encoding) // IN { const uint8 *utf8 = (const uint8 *)str; // Number of bytes needed for a code point [U+0000, U+FFFF]. size_t basicCodePointSize; // Number of bytes needed for a code point [U+10000, U+10FFFF]. size_t supplementaryCodePointSize; size_t result = 0; encoding = Unicode_ResolveEncoding(encoding); switch (encoding) { case STRING_ENCODING_UTF8: return strlen((const char *)utf8) + 1; case STRING_ENCODING_US_ASCII: case STRING_ENCODING_ISO_8859_1: case STRING_ENCODING_WINDOWS_1252: // TODO: Lots more encodings can be added here. basicCodePointSize = supplementaryCodePointSize = 1; break; case STRING_ENCODING_UTF16_LE: case STRING_ENCODING_UTF16_BE: case STRING_ENCODING_UTF16_XE: basicCodePointSize = 2; supplementaryCodePointSize = 4; break; case STRING_ENCODING_UTF32_LE: case STRING_ENCODING_UTF32_BE: case STRING_ENCODING_UTF32_XE: basicCodePointSize = 4; supplementaryCodePointSize = 4; break; default: /* * Assume the worst: ISO-2022-JP takes up to 7 bytes per code point. */ basicCodePointSize = 7; supplementaryCodePointSize = 7; break; } /* * Do a simple check of how many bytes are needed to convert the * UTF-8 to the target encoding. This doesn't do UTF-8 validity * checking, but will not overrun the end of the buffer. */ while (*utf8) { size_t utf8NumBytesRemaining; // Advance one code point forward in the UTF-8 input. if (*utf8 <= 0x7F) { utf8NumBytesRemaining = 1; result += basicCodePointSize; } else if (*utf8 & 0xC0) { utf8NumBytesRemaining = 2; result += basicCodePointSize; } else if (*utf8 & 0xE0) { utf8NumBytesRemaining = 3; result += basicCodePointSize; } else if (*utf8 & 0xF0) { utf8NumBytesRemaining = 4; result += supplementaryCodePointSize; } else { // Invalid input; nothing we can do. break; } while (*utf8 && utf8NumBytesRemaining) { utf8NumBytesRemaining--; utf8++; } if (utf8NumBytesRemaining > 0) { // Invalid input; nothing we can do. break; } } // Add enough for NUL expressed in the target encoding. result += UNICODE_UTF16_CODE_UNITS_PADDING * basicCodePointSize; return result; }