Unicode *
Unicode_AllocList(char **srcList,          // IN: list of strings
                  ssize_t length,          // IN: list 
                  StringEncoding encoding) // IN:
{
   Unicode *dstList = NULL;
   ssize_t i;

   ASSERT(srcList != NULL);

   encoding = Unicode_ResolveEncoding(encoding);

   if (length < 0) {
      length = 0;
      while (srcList[length] != NULL) {
         length++;
      }
      
      /* Include the sentinel element. */
      length++;
   }

   dstList = Util_SafeMalloc(length * sizeof *dstList);

   for (i = 0; i < length; i++) {
      dstList[i] = Unicode_Alloc(srcList[i], encoding);
   }

   return dstList;
}
void *
UnicodeGetAllocBytesInternal(ConstUnicode ustr,       // IN
                             StringEncoding encoding, // IN
                             ssize_t lengthInBytes,   // IN
                             size_t *retLength)       // OUT: optional
{
   const char *utf8Str = ustr;
   char *result = NULL;

   ASSERT(ustr != NULL);

   encoding = Unicode_ResolveEncoding(encoding);

   if (lengthInBytes == -1) {
      lengthInBytes = Unicode_LengthInBytes(ustr, STRING_ENCODING_UTF8);
   }

   switch (encoding) {
   case STRING_ENCODING_US_ASCII:
      if (!UnicodeSanityCheck(utf8Str, lengthInBytes, encoding)) {
	 break;
      }
      // fall through
   case STRING_ENCODING_UTF8:
      result = Util_SafeMalloc(lengthInBytes + 1);
      memcpy(result, utf8Str, lengthInBytes + 1);
      if (retLength != NULL) {
	 *retLength = lengthInBytes;
      }
      break;

   case STRING_ENCODING_UTF16_LE:
      if (!CodeSet_Utf8ToUtf16le(utf8Str, lengthInBytes, &result, retLength)) {
	 // input should be valid UTF-8, no conversion error possible
	 ASSERT_MEM_ALLOC(FALSE);
      }
      break;

   default:
      if (!CodeSet_GenericToGeneric("UTF-8", utf8Str, lengthInBytes,
				    Unicode_EncodingEnumToName(encoding),
				    CSGTG_NORMAL,
				    &result, retLength)) {
	 // XXX can't distinguish error cause
	 ASSERT(result == NULL);
      }
   }

   return result;
}
Esempio n. 3
0
char *
Unicode_EscapeBuffer(const void *buffer,      // IN
                     ssize_t lengthInBytes,   // IN
                     StringEncoding encoding) // IN
{
   encoding = Unicode_ResolveEncoding(encoding);

   if (lengthInBytes == -1) {
      lengthInBytes = Unicode_LengthInBytes(buffer, encoding);
   }

   /*
    * The buffer could have NULs or 8-bit values inside.  Escape it.
    */
   return Escape_DoString("\\x",
                          NonPrintableBytesToEscape,
                          buffer,
                          lengthInBytes,
                          NULL);
}
Esempio n. 4
0
Unicode
Unicode_AllocWithLength(const void *buffer,       // IN:
                        ssize_t lengthInBytes,    // IN:
                        StringEncoding encoding)  // IN:
{
   Unicode result;

   ASSERT(lengthInBytes >= 0 || lengthInBytes == -1);

   if (buffer == NULL) {
      ASSERT(lengthInBytes <= 0);
      return NULL;
   }

   encoding = Unicode_ResolveEncoding(encoding);

   if (lengthInBytes == -1) {
      lengthInBytes = Unicode_LengthInBytes(buffer, encoding);
   }

   result = UnicodeAllocInternal(buffer, lengthInBytes, encoding, FALSE);

   if (result == NULL) {
      char *escapedBuffer = Unicode_EscapeBuffer(buffer, lengthInBytes,
                                                 encoding);

      /*
       * Log and panic on failure.
       */

      Log("%s: Couldn't convert invalid buffer [%s] from %s to Unicode.\n",
          __FUNCTION__,
          escapedBuffer ? escapedBuffer : "(couldn't escape bytes)",
          Unicode_EncodingEnumToName(encoding));
      free(escapedBuffer);
      PANIC();
   }

   return result;
}
Esempio n. 5
0
ssize_t
Unicode_LengthInBytes(const void *buffer,      // IN
                      StringEncoding encoding) // IN
{
   ssize_t len;

   encoding = Unicode_ResolveEncoding(encoding);

   switch (encoding) {
   case STRING_ENCODING_UTF32_LE:
   case STRING_ENCODING_UTF32_BE:
   case STRING_ENCODING_UTF32_XE:
   {
      const int32 *p;

      for (p = buffer; *p != 0; p++) {
      }
      len = (const char *) p - (const char *) buffer;
      break;
   }
   case STRING_ENCODING_UTF16_LE:
   case STRING_ENCODING_UTF16_BE:
   case STRING_ENCODING_UTF16_XE:
   {
      const utf16_t *p;

      for (p = buffer; *p != 0; p++) {
      }
      len = (const char *) p - (const char *) buffer;
      break;
   }
   default:
      // XXX assume 8-bit encoding with no embedded null
      len = strlen(buffer);
   }

   return len;
}
Bool
Unicode_IsBufferValid(const void *buffer,      // IN
                      ssize_t lengthInBytes,   // IN
                      StringEncoding encoding) // IN
{
   if (buffer == NULL) {
      ASSERT(lengthInBytes <= 0);
      return TRUE;
   }

   encoding = Unicode_ResolveEncoding(encoding);

   if (encoding == STRING_ENCODING_US_ASCII) {
      return UnicodeSanityCheck(buffer, lengthInBytes, encoding);
   }

   if (lengthInBytes == -1) {
      lengthInBytes = Unicode_LengthInBytes(buffer, encoding);
   }

   return CodeSet_Validate(buffer, lengthInBytes,
			   Unicode_EncodingEnumToName(encoding));
}
Bool
Unicode_CopyBytes(void *destBuffer,        // OUT
                  ConstUnicode srcBuffer,  // IN
                  size_t maxLengthInBytes, // IN
                  size_t *retLength,       // OUT
                  StringEncoding encoding) // IN
{
   const char *utf8Str = (const char *)srcBuffer;
   Bool success = FALSE;
   size_t copyBytes = 0;

   encoding = Unicode_ResolveEncoding(encoding);

   switch (encoding) {
   case STRING_ENCODING_US_ASCII:
      if (!UnicodeSanityCheck(utf8Str, -1, encoding)) {
	 break;
      }
      // fall through
   case STRING_ENCODING_UTF8:
      {
         size_t len = strlen(utf8Str);
         copyBytes = MIN(len, maxLengthInBytes - 1);
         memcpy(destBuffer, utf8Str, copyBytes);

         /*
          * If we truncated, force a null termination in a UTF-8 safe
          * manner.
          */
         if (copyBytes >= len) {
	    success = TRUE;
	 } else {
            if (encoding == STRING_ENCODING_UTF8) {
               copyBytes =
                  CodeSet_Utf8FindCodePointBoundary(destBuffer, copyBytes);
            }
         }

         ((char*)destBuffer)[copyBytes] = '\0';
      }
      break;
   case STRING_ENCODING_UTF16_LE:
      {
         char *utf16Buf;
         size_t utf16BufLen;

         if (!CodeSet_Utf8ToUtf16le(utf8Str,
                                    strlen(utf8Str),
                                    &utf16Buf,
                                    &utf16BufLen)) {
	    // input should be valid UTF-8, no conversion error possible
	    ASSERT_MEM_ALLOC(FALSE);
            break;
         }
         copyBytes = MIN(utf16BufLen, maxLengthInBytes - 2);
         memcpy(destBuffer, utf16Buf, copyBytes);
         copyBytes = CodeSet_Utf16FindCodePointBoundary(destBuffer, copyBytes);
         ((utf16_t*)destBuffer)[copyBytes / 2] = 0;
         free(utf16Buf);

         if (copyBytes >= utf16BufLen) {
            success = TRUE;
         }

         break;
      }
   default:
      {
         char *currentBuf;
         size_t currentBufSize;

	 if (!CodeSet_GenericToGeneric("UTF-8", utf8Str, strlen(utf8Str),
				       Unicode_EncodingEnumToName(encoding),
				       CSGTG_NORMAL,
				       &currentBuf, &currentBufSize)) {
	    // XXX can't distinguish error cause
	    break;
	 }
         copyBytes = MIN(currentBufSize, maxLengthInBytes - 1);
         memcpy(destBuffer, currentBuf, copyBytes);
         free(currentBuf);

         /* 
          * XXX this isn't quite correct, we still need to truncate on
          * a code point boundary, based on the current encoding type,
          * rather than just null terminate blindly.
          */

         ((char*)destBuffer)[copyBytes] = 0;

         if (copyBytes >= currentBufSize) {
            success = TRUE;
         }
      }
      break;
   }

   if (retLength) {
      *retLength = copyBytes;
   }
   return success;
}
size_t
Unicode_BytesRequired(ConstUnicode str,        // IN
                      StringEncoding encoding) // IN
{
   const uint8 *utf8 = (const uint8 *)str;

   // Number of bytes needed for a code point [U+0000, U+FFFF].
   size_t basicCodePointSize;

   // Number of bytes needed for a code point [U+10000, U+10FFFF].
   size_t supplementaryCodePointSize;

   size_t result = 0;

   encoding = Unicode_ResolveEncoding(encoding);

   switch (encoding) {
   case STRING_ENCODING_UTF8:
      return strlen((const char *)utf8) + 1;
   case STRING_ENCODING_US_ASCII:
   case STRING_ENCODING_ISO_8859_1:
   case STRING_ENCODING_WINDOWS_1252:
      // TODO: Lots more encodings can be added here.
      basicCodePointSize = supplementaryCodePointSize = 1;
      break;
   case STRING_ENCODING_UTF16_LE:
   case STRING_ENCODING_UTF16_BE:
   case STRING_ENCODING_UTF16_XE:
      basicCodePointSize = 2;
      supplementaryCodePointSize = 4;
      break;
   case STRING_ENCODING_UTF32_LE:
   case STRING_ENCODING_UTF32_BE:
   case STRING_ENCODING_UTF32_XE:
      basicCodePointSize = 4;
      supplementaryCodePointSize = 4;
      break;
   default:
      /*
       * Assume the worst: ISO-2022-JP takes up to 7 bytes per code point.
       */
      basicCodePointSize = 7;
      supplementaryCodePointSize = 7;
      break;
   }

   /*
    * Do a simple check of how many bytes are needed to convert the
    * UTF-8 to the target encoding.  This doesn't do UTF-8 validity
    * checking, but will not overrun the end of the buffer.
    */
   while (*utf8) {
      size_t utf8NumBytesRemaining;

      // Advance one code point forward in the UTF-8 input.
      if (*utf8 <= 0x7F) {
         utf8NumBytesRemaining = 1;
         result += basicCodePointSize;
      } else if (*utf8 & 0xC0) {
         utf8NumBytesRemaining = 2;
         result += basicCodePointSize;
      } else if (*utf8 & 0xE0) {
         utf8NumBytesRemaining = 3;
         result += basicCodePointSize;
      } else if (*utf8 & 0xF0) {
         utf8NumBytesRemaining = 4;
         result += supplementaryCodePointSize;
      } else {
         // Invalid input; nothing we can do.
         break;
      }

      while (*utf8 && utf8NumBytesRemaining) {
         utf8NumBytesRemaining--;
         utf8++;
      }

      if (utf8NumBytesRemaining > 0) {
         // Invalid input; nothing we can do.
         break;
      }
   }

   // Add enough for NUL expressed in the target encoding.
   result += UNICODE_UTF16_CODE_UNITS_PADDING * basicCodePointSize;

   return result;
}