Unicode
UnicodeAllocInternal(const void *buffer,      // IN
                     ssize_t lengthInBytes,   // IN
                     StringEncoding encoding, // IN
		     Bool strict)             // IN
{
   char *utf8Result = NULL;

   ASSERT(buffer != NULL);
   ASSERT(lengthInBytes >= 0);
   ASSERT(Unicode_IsEncodingValid(encoding));

   if (!strict) {
      CodeSet_GenericToGeneric(Unicode_EncodingEnumToName(encoding),
                               buffer, lengthInBytes,
			       "UTF-8", CSGTG_TRANSLIT, &utf8Result, NULL);
      return utf8Result;
   }

   switch (encoding) {
   case STRING_ENCODING_US_ASCII:
   case STRING_ENCODING_UTF8:
      if (Unicode_IsBufferValid(buffer, lengthInBytes, encoding)) {
	 utf8Result = Util_SafeStrndup(buffer, lengthInBytes);
      }
      break;
   case STRING_ENCODING_UTF16_LE:
      // utf8Result will be left NULL on failure.
      CodeSet_Utf16leToUtf8((const char *)buffer,
                            lengthInBytes,
                            &utf8Result,
                            NULL);
      break;
   default:
      CodeSet_GenericToGeneric(Unicode_EncodingEnumToName(encoding),
                               buffer, lengthInBytes,
			       "UTF-8", 0, &utf8Result, NULL);
      break;
   }

   return (Unicode)utf8Result;
}
void *
UnicodeGetAllocBytesInternal(ConstUnicode ustr,       // IN
                             StringEncoding encoding, // IN
                             ssize_t lengthInBytes,   // IN
                             size_t *retLength)       // OUT: optional
{
   const char *utf8Str = ustr;
   char *result = NULL;

   ASSERT(ustr != NULL);

   encoding = Unicode_ResolveEncoding(encoding);

   if (lengthInBytes == -1) {
      lengthInBytes = Unicode_LengthInBytes(ustr, STRING_ENCODING_UTF8);
   }

   switch (encoding) {
   case STRING_ENCODING_US_ASCII:
      if (!UnicodeSanityCheck(utf8Str, lengthInBytes, encoding)) {
	 break;
      }
      // fall through
   case STRING_ENCODING_UTF8:
      result = Util_SafeMalloc(lengthInBytes + 1);
      memcpy(result, utf8Str, lengthInBytes + 1);
      if (retLength != NULL) {
	 *retLength = lengthInBytes;
      }
      break;

   case STRING_ENCODING_UTF16_LE:
      if (!CodeSet_Utf8ToUtf16le(utf8Str, lengthInBytes, &result, retLength)) {
	 // input should be valid UTF-8, no conversion error possible
	 ASSERT_MEM_ALLOC(FALSE);
      }
      break;

   default:
      if (!CodeSet_GenericToGeneric("UTF-8", utf8Str, lengthInBytes,
				    Unicode_EncodingEnumToName(encoding),
				    CSGTG_NORMAL,
				    &result, retLength)) {
	 // XXX can't distinguish error cause
	 ASSERT(result == NULL);
      }
   }

   return result;
}
Ejemplo n.º 3
0
Unicode
Unicode_AllocWithLength(const void *buffer,       // IN:
                        ssize_t lengthInBytes,    // IN:
                        StringEncoding encoding)  // IN:
{
   Unicode result;

   ASSERT(lengthInBytes >= 0 || lengthInBytes == -1);

   if (buffer == NULL) {
      ASSERT(lengthInBytes <= 0);
      return NULL;
   }

   encoding = Unicode_ResolveEncoding(encoding);

   if (lengthInBytes == -1) {
      lengthInBytes = Unicode_LengthInBytes(buffer, encoding);
   }

   result = UnicodeAllocInternal(buffer, lengthInBytes, encoding, FALSE);

   if (result == NULL) {
      char *escapedBuffer = Unicode_EscapeBuffer(buffer, lengthInBytes,
                                                 encoding);

      /*
       * Log and panic on failure.
       */

      Log("%s: Couldn't convert invalid buffer [%s] from %s to Unicode.\n",
          __FUNCTION__,
          escapedBuffer ? escapedBuffer : "(couldn't escape bytes)",
          Unicode_EncodingEnumToName(encoding));
      free(escapedBuffer);
      PANIC();
   }

   return result;
}
Bool
Unicode_IsBufferValid(const void *buffer,      // IN
                      ssize_t lengthInBytes,   // IN
                      StringEncoding encoding) // IN
{
   if (buffer == NULL) {
      ASSERT(lengthInBytes <= 0);
      return TRUE;
   }

   encoding = Unicode_ResolveEncoding(encoding);

   if (encoding == STRING_ENCODING_US_ASCII) {
      return UnicodeSanityCheck(buffer, lengthInBytes, encoding);
   }

   if (lengthInBytes == -1) {
      lengthInBytes = Unicode_LengthInBytes(buffer, encoding);
   }

   return CodeSet_Validate(buffer, lengthInBytes,
			   Unicode_EncodingEnumToName(encoding));
}
Bool
Unicode_CopyBytes(void *destBuffer,        // OUT
                  ConstUnicode srcBuffer,  // IN
                  size_t maxLengthInBytes, // IN
                  size_t *retLength,       // OUT
                  StringEncoding encoding) // IN
{
   const char *utf8Str = (const char *)srcBuffer;
   Bool success = FALSE;
   size_t copyBytes = 0;

   encoding = Unicode_ResolveEncoding(encoding);

   switch (encoding) {
   case STRING_ENCODING_US_ASCII:
      if (!UnicodeSanityCheck(utf8Str, -1, encoding)) {
	 break;
      }
      // fall through
   case STRING_ENCODING_UTF8:
      {
         size_t len = strlen(utf8Str);
         copyBytes = MIN(len, maxLengthInBytes - 1);
         memcpy(destBuffer, utf8Str, copyBytes);

         /*
          * If we truncated, force a null termination in a UTF-8 safe
          * manner.
          */
         if (copyBytes >= len) {
	    success = TRUE;
	 } else {
            if (encoding == STRING_ENCODING_UTF8) {
               copyBytes =
                  CodeSet_Utf8FindCodePointBoundary(destBuffer, copyBytes);
            }
         }

         ((char*)destBuffer)[copyBytes] = '\0';
      }
      break;
   case STRING_ENCODING_UTF16_LE:
      {
         char *utf16Buf;
         size_t utf16BufLen;

         if (!CodeSet_Utf8ToUtf16le(utf8Str,
                                    strlen(utf8Str),
                                    &utf16Buf,
                                    &utf16BufLen)) {
	    // input should be valid UTF-8, no conversion error possible
	    ASSERT_MEM_ALLOC(FALSE);
            break;
         }
         copyBytes = MIN(utf16BufLen, maxLengthInBytes - 2);
         memcpy(destBuffer, utf16Buf, copyBytes);
         copyBytes = CodeSet_Utf16FindCodePointBoundary(destBuffer, copyBytes);
         ((utf16_t*)destBuffer)[copyBytes / 2] = 0;
         free(utf16Buf);

         if (copyBytes >= utf16BufLen) {
            success = TRUE;
         }

         break;
      }
   default:
      {
         char *currentBuf;
         size_t currentBufSize;

	 if (!CodeSet_GenericToGeneric("UTF-8", utf8Str, strlen(utf8Str),
				       Unicode_EncodingEnumToName(encoding),
				       CSGTG_NORMAL,
				       &currentBuf, &currentBufSize)) {
	    // XXX can't distinguish error cause
	    break;
	 }
         copyBytes = MIN(currentBufSize, maxLengthInBytes - 1);
         memcpy(destBuffer, currentBuf, copyBytes);
         free(currentBuf);

         /* 
          * XXX this isn't quite correct, we still need to truncate on
          * a code point boundary, based on the current encoding type,
          * rather than just null terminate blindly.
          */

         ((char*)destBuffer)[copyBytes] = 0;

         if (copyBytes >= currentBufSize) {
            success = TRUE;
         }
      }
      break;
   }

   if (retLength) {
      *retLength = copyBytes;
   }
   return success;
}