static Unicode
UnicodeTrimInternal(ConstUnicode str,      // IN
                    UnicodeTrimSide side)  // IN
{
   Unicode trimmed;
   utf16_t *utf16;
   utf16_t *utf16Start;
   utf16_t *utf16End;

   ASSERT(str);

   utf16 = Unicode_GetAllocBytes(str, STRING_ENCODING_UTF16);
   utf16Start = utf16;
   utf16End = utf16 + Unicode_UTF16Strlen(utf16);

   if (side & UNICODE_TRIMLEFT) {
      while (utf16Start != utf16End && UnicodeSimpleIsWhiteSpace(*utf16Start)) {
         utf16Start++;
      }
   }

   if (side & UNICODE_TRIMRIGHT) {
      while (utf16End != utf16Start && UnicodeSimpleIsWhiteSpace(*(utf16End - 1))) {
         utf16End--;
      }
   }

   *utf16End = 0;

   trimmed = Unicode_AllocWithUTF16(utf16Start);
   free(utf16);

   return trimmed;
}
Exemple #2
0
utf16_t *
Unicode_UTF16Strdup(const utf16_t *utf16) // IN: May be NULL.
{
   utf16_t *copy;
   ssize_t numBytes;

   // Follow Util_SafeStrdup semantics.
   if (utf16 == NULL) {
      return NULL;
   }

   numBytes = (Unicode_UTF16Strlen(utf16) + 1 /* NUL */) * sizeof *copy;
   copy = Util_SafeMalloc(numBytes);
   memcpy(copy, utf16, numBytes);

   return copy;
}
int
Unicode_CompareRange(ConstUnicode str1,       // IN
                     UnicodeIndex str1Start,  // IN
                     UnicodeIndex str1Length, // IN
                     ConstUnicode str2,       // IN
                     UnicodeIndex str2Start,  // IN
                     UnicodeIndex str2Length, // IN
                     Bool ignoreCase)         // IN
{
   int result = -1;
   Unicode substr1 = NULL;
   Unicode substr2 = NULL;
   utf16_t *substr1UTF16 = NULL;
   utf16_t *substr2UTF16 = NULL;
   UnicodeIndex i = 0;
   UnicodeIndex utf16Index;
   utf16_t codeUnit1;
   utf16_t codeUnit2;
   uint32 codePoint1;
   uint32 codePoint2;

   UnicodePinIndices(str1, &str1Start, &str1Length);
   UnicodePinIndices(str2, &str2Start, &str2Length);

   /*
    * TODO: Allocating substrings is a performance hit.  We should do
    * this search in-place.  (However, searching UTF-8 requires tender loving
    * care, and it's just easier to search UTF-16.)
    */
   substr1 = Unicode_Substr(str1, str1Start, str1Length);
   if (!substr1) {
      goto out;
   }

   substr2 = Unicode_Substr(str2, str2Start, str2Length);
   if (!substr2) {
      goto out;
   }

   /*
    * XXX TODO: Need to normalize the incoming strings to NFC or NFD.
    */
   substr1UTF16 = Unicode_GetAllocUTF16(substr1);
   if (!substr1UTF16) {
      goto out;
   }

   substr2UTF16 = Unicode_GetAllocUTF16(substr2);
   if (!substr2UTF16) {
      goto out;
   }

   /*
    * TODO: This is the naive string search algorithm, which is
    * O(n * m).  We can do better with KMP or Boyer-Moore if this
    * proves to be a bottleneck.
    */
   while (TRUE) {
      codeUnit1 = *(substr1UTF16 + i);
      codeUnit2 = *(substr2UTF16 + i);

      /*
       * TODO: Simple case folding doesn't handle the situation where
       * more than one code unit is needed to store the result of the
       * case folding.
       *
       * This means that German "straBe" (where B = sharp S, U+00DF)
       * will not match "STRASSE", even though the two strings are the
       * same.
       */
      if (ignoreCase) {
         codeUnit1 = UnicodeSimpleCaseFold(codeUnit1);
         codeUnit2 = UnicodeSimpleCaseFold(codeUnit2);
      }

      if (codeUnit1 != codeUnit2) {
         break;
      }

      if (codeUnit1 == 0) {
         // End of both strings reached: strings are equal.
         result = 0;
         goto out;
      }

      i++;
   }

   /*
    * The two UTF-16 code units differ.  If they're the first code unit
    * of a surrogate pair (for Unicode values past U+FFFF), decode the
    * surrogate pair into a full Unicode code point.
    */
   if (U16_IS_SURROGATE(codeUnit1)) {
      ssize_t substrUTF16Len = Unicode_UTF16Strlen(substr1UTF16);

      // U16_NEXT modifies the index, so let it work on a copy.
      utf16Index = i;

      // Decode the surrogate if needed.
      U16_NEXT(substr1UTF16, utf16Index, substrUTF16Len, codePoint1);
   } else {
      // Not a surrogate?  Then the code point value is the code unit.
      codePoint1 = codeUnit1;
   }

   if (U16_IS_SURROGATE(codeUnit2)) {
      ssize_t substrUTF16Len = Unicode_UTF16Strlen(substr2UTF16);

      utf16Index = i;
      U16_NEXT(substr2UTF16, utf16Index, substrUTF16Len, codePoint2);
   } else {
      codePoint2 = codeUnit2;
   }

   if (codePoint1 < codePoint2) {
      result = -1;
   } else if (codePoint1 > codePoint2) {
      result = 1;
   } else {
      // If we hit the end of the string, we've already gone to 'out'.
      NOT_REACHED();
   }

  out:
   free(substr1UTF16);
   free(substr2UTF16);

   Unicode_Free(substr1);
   Unicode_Free(substr2);

   return result;
}