int
Unicode_CompareRange(const char *str1,         // IN:
                     UnicodeIndex str1Start,   // IN:
                     UnicodeIndex str1Length,  // IN:
                     const char *str2,         // IN:
                     UnicodeIndex str2Start,   // IN:
                     UnicodeIndex str2Length,  // IN:
                     Bool ignoreCase)          // IN:
{
   int result = -1;
   char *substr1 = NULL;
   char *substr2 = NULL;
   utf16_t *substr1UTF16 = NULL;
   utf16_t *substr2UTF16 = NULL;
   UnicodeIndex i = 0;
   UnicodeIndex utf16Index;
   utf16_t codeUnit1;
   utf16_t codeUnit2;
   uint32 codePoint1;
   uint32 codePoint2;

   /*
    * TODO: Allocating substrings is a performance hit.  We should do this
    * search in-place.  (However, searching UTF-8 requires tender loving
    * care, and it's just easier to search UTF-16.)
    */

   substr1 = Unicode_Substr(str1, str1Start, str1Length);
   if (!substr1) {
      goto out;
   }

   substr2 = Unicode_Substr(str2, str2Start, str2Length);
   if (!substr2) {
      goto out;
   }

   /*
    * XXX TODO: Need to normalize the incoming strings to NFC or NFD.
    */

   substr1UTF16 = Unicode_GetAllocUTF16(substr1);
   if (!substr1UTF16) {
      goto out;
   }

   substr2UTF16 = Unicode_GetAllocUTF16(substr2);
   if (!substr2UTF16) {
      goto out;
   }

   /*
    * TODO: This is the naive string search algorithm, which is O(n * m). We
    * can do better with KMP or Boyer-Moore if this proves to be a bottleneck.
    */

   while (TRUE) {
      codeUnit1 = *(substr1UTF16 + i);
      codeUnit2 = *(substr2UTF16 + i);

      /*
       * TODO: Simple case folding doesn't handle the situation where more
       * than one code unit is needed to store the result of the case folding.
       *
       * This means that German "straBe" (where B = sharp S, U+00DF) will not
       * match "STRASSE", even though the two strings are the same.
       */

      if (ignoreCase) {
         codeUnit1 = UnicodeSimpleCaseFold(codeUnit1);
         codeUnit2 = UnicodeSimpleCaseFold(codeUnit2);
      }

      if (codeUnit1 != codeUnit2) {
         break;
      }

      if (codeUnit1 == 0) {
         // End of both strings reached: strings are equal.
         result = 0;
         goto out;
      }

      i++;
   }

   /*
    * The two UTF-16 code units differ. If they're the first code unit of a
    * surrogate pair (for Unicode values past U+FFFF), decode the surrogate
    * pair into a full Unicode code point.
    */

   if (U16_IS_SURROGATE(codeUnit1)) {
      ssize_t substrUTF16Len = Unicode_UTF16Strlen(substr1UTF16);

      // U16_NEXT modifies the index, so let it work on a copy.
      utf16Index = i;

      // Decode the surrogate if needed.
      U16_NEXT(substr1UTF16, utf16Index, substrUTF16Len, codePoint1);
   } else {
      // Not a surrogate?  Then the code point value is the code unit.
      codePoint1 = codeUnit1;
   }

   if (U16_IS_SURROGATE(codeUnit2)) {
      ssize_t substrUTF16Len = Unicode_UTF16Strlen(substr2UTF16);

      utf16Index = i;
      U16_NEXT(substr2UTF16, utf16Index, substrUTF16Len, codePoint2);
   } else {
      codePoint2 = codeUnit2;
   }

   if (codePoint1 < codePoint2) {
      result = -1;
   } else if (codePoint1 > codePoint2) {
      result = 1;
   } else {
      // If we hit the end of the string, we've already gone to 'out'.
      NOT_REACHED();
   }

  out:
   free(substr1UTF16);
   free(substr2UTF16);

   free(substr1);
   free(substr2);

   return result;
}
U_CAPI int32_t U_EXPORT2
uspoof_check(const USpoofChecker *sc,
             const UChar *text, int32_t length,
             int32_t *position,
             UErrorCode *status) {
             
    const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    if (This == NULL) {
        return 0;
    }
    if (length < -1) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    if (length == -1) {
        // It's not worth the bother to handle nul terminated strings everywhere.
        //   Just get the length and be done with it.
        length = u_strlen(text);
    }

    int32_t result = 0;
    int32_t failPos = 0x7fffffff;   // TODO: do we have a #define for max int32?

    // A count of the number of non-Common or inherited scripts.
    // Needed for both the SINGLE_SCRIPT and the WHOLE/MIXED_SCIRPT_CONFUSABLE tests.
    // Share the computation when possible.  scriptCount == -1 means that we haven't
    // done it yet.
    int32_t scriptCount = -1;

    if ((This->fChecks) & USPOOF_SINGLE_SCRIPT) {
        scriptCount = This->scriptScan(text, length, failPos, *status);
        // printf("scriptCount (clipped to 2) = %d\n", scriptCount);
        if ( scriptCount >= 2) {
            // Note: scriptCount == 2 covers all cases of the number of scripts >= 2
            result |= USPOOF_SINGLE_SCRIPT;
        }
    }

    if (This->fChecks & USPOOF_CHAR_LIMIT) {
        int32_t i;
        UChar32 c;
        for (i=0; i<length ;) {
            U16_NEXT(text, i, length, c);
            if (!This->fAllowedCharsSet->contains(c)) {
                result |= USPOOF_CHAR_LIMIT;
                if (i < failPos) {
                    failPos = i;
                }
                break;
            }
        }
    }

    if (This->fChecks & 
        (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_INVISIBLE)) {
        // These are the checks that need to be done on NFD input
        NFDBuffer   normalizedInput(text, length, *status);
        const UChar  *nfdText = normalizedInput.getBuffer();
        int32_t      nfdLength = normalizedInput.getLength();

        if (This->fChecks & USPOOF_INVISIBLE) {
           
            // scan for more than one occurence of the same non-spacing mark
            // in a sequence of non-spacing marks.
            int32_t     i;
            UChar32     c;
            UChar32     firstNonspacingMark = 0;
            UBool       haveMultipleMarks = FALSE;  
            UnicodeSet  marksSeenSoFar;   // Set of combining marks in a single combining sequence.
            
            for (i=0; i<nfdLength ;) {
                U16_NEXT(nfdText, i, nfdLength, c);
                if (u_charType(c) != U_NON_SPACING_MARK) {
                    firstNonspacingMark = 0;
                    if (haveMultipleMarks) {
                        marksSeenSoFar.clear();
                        haveMultipleMarks = FALSE;
                    }
                    continue;
                }
                if (firstNonspacingMark == 0) {
                    firstNonspacingMark = c;
                    continue;
                }
                if (!haveMultipleMarks) {
                    marksSeenSoFar.add(firstNonspacingMark);
                    haveMultipleMarks = TRUE;
                }
                if (marksSeenSoFar.contains(c)) {
                    // report the error, and stop scanning.
                    // No need to find more than the first failure.
                    result |= USPOOF_INVISIBLE;
                    failPos = i;
                    // TODO: Bug 8655: failPos is the position in the NFD buffer, but what we want
                    //       to give back to our caller is a position in the original input string.
                    if (failPos > length) {
                        failPos = length;
                    }
                    break;
                }
                marksSeenSoFar.add(c);
            }
        }
       
        
        if (This->fChecks & (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE)) {
            // The basic test is the same for both whole and mixed script confusables.
            // Compute the set of scripts that every input character has a confusable in.
            // For this computation an input character is always considered to be
            //    confusable with itself in its own script.
            // If the number of such scripts is two or more, and the input consisted of
            //   characters all from a single script, we have a whole script confusable.
            //   (The two scripts will be the original script and the one that is confusable)
            // If the number of such scripts >= one, and the original input contained characters from
            //   more than one script, we have a mixed script confusable.  (We can transform
            //   some of the characters, and end up with a visually similar string all in
            //   one script.)

            if (scriptCount == -1) {
                int32_t t;
                scriptCount = This->scriptScan(text, length, t, *status);
            }
            
            ScriptSet scripts;
            This->wholeScriptCheck(nfdText, nfdLength, &scripts, *status);
            int32_t confusableScriptCount = scripts.countMembers();
            //printf("confusableScriptCount = %d\n", confusableScriptCount);
            
            if ((This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE) &&
                confusableScriptCount >= 2 &&
                scriptCount == 1) {
                result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
            }
        
            if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) &&
                confusableScriptCount >= 1 &&
                scriptCount > 1) {
                result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
            }
        }
    }
    if (position != NULL && failPos != 0x7fffffff) {
        *position = failPos;
    }
    return result;
}
U_CAPI int32_t U_EXPORT2
uspoof_getSkeleton(const USpoofChecker *sc,
                   uint32_t type,
                   const UChar *s,  int32_t length,
                   UChar *dest, int32_t destCapacity,
                   UErrorCode *status) {

    // TODO:  this function could be sped up a bit
    //        Skip the input normalization when not needed, work from callers data.
    //        Put the initial skeleton straight into the caller's destination buffer.
    //        It probably won't need normalization.
    //        But these would make the structure more complicated.  

    const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    if (U_FAILURE(*status)) {
        return 0;
    }
    if (length<-1 || destCapacity<0 || (destCapacity==0 && dest!=NULL) ||
        (type & ~(USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE)) != 0) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

   int32_t tableMask = 0;
   switch (type) {
      case 0:
        tableMask = USPOOF_ML_TABLE_FLAG;
        break;
      case USPOOF_SINGLE_SCRIPT_CONFUSABLE:
        tableMask = USPOOF_SL_TABLE_FLAG;
        break;
      case USPOOF_ANY_CASE:
        tableMask = USPOOF_MA_TABLE_FLAG;
        break;
      case USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE:
        tableMask = USPOOF_SA_TABLE_FLAG;
        break;
      default:
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    // NFD transform of the user supplied input
    
    UChar nfdStackBuf[USPOOF_STACK_BUFFER_SIZE];
    UChar *nfdInput = nfdStackBuf;
    int32_t normalizedLen = unorm_normalize(
        s, length, UNORM_NFD, 0, nfdInput, USPOOF_STACK_BUFFER_SIZE, status);
    if (*status == U_BUFFER_OVERFLOW_ERROR) {
        nfdInput = (UChar *)uprv_malloc((normalizedLen+1)*sizeof(UChar));
        if (nfdInput == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return 0;
        }
        *status = U_ZERO_ERROR;
        normalizedLen = unorm_normalize(s, length, UNORM_NFD, 0,
                                        nfdInput, normalizedLen+1, status);
    }
    if (U_FAILURE(*status)) {
        if (nfdInput != nfdStackBuf) {
            uprv_free(nfdInput);
        }
        return 0;
    }

    // buffer to hold the Unicode defined skeleton mappings for a single code point
    UChar buf[USPOOF_MAX_SKELETON_EXPANSION];

    // Apply the skeleton mapping to the NFD normalized input string
    // Accumulate the skeleton, possibly unnormalized, in a UnicodeString.
    int32_t inputIndex = 0;
    UnicodeString skelStr;
    while (inputIndex < normalizedLen) {
        UChar32 c;
        U16_NEXT(nfdInput, inputIndex, normalizedLen, c);
        int32_t replaceLen = This->confusableLookup(c, tableMask, buf);
        skelStr.append(buf, replaceLen);
    }

    if (nfdInput != nfdStackBuf) {
        uprv_free(nfdInput);
    }
    
    const UChar *result = skelStr.getBuffer();
    int32_t  resultLen  = skelStr.length();
    UChar   *normedResult = NULL;

    // Check the skeleton for NFD, normalize it if needed.
    // Unnormalized results should be very rare.
    if (!unorm_isNormalized(result, resultLen, UNORM_NFD, status)) {
        normalizedLen = unorm_normalize(result, resultLen, UNORM_NFD, 0, NULL, 0, status);
        normedResult = static_cast<UChar *>(uprv_malloc((normalizedLen+1)*sizeof(UChar)));
        if (normedResult == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return 0;
        }
        *status = U_ZERO_ERROR;
        unorm_normalize(result, resultLen, UNORM_NFD, 0, normedResult, normalizedLen+1, status);
        result = normedResult;
        resultLen = normalizedLen;
    }

    // Copy the skeleton to the caller's buffer
    if (U_SUCCESS(*status)) {
        if (destCapacity == 0 || resultLen > destCapacity) {
            *status = resultLen>destCapacity ? U_BUFFER_OVERFLOW_ERROR : U_STRING_NOT_TERMINATED_WARNING;
        } else {
            u_memcpy(dest, result, resultLen);
            if (destCapacity > resultLen) {
                dest[resultLen] = 0;
            } else {
                *status = U_STRING_NOT_TERMINATED_WARNING;
            }
        }
     }       
     uprv_free(normedResult);
     return resultLen;
}
Exemple #4
0
static void TestNextPrevChar(){

    static UChar input[]={0x0061, 0xd800, 0xdc00, 0xdbff, 0xdfff, 0x0062, 0xd841, 0xd7ff, 0xd841, 0xdc41, 0xdc00, 0x0000};
    static UChar32 result[]={
    /*next_unsafe    next_safe_ns  next_safe_s       prev_unsafe   prev_safe_ns     prev_safe_s*/
        0x0061,        0x0061,       0x0061,           0x0000,       0x0000,          0x0000,
        0x10000,       0x10000,      0x10000,          0x120400,     0xdc00,          UTF_ERROR_VALUE, 
        0xdc00,        0xdc00,       UTF_ERROR_VALUE,  0x20441,      0x20441,         0x20441,
        0x10ffff,      0x10ffff,     0x10ffff,         0xd841,       0xd841,          UTF_ERROR_VALUE, 
        0xdfff,        0xdfff,       UTF_ERROR_VALUE,  0xd7ff,       0xd7ff,          0xd7ff,   
        0x0062,        0x0062,       0x0062,           0xd841,       0xd841,          UTF_ERROR_VALUE,     
        0x1ffff,       0xd841,       UTF_ERROR_VALUE,  0x0062,       0x0062,          0x0062,
        0xd7ff,        0xd7ff,       0xd7ff,           0x10ffff,     0x10ffff,        0x10ffff,
        0x20441,       0x20441,      0x20441,          0xdbff,       0xdbff,          UTF_ERROR_VALUE,      
        0xdc41,        0xdc41,       UTF_ERROR_VALUE,  0x10000,      0x10000,         0x10000,
        0xdc00,        0xdc00,       UTF_ERROR_VALUE,  0xd800,       0xd800,          UTF_ERROR_VALUE,
        0x0000,        0x0000,       0x0000,           0x0061,       0x0061,          0x0061
    };
    static uint16_t movedOffset[]={
   /*next_unsafe    next_safe_ns  next_safe_s       prev_unsafe   prev_safe_ns     prev_safe_s*/
        1,            1,           1,                11,           11,               11,
        3,            3,           3,                9,            10 ,              10, 
        3,            3,           3,                8,            8,                8,  
        5,            5,           4,                8,            8,                8, 
        5,            5,           5,                7,            7,                7,
        6,            6,           6,                6,            6,                6,
        8,            7,           7,                5,            5,                5,
        8,            8,           8,                3,            3,                3, 
        10,           10,          10,               3,            3,                3,         
        10,           10,          10,               1,            1,                1, 
        11,           11,          11,               1,            1,                1, 
        12,           12,          12,               0,            0,                0, 
    };
      

    UChar32 c=0x0000;
    uint16_t i=0;
    uint16_t offset=0, setOffset=0;
    for(offset=0; offset<sizeof(input)/U_SIZEOF_UCHAR; offset++){
         setOffset=offset;
         UTF16_NEXT_CHAR_UNSAFE(input, setOffset, c);
         if(setOffset != movedOffset[i]){
             log_err("ERROR: UTF16_NEXT_CHAR_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
                 offset, movedOffset[i], setOffset);
         }
         if(c != result[i]){
             log_err("ERROR: UTF16_NEXT_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
         }

         setOffset=offset;
         U16_NEXT_UNSAFE(input, setOffset, c);
         if(setOffset != movedOffset[i]){
             log_err("ERROR: U16_NEXT_CHAR_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
                 offset, movedOffset[i], setOffset);
         }
         if(c != result[i]){
             log_err("ERROR: U16_NEXT_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
         }

         setOffset=offset;
         UTF16_NEXT_CHAR_SAFE(input, setOffset, sizeof(input)/U_SIZEOF_UCHAR, c, FALSE);
         if(setOffset != movedOffset[i+1]){
             log_err("ERROR: UTF16_NEXT_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
                 offset, movedOffset[i+1], setOffset);
         }
         if(c != result[i+1]){
             log_err("ERROR: UTF16_NEXT_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
         }

         setOffset=offset;
         U16_NEXT(input, setOffset, sizeof(input)/U_SIZEOF_UCHAR, c);
         if(setOffset != movedOffset[i+1]){
             log_err("ERROR: U16_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
                 offset, movedOffset[i+1], setOffset);
         }
         if(c != result[i+1]){
             log_err("ERROR: U16_NEXT failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
         }

         setOffset=offset;
         UTF16_NEXT_CHAR_SAFE(input, setOffset, sizeof(input)/U_SIZEOF_UCHAR, c, TRUE);
         if(setOffset != movedOffset[i+1]){
             log_err("ERROR: UTF16_NEXT_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
                 offset, movedOffset[i+2], setOffset);
         }
         if(c != result[i+2]){
             log_err("ERROR: UTF16_NEXT_CHAR_SAFE(strict) failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+2], c);
         }

         i=(uint16_t)(i+6);
    }
    i=0;
    for(offset=(uint16_t)sizeof(input)/U_SIZEOF_UCHAR; offset > 0; --offset){
         setOffset=offset;
         UTF16_PREV_CHAR_UNSAFE(input, setOffset, c);
         if(setOffset != movedOffset[i+3]){
             log_err("ERROR: UTF16_PREV_CHAR_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
                 offset, movedOffset[i+3], setOffset);
         }
         if(c != result[i+3]){
             log_err("ERROR: UTF16_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+3], c);
         }

         setOffset=offset;
         U16_PREV_UNSAFE(input, setOffset, c);
         if(setOffset != movedOffset[i+3]){
             log_err("ERROR: U16_PREV_CHAR_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
                 offset, movedOffset[i+3], setOffset);
         }
         if(c != result[i+3]){
             log_err("ERROR: U16_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+3], c);
         }

         setOffset=offset;
         UTF16_PREV_CHAR_SAFE(input, 0, setOffset, c, FALSE);
         if(setOffset != movedOffset[i+4]){
             log_err("ERROR: UTF16_PREV_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
                 offset, movedOffset[i+4], setOffset);
         }
         if(c != result[i+4]){
             log_err("ERROR: UTF16_PREV_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+4], c);
         }

         setOffset=offset;
         U16_PREV(input, 0, setOffset, c);
         if(setOffset != movedOffset[i+4]){
             log_err("ERROR: U16_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
                 offset, movedOffset[i+4], setOffset);
         }
         if(c != result[i+4]){
             log_err("ERROR: U16_PREV failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+4], c);
         }

         setOffset=offset;
         UTF16_PREV_CHAR_SAFE(input, 0,  setOffset, c, TRUE);
         if(setOffset != movedOffset[i+5]){
             log_err("ERROR: UTF16_PREV_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
                 offset, movedOffset[i+5], setOffset);
         } 
         if(c != result[i+5]){
             log_err("ERROR: UTF16_PREV_CHAR_SAFE(strict) failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+5], c);
         }

         i=(uint16_t)(i+6);
    }

}
U_CAPI UBool U_EXPORT2
u_hasBinaryProperty(UChar32 c, UProperty which) {
    /* c is range-checked in the functions that are called from here */
    if(which<UCHAR_BINARY_START || UCHAR_BINARY_LIMIT<=which) {
        /* not a known binary property */
    } else {
        uint32_t mask=binProps[which].mask;
        int32_t column=binProps[which].column;
        if(mask!=0) {
            /* systematic, directly stored properties */
            return (u_getUnicodeProperties(c, column)&mask)!=0;
        } else {
            if(column==UPROPS_SRC_CASE) {
                return ucase_hasBinaryProperty(c, which);
            } else if(column==UPROPS_SRC_NORM) {
#if !UCONFIG_NO_NORMALIZATION
                /* normalization properties from unorm.icu */
                switch(which) {
                case UCHAR_SEGMENT_STARTER:
                    return unorm_isCanonSafeStart(c);
                default:
                    break;
                }
#endif
            } else if(column==UPROPS_SRC_NFC) {
#if !UCONFIG_NO_NORMALIZATION
                UErrorCode errorCode=U_ZERO_ERROR;
                switch(which) {
                case UCHAR_FULL_COMPOSITION_EXCLUSION: {
                    // By definition, Full_Composition_Exclusion is the same as NFC_QC=No.
                    const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
                    return U_SUCCESS(errorCode) && impl->isCompNo(impl->getNorm16(c));
                    break;
                }
                default: {
                    // UCHAR_NF[CD]_INERT properties
                    const Normalizer2 *norm2=Normalizer2Factory::getInstance(
                        (UNormalizationMode)(which-UCHAR_NFD_INERT+UNORM_NFD), errorCode);
                    return U_SUCCESS(errorCode) && norm2->isInert(c);
                }
                }
#endif
            } else if(column==UPROPS_SRC_NFKC) {
#if !UCONFIG_NO_NORMALIZATION
                // UCHAR_NFK[CD]_INERT properties
                UErrorCode errorCode=U_ZERO_ERROR;
                const Normalizer2 *norm2=Normalizer2Factory::getInstance(
                    (UNormalizationMode)(which-UCHAR_NFD_INERT+UNORM_NFD), errorCode);
                return U_SUCCESS(errorCode) && norm2->isInert(c);
#endif
            } else if(column==UPROPS_SRC_NFKC_CF) {
                // currently only for UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED
#if !UCONFIG_NO_NORMALIZATION
                UErrorCode errorCode=U_ZERO_ERROR;
                const Normalizer2Impl *kcf=Normalizer2Factory::getNFKC_CFImpl(errorCode);
                if(U_SUCCESS(errorCode)) {
                    UnicodeString src(c);
                    UnicodeString dest;
                    {
                        // The ReorderingBuffer must be in a block because its destructor
                        // needs to release dest's buffer before we look at its contents.
                        ReorderingBuffer buffer(*kcf, dest);
                        // Small destCapacity for NFKC_CF(c).
                        if(buffer.init(5, errorCode)) {
                            const UChar *srcArray=src.getBuffer();
                            kcf->compose(srcArray, srcArray+src.length(), FALSE,
                                         TRUE, buffer, errorCode);
                        }
                    }
                    return U_SUCCESS(errorCode) && dest!=src;
                }
#endif
            } else if(column==UPROPS_SRC_BIDI) {
                /* bidi/shaping properties */
                const UBiDiProps *bdp=GET_BIDI_PROPS();
                if(bdp!=NULL) {
                    switch(which) {
                    case UCHAR_BIDI_MIRRORED:
                        return ubidi_isMirrored(bdp, c);
                    case UCHAR_BIDI_CONTROL:
                        return ubidi_isBidiControl(bdp, c);
                    case UCHAR_JOIN_CONTROL:
                        return ubidi_isJoinControl(bdp, c);
                    default:
                        break;
                    }
                }
                /* else return FALSE below */
            } else if(column==UPROPS_SRC_CHAR) {
                switch(which) {
                case UCHAR_POSIX_BLANK:
                    return u_isblank(c);
                case UCHAR_POSIX_GRAPH:
                    return u_isgraphPOSIX(c);
                case UCHAR_POSIX_PRINT:
                    return u_isprintPOSIX(c);
                case UCHAR_POSIX_XDIGIT:
                    return u_isxdigit(c);
                default:
                    break;
                }
            } else if(column==UPROPS_SRC_CHAR_AND_PROPSVEC) {
                switch(which) {
                case UCHAR_POSIX_ALNUM:
                    return u_isalnumPOSIX(c);
                default:
                    break;
                }
            } else if(column==UPROPS_SRC_CASE_AND_NORM) {
#if !UCONFIG_NO_NORMALIZATION
                UChar nfdBuffer[4];
                const UChar *nfd;
                int32_t nfdLength;
                UErrorCode errorCode=U_ZERO_ERROR;
                const Normalizer2Impl *nfcImpl=Normalizer2Factory::getNFCImpl(errorCode);
                if(U_FAILURE(errorCode)) {
                    return FALSE;
                }
                switch(which) {
                case UCHAR_CHANGES_WHEN_CASEFOLDED:
                    nfd=nfcImpl->getDecomposition(c, nfdBuffer, nfdLength);
                    if(nfd!=NULL) {
                        /* c has a decomposition */
                        if(nfdLength==1) {
                            c=nfd[0];  /* single BMP code point */
                        } else if(nfdLength<=U16_MAX_LENGTH) {
                            int32_t i=0;
                            U16_NEXT(nfd, i, nfdLength, c);
                            if(i==nfdLength) {
                                /* single supplementary code point */
                            } else {
                                c=U_SENTINEL;
                            }
                        } else {
                            c=U_SENTINEL;
                        }
                    } else if(c<0) {
                        return FALSE;  /* protect against bad input */
                    }
                    errorCode=U_ZERO_ERROR;
                    if(c>=0) {
                        /* single code point */
                        const UCaseProps *csp=ucase_getSingleton(&errorCode);
                        const UChar *resultString;
                        return (UBool)(ucase_toFullFolding(csp, c, &resultString, U_FOLD_CASE_DEFAULT)>=0);
                    } else {
                        /* guess some large but stack-friendly capacity */
                        UChar dest[2*UCASE_MAX_STRING_LENGTH];
                        int32_t destLength;
                        destLength=u_strFoldCase(dest, LENGTHOF(dest), nfd, nfdLength, U_FOLD_CASE_DEFAULT, &errorCode);
                        return (UBool)(U_SUCCESS(errorCode) && 0!=u_strCompare(nfd, nfdLength, dest, destLength, FALSE));
                    }
                default:
                    break;
                }
#endif
            }
        }
    }
    return FALSE;
}
UBool
Normalizer2DataBuilder::decompose(UChar32 start, UChar32 end, uint32_t value) {
    if(norms[value].hasMapping()) {
        Norm &norm=norms[value];
        const UnicodeString &m=*norm.mapping;
        UnicodeString *decomposed=NULL;
        const UChar *s=m.getBuffer();
        int32_t length=m.length();
        int32_t prev, i=0;
        UChar32 c;
        while(i<length) {
            prev=i;
            U16_NEXT(s, i, length, c);
            if(start<=c && c<=end) {
                fprintf(stderr,
                        "gennorm2 error: U+%04lX maps to itself directly or indirectly\n",
                        (long)c);
                exit(U_INVALID_FORMAT_ERROR);
            }
            const Norm &cNorm=getNormRef(c);
            if(cNorm.hasMapping()) {
                if(norm.mappingType==Norm::ROUND_TRIP) {
                    if(prev==0) {
                        if(cNorm.mappingType!=Norm::ROUND_TRIP) {
                            fprintf(stderr,
                                    "gennorm2 error: "
                                    "U+%04lX's round-trip mapping's starter "
                                    "U+%04lX one-way-decomposes, "
                                    "not possible in Unicode normalization\n",
                                    (long)start, (long)c);
                            exit(U_INVALID_FORMAT_ERROR);
                        }
                        uint8_t myTrailCC=getCC(m.char32At(i));
                        UChar32 cTrailChar=cNorm.mapping->char32At(cNorm.mapping->length()-1);
                        uint8_t cTrailCC=getCC(cTrailChar);
                        if(cTrailCC>myTrailCC) {
                            fprintf(stderr,
                                    "gennorm2 error: "
                                    "U+%04lX's round-trip mapping's starter "
                                    "U+%04lX decomposes and the "
                                    "inner/earlier tccc=%hu > outer/following tccc=%hu, "
                                    "not possible in Unicode normalization\n",
                                    (long)start, (long)c,
                                    (short)cTrailCC, (short)myTrailCC);
                            exit(U_INVALID_FORMAT_ERROR);
                        }
                    } else {
                        fprintf(stderr,
                                "gennorm2 error: "
                                "U+%04lX's round-trip mapping's non-starter "
                                "U+%04lX decomposes, "
                                "not possible in Unicode normalization\n",
                                (long)start, (long)c);
                        exit(U_INVALID_FORMAT_ERROR);
                    }
                }
                if(decomposed==NULL) {
                    decomposed=new UnicodeString(m, 0, prev);
                }
                decomposed->append(*cNorm.mapping);
            } else if(Hangul::isHangul(c)) {
                UChar buffer[3];
                int32_t hangulLength=Hangul::decompose(c, buffer);
                if(norm.mappingType==Norm::ROUND_TRIP && prev!=0) {
                    fprintf(stderr,
                            "gennorm2 error: "
                            "U+%04lX's round-trip mapping's non-starter "
                            "U+%04lX decomposes, "
                            "not possible in Unicode normalization\n",
                            (long)start, (long)c);
                    exit(U_INVALID_FORMAT_ERROR);
                }
                if(decomposed==NULL) {
                    decomposed=new UnicodeString(m, 0, prev);
                }
                decomposed->append(buffer, hangulLength);
            } else if(decomposed!=NULL) {
                decomposed->append(m, prev, i-prev);
            }
        }
        if(decomposed!=NULL) {
            if(norm.rawMapping==NULL) {
                // Remember the original mapping when decomposing recursively.
                norm.rawMapping=norm.mapping;
            } else {
                delete norm.mapping;
            }
            norm.mapping=decomposed;
            // Not  norm.setMappingCP();  because the original mapping
            // is most likely to be encodable as a delta.
            return TRUE;
        }
    }
    return FALSE;
}
Exemple #7
0
int32_t NamePrepTransform::process( const UChar* src, int32_t srcLength, 
                                    UChar* dest, int32_t destCapacity, 
                                    UBool allowUnassigned,
                                    UParseError* parseError,
                                    UErrorCode& status ){
    // check error status
    if(U_FAILURE(status)){
        return 0;
    }
    
    //check arguments
    if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
        status=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    UnicodeString b1String;
    UChar *b1 = b1String.getBuffer(MAX_BUFFER_SIZE);
    int32_t b1Len;

    int32_t b1Index = 0;
    UCharDirection direction=U_CHAR_DIRECTION_COUNT, firstCharDir=U_CHAR_DIRECTION_COUNT;
    UBool leftToRight=FALSE, rightToLeft=FALSE;

    b1Len = map(src, srcLength, b1, b1String.getCapacity(), allowUnassigned, parseError, status);
    b1String.releaseBuffer(b1Len);

    if(status == U_BUFFER_OVERFLOW_ERROR){
        // redo processing of string
        /* we do not have enough room so grow the buffer*/
        b1 = b1String.getBuffer(b1Len);
        status = U_ZERO_ERROR; // reset error
        b1Len = map(src, srcLength, b1, b1String.getCapacity(), allowUnassigned, parseError, status);
        b1String.releaseBuffer(b1Len);
    }

    if(U_FAILURE(status)){
        b1Len = 0;
        goto CLEANUP;
    }


    for(; b1Index<b1Len; ){

        UChar32 ch = 0;

        U16_NEXT(b1, b1Index, b1Len, ch);

        if(prohibited.contains(ch) && ch!=0x0020){
            status = U_IDNA_PROHIBITED_ERROR;
            b1Len = 0;
            goto CLEANUP;
        }

        direction = u_charDirection(ch);
        if(firstCharDir==U_CHAR_DIRECTION_COUNT){
            firstCharDir = direction;
        }
        if(direction == U_LEFT_TO_RIGHT){
            leftToRight = TRUE;
        }
        if(direction == U_RIGHT_TO_LEFT || direction == U_RIGHT_TO_LEFT_ARABIC){
            rightToLeft = TRUE;
        }
    }       
    
    // satisfy 2
    if( leftToRight == TRUE && rightToLeft == TRUE){
        status = U_IDNA_CHECK_BIDI_ERROR;
        b1Len = 0;
        goto CLEANUP;
    }

    //satisfy 3
    if( rightToLeft == TRUE && 
        !((firstCharDir == U_RIGHT_TO_LEFT || firstCharDir == U_RIGHT_TO_LEFT_ARABIC) &&
          (direction == U_RIGHT_TO_LEFT || direction == U_RIGHT_TO_LEFT_ARABIC))
       ){
        status = U_IDNA_CHECK_BIDI_ERROR;
        return FALSE;
    }

    if(b1Len <= destCapacity){
        u_memmove(dest, b1, b1Len);
    }

CLEANUP:
    return u_terminateUChars(dest, destCapacity, b1Len, &status);
}
Exemple #8
0
static void
TestTable32(void) {
    static const struct {
        const char *key;
        int32_t number;
    } testcases[]={
        { "ooooooooooooooooo", 0 },
        { "oooooooooooooooo1", 1 },
        { "ooooooooooooooo1o", 2 },
        { "oo11ooo1ooo11111o", 25150 },
        { "oo11ooo1ooo111111", 25151 },
        { "o1111111111111111", 65535 },
        { "1oooooooooooooooo", 65536 },
        { "1ooooooo11o11ooo1", 65969 },
        { "1ooooooo11o11oo1o", 65970 },
        { "1ooooooo111oo1111", 65999 }
    };

    /* ### TODO UResourceBundle staticItem={ 0 }; - need to know the size */
    UResourceBundle *res, *item;
    const UChar *s;
    const char *key;
    UErrorCode errorCode;
    int32_t i, j, number, parsedNumber, length, count;

    errorCode=U_ZERO_ERROR;
    res=ures_open(loadTestData(&errorCode), "testtable32", &errorCode);
    if(U_FAILURE(errorCode)) {
        log_data_err("unable to open testdata/testtable32.res - %s\n", u_errorName(errorCode));
        return;
    }
    if(ures_getType(res)!=URES_TABLE) {
        log_data_err("testdata/testtable32.res has type %d instead of URES_TABLE\n", ures_getType(res));
    }

    count=ures_getSize(res);
    if(count!=66000) {
        log_err("testdata/testtable32.res should have 66000 entries but has %d\n", count);
    }

    /* get the items by index */
    item=NULL;
    for(i=0; i<count; ++i) {
        item=ures_getByIndex(res, i, item, &errorCode);
        if(U_FAILURE(errorCode)) {
            log_err("unable to get item %d of %d in testdata/testtable32.res - %s\n",
                    i, count, u_errorName(errorCode));
            break;
        }

        key=ures_getKey(item);
        parsedNumber=parseTable32Key(key);

        switch(ures_getType(item)) {
        case URES_STRING:
            s=ures_getString(item, &length, &errorCode);
            if(U_FAILURE(errorCode) || s==NULL) {
                log_err("unable to access the string \"%s\" at %d in testdata/testtable32.res - %s\n",
                        key, i, u_errorName(errorCode));
                number=-1;
            } else {
                j=0;
                U16_NEXT(s, j, length, number);
            }
            break;
        case URES_INT:
            number=ures_getInt(item, &errorCode);
            if(U_FAILURE(errorCode)) {
                log_err("unable to access the integer \"%s\" at %d in testdata/testtable32.res - %s\n",
                        key, i, u_errorName(errorCode));
                number=-1;
            }
            break;
        default:
            log_err("unexpected resource type %d for \"%s\" at %d in testdata/testtable32.res - %s\n",
                    ures_getType(item), key, i, u_errorName(errorCode));
            number=-1;
            break;
        }

        if(number>=0 && number!=parsedNumber) {
            log_err("\"%s\" at %d in testdata/testtable32.res has a string/int value of %d, expected %d\n",
                    key, i, number, parsedNumber);
        }
    }

    /* search for some items by key */
    for(i=0; i<LENGTHOF(testcases); ++i) {
        item=ures_getByKey(res, testcases[i].key, item, &errorCode);
        if(U_FAILURE(errorCode)) {
            log_err("unable to find the key \"%s\" in testdata/testtable32.res - %s\n",
                    testcases[i].key, u_errorName(errorCode));
            continue;
        }

        switch(ures_getType(item)) {
        case URES_STRING:
            s=ures_getString(item, &length, &errorCode);
            if(U_FAILURE(errorCode) || s==NULL) {
                log_err("unable to access the string \"%s\" in testdata/testtable32.res - %s\n",
                        testcases[i].key, u_errorName(errorCode));
                number=-1;
            } else {
                j=0;
                U16_NEXT(s, j, length, number);
            }
            break;
        case URES_INT:
            number=ures_getInt(item, &errorCode);
            if(U_FAILURE(errorCode)) {
                log_err("unable to access the integer \"%s\" in testdata/testtable32.res - %s\n",
                        testcases[i].key, u_errorName(errorCode));
                number=-1;
            }
            break;
        default:
            log_err("unexpected resource type %d for \"%s\" in testdata/testtable32.res - %s\n",
                    ures_getType(item), testcases[i].key, u_errorName(errorCode));
            number=-1;
            break;
        }

        if(number>=0 && number!=testcases[i].number) {
            log_err("\"%s\" in testdata/testtable32.res has a string/int value of %d, expected %d\n",
                    testcases[i].key, number, testcases[i].number);
        }

        key=ures_getKey(item);
        if(0!=uprv_strcmp(key, testcases[i].key)) {
            log_err("\"%s\" in testdata/testtable32.res claims to have the key \"%s\"\n",
                    testcases[i].key, key);
        }
    }

    ures_close(item);
    ures_close(res);
}
Exemple #9
0
U_CFUNC int32_t U_CALLCONV
ustrcase_internalToTitle(const UCaseMap *csm,
                         UChar *dest, int32_t destCapacity,
                         const UChar *src, int32_t srcLength,
                         UErrorCode *pErrorCode) {
    const UChar *s;
    UChar32 c;
    int32_t prev, titleStart, titleLimit, idx, destIndex, length;
    UBool isFirstIndex;

    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }

    // Use the C++ abstract base class to minimize dependencies.
    // TODO: Change UCaseMap.iter to store a BreakIterator directly.
    BreakIterator *bi=reinterpret_cast<BreakIterator *>(csm->iter);

    /* set up local variables */
    int32_t locCache=csm->locCache;
    UCaseContext csc=UCASECONTEXT_INITIALIZER;
    csc.p=(void *)src;
    csc.limit=srcLength;
    destIndex=0;
    prev=0;
    isFirstIndex=TRUE;

    /* titlecasing loop */
    while(prev<srcLength) {
        /* find next index where to titlecase */
        if(isFirstIndex) {
            isFirstIndex=FALSE;
            idx=bi->first();
        } else {
            idx=bi->next();
        }
        if(idx==UBRK_DONE || idx>srcLength) {
            idx=srcLength;
        }

        /*
         * Unicode 4 & 5 section 3.13 Default Case Operations:
         *
         * R3  toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
         * #29, "Text Boundaries." Between each pair of word boundaries, find the first
         * cased character F. If F exists, map F to default_title(F); then map each
         * subsequent character C to default_lower(C).
         *
         * In this implementation, segment [prev..index[ into 3 parts:
         * a) uncased characters (copy as-is) [prev..titleStart[
         * b) first case letter (titlecase)         [titleStart..titleLimit[
         * c) subsequent characters (lowercase)                 [titleLimit..index[
         */
        if(prev<idx) {
            /* find and copy uncased characters [prev..titleStart[ */
            titleStart=titleLimit=prev;
            U16_NEXT(src, titleLimit, idx, c);
            if((csm->options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(csm->csp, c)) {
                /* Adjust the titlecasing index (titleStart) to the next cased character. */
                for(;;) {
                    titleStart=titleLimit;
                    if(titleLimit==idx) {
                        /*
                         * only uncased characters in [prev..index[
                         * stop with titleStart==titleLimit==index
                         */
                        break;
                    }
                    U16_NEXT(src, titleLimit, idx, c);
                    if(UCASE_NONE!=ucase_getType(csm->csp, c)) {
                        break; /* cased letter at [titleStart..titleLimit[ */
                    }
                }
                length=titleStart-prev;
                if(length>0) {
                    if((destIndex+length)<=destCapacity) {
                        uprv_memcpy(dest+destIndex, src+prev, length*U_SIZEOF_UCHAR);
                    }
                    destIndex+=length;
                }
            }

            if(titleStart<titleLimit) {
                /* titlecase c which is from [titleStart..titleLimit[ */
                csc.cpStart=titleStart;
                csc.cpLimit=titleLimit;
                c=ucase_toFullTitle(csm->csp, c, utf16_caseContextIterator, &csc, &s, csm->locale, &locCache);
                destIndex=appendResult(dest, destIndex, destCapacity, c, s);

                /* Special case Dutch IJ titlecasing */
                if ( titleStart+1 < idx &&
                     ucase_getCaseLocale(csm->locale,&locCache) == UCASE_LOC_DUTCH &&
                     ( src[titleStart] == (UChar32) 0x0049 || src[titleStart] == (UChar32) 0x0069 ) &&
                     ( src[titleStart+1] == (UChar32) 0x004A || src[titleStart+1] == (UChar32) 0x006A )) {
                            c=(UChar32) 0x004A;
                            destIndex=appendResult(dest, destIndex, destCapacity, c, s);
                            titleLimit++;
                }

                /* lowercase [titleLimit..index[ */
                if(titleLimit<idx) {
                    if((csm->options&U_TITLECASE_NO_LOWERCASE)==0) {
                        /* Normal operation: Lowercase the rest of the word. */
                        destIndex+=
                            _caseMap(
                                csm, ucase_toFullLower,
                                dest+destIndex, destCapacity-destIndex,
                                src, &csc,
                                titleLimit, idx,
                                pErrorCode);
                    } else {
                        /* Optionally just copy the rest of the word unchanged. */
                        length=idx-titleLimit;
                        if((destIndex+length)<=destCapacity) {
                            uprv_memcpy(dest+destIndex, src+titleLimit, length*U_SIZEOF_UCHAR);
                        }
                        destIndex+=length;
                    }
                }
            }
        }

        prev=idx;
    }

    if(destIndex>destCapacity) {
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    }
    return destIndex;
}
Exemple #10
0
/*
 * When we have UBIDI_OUTPUT_REVERSE set on ubidi_writeReordered(), then we
 * semantically write RTL runs in reverse and later reverse them again.
 * Instead, we actually write them in forward order to begin with.
 * However, if the RTL run was to be mirrored, we need to mirror here now
 * since the implicit second reversal must not do it.
 * It looks strange to do mirroring in LTR output, but it is only because
 * we are writing RTL output in reverse.
 */
static int32_t
doWriteForward(const UChar *src, int32_t srcLength,
               UChar *dest, int32_t destSize,
               uint16_t options,
               UErrorCode *pErrorCode) {
    /* optimize for several combinations of options */
    switch(options&(UBIDI_REMOVE_BIDI_CONTROLS|UBIDI_DO_MIRRORING)) {
    case 0: {
        /* simply copy the LTR run to the destination */
        int32_t length=srcLength;
        if(destSize<length) {
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            return srcLength;
        }
        do {
            *dest++=*src++;
        } while(--length>0);
        return srcLength;
    }
    case UBIDI_DO_MIRRORING: {
        /* do mirroring */
        int32_t i=0, j=0;
        UChar32 c;

        if(destSize<srcLength) {
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            return srcLength;
        }
        do {
            U16_NEXT(src, i, srcLength, c);
            c=u_charMirror(c);
            U16_APPEND_UNSAFE(dest, j, c);
        } while(i<srcLength);
        return srcLength;
    }
    case UBIDI_REMOVE_BIDI_CONTROLS: {
        /* copy the LTR run and remove any BiDi control characters */
        int32_t remaining=destSize;
        UChar c;
        do {
            c=*src++;
            if(!IS_BIDI_CONTROL_CHAR(c)) {
                if(--remaining<0) {
                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

                    /* preflight the length */
                    while(--srcLength>0) {
                        c=*src++;
                        if(!IS_BIDI_CONTROL_CHAR(c)) {
                            --remaining;
                        }
                    }
                    return destSize-remaining;
                }
                *dest++=c;
            }
        } while(--srcLength>0);
        return destSize-remaining;
    }
    default: {
        /* remove BiDi control characters and do mirroring */
        int32_t remaining=destSize;
        int32_t i, j=0;
        UChar32 c;
        do {
            i=0;
            U16_NEXT(src, i, srcLength, c);
            src+=i;
            srcLength-=i;
            if(!IS_BIDI_CONTROL_CHAR(c)) {
                remaining-=i;
                if(remaining<0) {
                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

                    /* preflight the length */
                    while(srcLength>0) {
                        c=*src++;
                        if(!IS_BIDI_CONTROL_CHAR(c)) {
                            --remaining;
                        }
                        --srcLength;
                    }
                    return destSize-remaining;
                }
                c=u_charMirror(c);
                U16_APPEND_UNSAFE(dest, j, c);
            }
        } while(srcLength>0);
        return j;
    }
    } /* end of switch */
}
Exemple #11
0
static int icuNext(
  sqlite3_tokenizer_cursor *pCursor,  
  const char **ppToken,               
  int *pnBytes,                       
  int *piStartOffset,                 
  int *piEndOffset,                   
  int *piPosition                     
){
  IcuCursor *pCsr = (IcuCursor *)pCursor;

  int iStart = 0;
  int iEnd = 0;
  int nByte = 0;

  while( iStart==iEnd ){
    UChar32 c;

    iStart = ubrk_current(pCsr->pIter);
    iEnd = ubrk_next(pCsr->pIter);
    if( iEnd==UBRK_DONE ){
      return SQLITE_DONE;
    }

    while( iStart<iEnd ){
      int iWhite = iStart;
      U16_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c);
      if( u_isspace(c) ){
        iStart = iWhite;
      }else{
        break;
      }
    }
    assert(iStart<=iEnd);
  }

  do {
    UErrorCode status = U_ZERO_ERROR;
    if( nByte ){
      char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);
      if( !zNew ){
        return SQLITE_NOMEM;
      }
      pCsr->zBuffer = zNew;
      pCsr->nBuffer = nByte;
    }

    u_strToUTF8(
        pCsr->zBuffer, pCsr->nBuffer, &nByte,    
        &pCsr->aChar[iStart], iEnd-iStart,       
        &status                                  
    );
  } while( nByte>pCsr->nBuffer );

  *ppToken = pCsr->zBuffer;
  *pnBytes = nByte;
  *piStartOffset = pCsr->aOffset[iStart];
  *piEndOffset = pCsr->aOffset[iEnd];
  *piPosition = pCsr->iToken++;

  return SQLITE_OK;
}
/**
 * Greek string uppercasing with a state machine.
 * Probably simpler than a stateless function that has to figure out complex context-before
 * for each character.
 * TODO: Try to re-consolidate one way or another with the non-Greek function.
 */
int32_t toUpper(const UCaseMap *csm,
                UChar *dest, int32_t destCapacity,
                const UChar *src, int32_t srcLength,
                UErrorCode *pErrorCode) {
    int32_t locCache = UCASE_LOC_GREEK;
    int32_t destIndex=0;
    uint32_t state = 0;
    for (int32_t i = 0; i < srcLength;) {
        int32_t nextIndex = i;
        UChar32 c;
        U16_NEXT(src, nextIndex, srcLength, c);
        uint32_t nextState = 0;
        int32_t type = ucase_getTypeOrIgnorable(csm->csp, c);
        if ((type & UCASE_IGNORABLE) != 0) {
            // c is case-ignorable
            nextState |= (state & AFTER_CASED);
        } else if (type != UCASE_NONE) {
            // c is cased
            nextState |= AFTER_CASED;
        }
        uint32_t data = getLetterData(c);
        if (data > 0) {
            uint32_t upper = data & UPPER_MASK;
            // Add a dialytika to this iota or ypsilon vowel
            // if we removed a tonos from the previous vowel,
            // and that previous vowel did not also have (or gain) a dialytika.
            // Adding one only to the final vowel in a longer sequence
            // (which does not occur in normal writing) would require lookahead.
            // Set the same flag as for preserving an existing dialytika.
            if ((data & HAS_VOWEL) != 0 && (state & AFTER_VOWEL_WITH_ACCENT) != 0 &&
                    (upper == 0x399 || upper == 0x3A5)) {
                data |= HAS_DIALYTIKA;
            }
            int32_t numYpogegrammeni = 0;  // Map each one to a trailing, spacing, capital iota.
            if ((data & HAS_YPOGEGRAMMENI) != 0) {
                numYpogegrammeni = 1;
            }
            // Skip combining diacritics after this Greek letter.
            while (nextIndex < srcLength) {
                uint32_t diacriticData = getDiacriticData(src[nextIndex]);
                if (diacriticData != 0) {
                    data |= diacriticData;
                    if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
                        ++numYpogegrammeni;
                    }
                    ++nextIndex;
                } else {
                    break;  // not a Greek diacritic
                }
            }
            if ((data & HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA) == HAS_VOWEL_AND_ACCENT) {
                nextState |= AFTER_VOWEL_WITH_ACCENT;
            }
            // Map according to Greek rules.
            UBool addTonos = FALSE;
            if (upper == 0x397 &&
                    (data & HAS_ACCENT) != 0 &&
                    numYpogegrammeni == 0 &&
                    (state & AFTER_CASED) == 0 &&
                    !isFollowedByCasedLetter(csm->csp, src, nextIndex, srcLength)) {
                // Keep disjunctive "or" with (only) a tonos.
                // We use the same "word boundary" conditions as for the Final_Sigma test.
                if (i == nextIndex) {
                    upper = 0x389;  // Preserve the precomposed form.
                } else {
                    addTonos = TRUE;
                }
            } else if ((data & HAS_DIALYTIKA) != 0) {
                // Preserve a vowel with dialytika in precomposed form if it exists.
                if (upper == 0x399) {
                    upper = 0x3AA;
                    data &= ~HAS_EITHER_DIALYTIKA;
                } else if (upper == 0x3A5) {
                    upper = 0x3AB;
                    data &= ~HAS_EITHER_DIALYTIKA;
                }
            }
            destIndex=appendUChar(dest, destIndex, destCapacity, (UChar)upper);
            if (destIndex >= 0 && (data & HAS_EITHER_DIALYTIKA) != 0) {
                destIndex=appendUChar(dest, destIndex, destCapacity, 0x308);  // restore or add a dialytika
            }
            if (destIndex >= 0 && addTonos) {
                destIndex=appendUChar(dest, destIndex, destCapacity, 0x301);
            }
            while (destIndex >= 0 && numYpogegrammeni > 0) {
                destIndex=appendUChar(dest, destIndex, destCapacity, 0x399);
                --numYpogegrammeni;
            }
            if(destIndex<0) {
                *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
                return 0;
            }
        } else {
            const UChar *s;
            UChar32 c2 = 0;
            c=ucase_toFullUpper(csm->csp, c, NULL, NULL, &s, csm->locale, &locCache);
            if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0xffff : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0xffff)) {
                /* fast path version of appendResult() for BMP results */
                dest[destIndex++]=(UChar)c2;
            } else {
                destIndex=appendResult(dest, destIndex, destCapacity, c, s);
                if(destIndex<0) {
                    *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
                    return 0;
                }
            }
        }
        i = nextIndex;
        state = nextState;
    }

    if(destIndex>destCapacity) {
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    }
    return destIndex;
}
// FIXME:
//  - Handle 'Inherited', 'Common' and 'Unknown'
//    (see http://www.unicode.org/reports/tr24/#Usage_Model )
//    For 'Inherited' and 'Common', perhaps we need to
//    accept another parameter indicating the previous family
//    and just return it.
//  - All the characters (or characters up to the point a single
//    font can cover) need to be taken into account
const UChar* getFallbackFamily(const UChar* characters,
                               int length,
                               FontDescription::GenericFamilyType generic,
                               UChar32* charChecked,
                               UScriptCode* scriptChecked)
{
    ASSERT(characters && characters[0] && length > 0);
    UScriptCode script = USCRIPT_COMMON;

    // Sometimes characters common to script (e.g. space) is at
    // the beginning of a string so that we need to skip them
    // to get a font required to render the string.
    int i = 0;
    UChar32 ucs4 = 0;
    while (i < length && script == USCRIPT_COMMON || script == USCRIPT_INVALID_CODE) {
        U16_NEXT(characters, i, length, ucs4);
        UErrorCode err = U_ZERO_ERROR;
        script = uscript_getScript(ucs4, &err);
        // silently ignore the error
    }

    // For the full-width ASCII characters (U+FF00 - U+FF5E), use the font for
    // Han (determined in a locale-dependent way above). Full-width ASCII
    // characters are rather widely used in Japanese and Chinese documents and
    // they're fully covered by Chinese, Japanese and Korean fonts.
    if (0xFF00 < ucs4 && ucs4 < 0xFF5F)
        script = USCRIPT_HAN;

    // There are a lot of characters in USCRIPT_COMMON that can be covered
    // by fonts for scripts closely related to them. See
    // http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:Script=Common:]
    // FIXME: make this more efficient with a wider coverage
    if (script == USCRIPT_COMMON || script == USCRIPT_INHERITED) {
        UBlockCode block = ublock_getCode(ucs4);
        switch (block) {
        case UBLOCK_BASIC_LATIN:
            script = USCRIPT_LATIN;
            break;
        case UBLOCK_CJK_SYMBOLS_AND_PUNCTUATION:
            script = USCRIPT_HAN;
            break;
        case UBLOCK_HIRAGANA:
        case UBLOCK_KATAKANA:
            script = USCRIPT_HIRAGANA;
            break;
        case UBLOCK_ARABIC:
            script = USCRIPT_ARABIC;
            break;
        case UBLOCK_GREEK:
            script = USCRIPT_GREEK;
            break;
        case UBLOCK_DEVANAGARI:
            // For Danda and Double Danda (U+0964, U+0965), use a Devanagari
            // font for now although they're used by other scripts as well.
            // Without a context, we can't do any better.
            script = USCRIPT_DEVANAGARI;
            break;
        case UBLOCK_ARMENIAN:
            script = USCRIPT_ARMENIAN;
            break;
        case UBLOCK_GEORGIAN:
            script = USCRIPT_GEORGIAN;
            break;
        case UBLOCK_KANNADA:
            script = USCRIPT_KANNADA;
            break;
        }
    }

    // Another lame work-around to cover non-BMP characters.
    const UChar* family = getFontFamilyForScript(script, generic);
    if (!family) {
        int plane = ucs4 >> 16;
        switch (plane) {
        case 1:
            family = L"code2001";
            break;
        case 2:
            family = L"simsun-extb";
            break;
        default:
            family = L"lucida sans unicode";
        }
    }
Exemple #14
0
static void demoCaseMapInC() {
    /*
     * input=
     *   "aB<capital sigma>"
     *   "iI<small dotless i><capital dotted I> "
     *   "<sharp s> <small lig. ffi>"
     *   "<small final sigma><small sigma><capital sigma>"
     */
    static const UChar input[]={
        0x61, 0x42, 0x3a3,
        0x69, 0x49, 0x131, 0x130, 0x20,
        0xdf, 0x20, 0xfb03,
        0x3c2, 0x3c3, 0x3a3, 0
    };
    UChar buffer[32];

    UErrorCode errorCode;
    UChar32 c;
    int32_t i, j, length;
    UBool isError;

    printf("\n* demoCaseMapInC() ----------------- ***\n\n");

    /*
     * First, use simple case mapping functions which provide
     * 1:1 code point mappings without context/locale ID.
     *
     * Note that some mappings will not be "right" because some "real"
     * case mappings require context, depend on the locale ID,
     * and/or result in a change in the number of code points.
     */
    printUString("input string: ", input, -1);

    /* uppercase */
    isError=FALSE;
    for(i=j=0; j<UPRV_LENGTHOF(buffer) && !isError; /* U16_NEXT post-increments */) {
        U16_NEXT(input, i, INT32_MAX, c); /* without length because NUL-terminated */
        if(c==0) {
            break; /* stop at terminating NUL, no need to terminate buffer */
        }
        c=u_toupper(c);
        U16_APPEND(buffer, j, UPRV_LENGTHOF(buffer), c, isError);
    }
    printUString("simple-uppercased: ", buffer, j);
    /* lowercase */
    isError=FALSE;
    for(i=j=0; j<UPRV_LENGTHOF(buffer) && !isError; /* U16_NEXT post-increments */) {
        U16_NEXT(input, i, INT32_MAX, c); /* without length because NUL-terminated */
        if(c==0) {
            break; /* stop at terminating NUL, no need to terminate buffer */
        }
        c=u_tolower(c);
        U16_APPEND(buffer, j, UPRV_LENGTHOF(buffer), c, isError);
    }
    printUString("simple-lowercased: ", buffer, j);
    /* titlecase */
    isError=FALSE;
    for(i=j=0; j<UPRV_LENGTHOF(buffer) && !isError; /* U16_NEXT post-increments */) {
        U16_NEXT(input, i, INT32_MAX, c); /* without length because NUL-terminated */
        if(c==0) {
            break; /* stop at terminating NUL, no need to terminate buffer */
        }
        c=u_totitle(c);
        U16_APPEND(buffer, j, UPRV_LENGTHOF(buffer), c, isError);
    }
    printUString("simple-titlecased: ", buffer, j);
    /* case-fold/default */
    isError=FALSE;
    for(i=j=0; j<UPRV_LENGTHOF(buffer) && !isError; /* U16_NEXT post-increments */) {
        U16_NEXT(input, i, INT32_MAX, c); /* without length because NUL-terminated */
        if(c==0) {
            break; /* stop at terminating NUL, no need to terminate buffer */
        }
        c=u_foldCase(c, U_FOLD_CASE_DEFAULT);
        U16_APPEND(buffer, j, UPRV_LENGTHOF(buffer), c, isError);
    }
    printUString("simple-case-folded/default: ", buffer, j);
    /* case-fold/Turkic */
    isError=FALSE;
    for(i=j=0; j<UPRV_LENGTHOF(buffer) && !isError; /* U16_NEXT post-increments */) {
        U16_NEXT(input, i, INT32_MAX, c); /* without length because NUL-terminated */
        if(c==0) {
            break; /* stop at terminating NUL, no need to terminate buffer */
        }
        c=u_foldCase(c, U_FOLD_CASE_EXCLUDE_SPECIAL_I);
        U16_APPEND(buffer, j, UPRV_LENGTHOF(buffer), c, isError);
    }
    printUString("simple-case-folded/Turkic: ", buffer, j);

    /*
     * Second, use full case mapping functions which provide
     * 1:n code point mappings (n can be 0!) and are sensitive to context and locale ID.
     *
     * Note that lower/upper/titlecasing take a locale ID while case-folding
     * has bit flag options instead, by design of the Unicode SpecialCasing.txt UCD file.
     *
     * Also, string titlecasing requires a BreakIterator to find starts of words.
     * The sample code here passes in a NULL pointer; u_strToTitle() will open and close a default
     * titlecasing BreakIterator automatically.
     * For production code where many strings are titlecased it would be more efficient
     * to open a BreakIterator externally and pass it in.
     */
    printUString("\ninput string: ", input, -1);

    /* lowercase/English */
    errorCode=U_ZERO_ERROR;
    length=u_strToLower(buffer, UPRV_LENGTHOF(buffer), input, -1, "en", &errorCode);
    if(U_SUCCESS(errorCode)) {
        printUString("full-lowercased/en: ", buffer, length);
    } else {
        printf("error in u_strToLower(en)=%ld error=%s\n", length, u_errorName(errorCode));
    }
    /* lowercase/Turkish */
    errorCode=U_ZERO_ERROR;
    length=u_strToLower(buffer, UPRV_LENGTHOF(buffer), input, -1, "tr", &errorCode);
    if(U_SUCCESS(errorCode)) {
        printUString("full-lowercased/tr: ", buffer, length);
    } else {
        printf("error in u_strToLower(tr)=%ld error=%s\n", length, u_errorName(errorCode));
    }
    /* uppercase/English */
    errorCode=U_ZERO_ERROR;
    length=u_strToUpper(buffer, UPRV_LENGTHOF(buffer), input, -1, "en", &errorCode);
    if(U_SUCCESS(errorCode)) {
        printUString("full-uppercased/en: ", buffer, length);
    } else {
        printf("error in u_strToUpper(en)=%ld error=%s\n", length, u_errorName(errorCode));
    }
    /* uppercase/Turkish */
    errorCode=U_ZERO_ERROR;
    length=u_strToUpper(buffer, UPRV_LENGTHOF(buffer), input, -1, "tr", &errorCode);
    if(U_SUCCESS(errorCode)) {
        printUString("full-uppercased/tr: ", buffer, length);
    } else {
        printf("error in u_strToUpper(tr)=%ld error=%s\n", length, u_errorName(errorCode));
    }
    /* titlecase/English */
    errorCode=U_ZERO_ERROR;
    length=u_strToTitle(buffer, UPRV_LENGTHOF(buffer), input, -1, NULL, "en", &errorCode);
    if(U_SUCCESS(errorCode)) {
        printUString("full-titlecased/en: ", buffer, length);
    } else {
        printf("error in u_strToTitle(en)=%ld error=%s\n", length, u_errorName(errorCode));
    }
    /* titlecase/Turkish */
    errorCode=U_ZERO_ERROR;
    length=u_strToTitle(buffer, UPRV_LENGTHOF(buffer), input, -1, NULL, "tr", &errorCode);
    if(U_SUCCESS(errorCode)) {
        printUString("full-titlecased/tr: ", buffer, length);
    } else {
        printf("error in u_strToTitle(tr)=%ld error=%s\n", length, u_errorName(errorCode));
    }
    /* case-fold/default */
    errorCode=U_ZERO_ERROR;
    length=u_strFoldCase(buffer, UPRV_LENGTHOF(buffer), input, -1, U_FOLD_CASE_DEFAULT, &errorCode);
    if(U_SUCCESS(errorCode)) {
        printUString("full-case-folded/default: ", buffer, length);
    } else {
        printf("error in u_strFoldCase(default)=%ld error=%s\n", length, u_errorName(errorCode));
    }
    /* case-fold/Turkic */
    errorCode=U_ZERO_ERROR;
    length=u_strFoldCase(buffer, UPRV_LENGTHOF(buffer), input, -1, U_FOLD_CASE_EXCLUDE_SPECIAL_I, &errorCode);
    if(U_SUCCESS(errorCode)) {
        printUString("full-case-folded/Turkic: ", buffer, length);
    } else {
        printf("error in u_strFoldCase(Turkic)=%ld error=%s\n", length, u_errorName(errorCode));
    }
}
Exemple #15
0
/*
** Extract the next token from a tokenization cursor.
*/
static int icuNext(
  sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by simpleOpen */
  const char **ppToken,               /* OUT: *ppToken is the token text */
  int *pnBytes,                       /* OUT: Number of bytes in token */
  int *piStartOffset,                 /* OUT: Starting offset of token */
  int *piEndOffset,                   /* OUT: Ending offset of token */
  int *piPosition                     /* OUT: Position integer of token */
){
  IcuCursor *pCsr = (IcuCursor *)pCursor;

  int iStart = 0;
  int iEnd = 0;
  int nByte = 0;

  while( iStart==iEnd ){
    UChar32 c;

    iStart = ubrk_current(pCsr->pIter);
    iEnd = ubrk_next(pCsr->pIter);
    if( iEnd==UBRK_DONE ){
      return SQLITE_DONE;
    }

    while( iStart<iEnd ){
      int iWhite = iStart;
      U16_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c);
      if( u_isspace(c) ){
        iStart = iWhite;
      }else{
        break;
      }
    }
    assert(iStart<=iEnd);
  }

  do {
    UErrorCode status = U_ZERO_ERROR;
    if( nByte ){
      char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);
      if( !zNew ){
        return SQLITE_NOMEM;
      }
      pCsr->zBuffer = zNew;
      pCsr->nBuffer = nByte;
    }

    u_strToUTF8(
        pCsr->zBuffer, pCsr->nBuffer, &nByte,    /* Output vars */
        &pCsr->aChar[iStart], iEnd-iStart,       /* Input vars */
        &status                                  /* Output success/failure */
    );
  } while( nByte>pCsr->nBuffer );

  *ppToken = pCsr->zBuffer;
  *pnBytes = nByte;
  *piStartOffset = pCsr->aOffset[iStart];
  *piEndOffset = pCsr->aOffset[iEnd];
  *piPosition = pCsr->iToken++;

  return SQLITE_OK;
}
Exemple #16
0
/**
 * See if the decomposition of cp2 is at segment starting at segmentPos 
 * (with canonical rearrangment!)
 * If so, take the remainder, and return the equivalents 
 */
Hashtable *CanonicalIterator::extract(Hashtable *fillinResult, UChar32 comp, const UChar *segment, int32_t segLen, int32_t segmentPos, UErrorCode &status) {
//Hashtable *CanonicalIterator::extract(UChar32 comp, const UnicodeString &segment, int32_t segLen, int32_t segmentPos, UErrorCode &status) {
    //if (PROGRESS) printf(" extract: %s, ", UToS(Tr(UnicodeString(comp))));
    //if (PROGRESS) printf("%s, %i\n", UToS(Tr(segment)), segmentPos);

    if (U_FAILURE(status)) {
        return NULL;
    }

    UnicodeString temp(comp);
    int32_t inputLen=temp.length();
    UnicodeString decompString;
    nfd.normalize(temp, decompString, status);
    if (U_FAILURE(status)) {
        return NULL;
    }
    if (decompString.isBogus()) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    const UChar *decomp=decompString.getBuffer();
    int32_t decompLen=decompString.length();

    // See if it matches the start of segment (at segmentPos)
    UBool ok = FALSE;
    UChar32 cp;
    int32_t decompPos = 0;
    UChar32 decompCp;
    U16_NEXT(decomp, decompPos, decompLen, decompCp);

    int32_t i = segmentPos;
    while(i < segLen) {
        U16_NEXT(segment, i, segLen, cp);

        if (cp == decompCp) { // if equal, eat another cp from decomp

            //if (PROGRESS) printf("  matches: %s\n", UToS(Tr(UnicodeString(cp))));

            if (decompPos == decompLen) { // done, have all decomp characters!
                temp.append(segment+i, segLen-i);
                ok = TRUE;
                break;
            }
            U16_NEXT(decomp, decompPos, decompLen, decompCp);
        } else {
            //if (PROGRESS) printf("  buffer: %s\n", UToS(Tr(UnicodeString(cp))));

            // brute force approach
            temp.append(cp);

            /* TODO: optimize
            // since we know that the classes are monotonically increasing, after zero
            // e.g. 0 5 7 9 0 3
            // we can do an optimization
            // there are only a few cases that work: zero, less, same, greater
            // if both classes are the same, we fail
            // if the decomp class < the segment class, we fail

            segClass = getClass(cp);
            if (decompClass <= segClass) return null;
            */
        }
    }
    if (!ok)
        return NULL; // we failed, characters left over

    //if (PROGRESS) printf("Matches\n");

    if (inputLen == temp.length()) {
        fillinResult->put(UnicodeString(), new UnicodeString(), status);
        return fillinResult; // succeed, but no remainder
    }

    // brute force approach
    // check to make sure result is canonically equivalent
    UnicodeString trial;
    nfd.normalize(temp, trial, status);
    if(U_FAILURE(status) || trial.compare(segment+segmentPos, segLen - segmentPos) != 0) {
        return NULL;
    }

    return getEquivalents2(fillinResult, temp.getBuffer()+inputLen, temp.length()-inputLen, status);
}
void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
                                                      UBool isIncremental) const {
    // start and limit of the input range
    int32_t start = offsets.start;
    int32_t limit = offsets.limit;
    int32_t length, delta;

    if(start >= limit) {
        return;
    }

    // a C code unit iterator, implemented around the Replaceable
    UCharIterator iter;
    uiter_setReplaceable(&iter, &text);

    // the output string and buffer pointer
    UnicodeString output;
    UChar *buffer;
    UBool neededToNormalize;

    UErrorCode errorCode;

    /*
     * Normalize as short chunks at a time as possible even in
     * bulk mode, so that styled text is minimally disrupted.
     * In incremental mode, a chunk that ends with offsets.limit
     * must not be normalized.
     *
     * If it was known that the input text is not styled, then
     * a bulk mode normalization could look like this:
     *

    UChar staticChars[256];
    UnicodeString input;

    length = limit - start;
    input.setTo(staticChars, 0, sizeof(staticChars)/U_SIZEOF_UCHAR); // writable alias

    _Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
    input.releaseBuffer(length);

    UErrorCode status = U_ZERO_ERROR;
    Normalizer::normalize(input, fMode, options, output, status);

    text.handleReplaceBetween(start, limit, output);

    int32_t delta = output.length() - length;
    offsets.contextLimit += delta;
    offsets.limit += delta;
    offsets.start = limit + delta;

     *
     */
    while(start < limit) {
        // set the iterator limits for the remaining input range
        // this is a moving target because of the replacements in the text object
        iter.start = iter.index = start;
        iter.limit = limit;

        // incrementally normalize a small chunk of the input
        buffer = output.getBuffer(-1);
        errorCode = U_ZERO_ERROR;
        length = unorm_next(&iter, buffer, output.getCapacity(),
                            fMode, 0,
                            TRUE, &neededToNormalize,
                            &errorCode);
        output.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);

        if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
            // use a larger output string buffer and do it again from the start
            iter.index = start;
            buffer = output.getBuffer(length);
            errorCode = U_ZERO_ERROR;
            length = unorm_next(&iter, buffer, output.getCapacity(),
                                fMode, 0,
                                TRUE, &neededToNormalize,
                                &errorCode);
            output.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
        }

        if(U_FAILURE(errorCode)) {
            break;
        }

        limit = iter.index;
        if(isIncremental && limit == iter.limit) {
            // stop in incremental mode when we reach the input limit
            // in case there are additional characters that could change the
            // normalization result

            // UNLESS all characters in the result of the normalization of
            // the last run are in the skippable set
            const UChar *s=output.getBuffer();
            int32_t i=0, outLength=output.length();
            UChar32 c;

            while(i<outLength) {
                U16_NEXT(s, i, outLength, c);
                if(!unorm_isNFSkippable(c, fMode)) {
                    outLength=-1; // I wish C++ had labeled loops and break outer; ...
                    break;
                }
            }
            if (outLength<0) {
                break;
            }
        }

        if(neededToNormalize) {
            // replace the input chunk with its normalized form
            text.handleReplaceBetween(start, limit, output);

            // update all necessary indexes accordingly
            delta = length - (limit - start);   // length change in the text object
            start = limit += delta;             // the next chunk starts where this one ends, with adjustment
            limit = offsets.limit += delta;     // set the iteration limit to the adjusted end of the input range
            offsets.contextLimit += delta;
        } else {
            // delta == 0
            start = limit;
            limit = offsets.limit;
        }
    }

    offsets.start = start;
}
Exemple #18
0
bool HarfBuzzShaper::collectHarfBuzzRuns()
{
    const UChar* normalizedBufferEnd = m_normalizedBuffer.get() + m_normalizedBufferLength;
    SurrogatePairAwareTextIterator iterator(m_normalizedBuffer.get(), 0, m_normalizedBufferLength, m_normalizedBufferLength);
    UChar32 character;
    unsigned clusterLength = 0;
    unsigned startIndexOfCurrentRun = 0;
    if (!iterator.consume(character, clusterLength))
        return false;

    const SimpleFontData* nextFontData = m_font->glyphDataForCharacter(character, false).fontData;
    UErrorCode errorCode = U_ZERO_ERROR;
    UScriptCode nextScript = uscript_getScript(character, &errorCode);
    if (U_FAILURE(errorCode))
        return false;

    do {
        const UChar* currentCharacterPosition = iterator.characters();
        const SimpleFontData* currentFontData = nextFontData;
        UScriptCode currentScript = nextScript;

        for (iterator.advance(clusterLength); iterator.consume(character, clusterLength); iterator.advance(clusterLength)) {
            if (Font::treatAsZeroWidthSpace(character))
                continue;

            if (U_GET_GC_MASK(character) & U_GC_M_MASK) {
                int markLength = clusterLength;
                const UChar* markCharactersEnd = iterator.characters() + clusterLength;
                while (markCharactersEnd < normalizedBufferEnd) {
                    UChar32 nextCharacter;
                    int nextCharacterLength = 0;
                    U16_NEXT(markCharactersEnd, nextCharacterLength, normalizedBufferEnd - markCharactersEnd, nextCharacter);
                    if (!(U_GET_GC_MASK(nextCharacter) & U_GC_M_MASK))
                        break;
                    markLength += nextCharacterLength;
                    markCharactersEnd += nextCharacterLength;
                }

                if (currentFontData->canRenderCombiningCharacterSequence(currentCharacterPosition, markCharactersEnd - currentCharacterPosition)) {
                    clusterLength = markLength;
                    continue;
                }
                nextFontData = m_font->glyphDataForCharacter(character, false).fontData;
            } else
                nextFontData = m_font->glyphDataForCharacter(character, false).fontData;

            nextScript = uscript_getScript(character, &errorCode);
            if (U_FAILURE(errorCode))
                return false;
            if ((nextFontData != currentFontData) || ((currentScript != nextScript) && (nextScript != USCRIPT_INHERITED) && (!uscript_hasScript(character, currentScript))))
                break;
            if (nextScript == USCRIPT_INHERITED)
                nextScript = currentScript;
            currentCharacterPosition = iterator.characters();
        }
        unsigned numCharactersOfCurrentRun = iterator.currentCharacter() - startIndexOfCurrentRun;
        hb_script_t script = hb_icu_script_to_script(currentScript);
        m_harfBuzzRuns.append(HarfBuzzRun::create(currentFontData, startIndexOfCurrentRun, numCharactersOfCurrentRun, m_run.direction(), script));
        currentFontData = nextFontData;
        startIndexOfCurrentRun = iterator.currentCharacter();
    } while (iterator.consume(character, clusterLength));

    return !m_harfBuzzRuns.isEmpty();
}
Exemple #19
0
/*
 * Match each code point in a string against each code point in the matchSet.
 * Return the index of the first string code point that
 * is (polarity==TRUE) or is not (FALSE) contained in the matchSet.
 * Return -(string length)-1 if there is no such code point.
 */
static int32_t
_matchFromSet(const UChar* string, const UChar* matchSet, UBool polarity) {
    int32_t matchLen, matchBMPLen, strItr, matchItr;
    UChar32 stringCh, matchCh;
    UChar c, c2;

    /* first part of matchSet contains only BMP code points */
    matchBMPLen = 0;
    while ((c = matchSet[matchBMPLen]) != 0 && U16_IS_SINGLE(c)) {
        ++matchBMPLen;
    }

    /* second part of matchSet contains BMP and supplementary code points */
    matchLen = matchBMPLen;
    while (matchSet[matchLen] != 0) {
        ++matchLen;
    }

    for (strItr = 0; (c = string[strItr]) != 0;) {
        ++strItr;
        if (U16_IS_SINGLE(c)) {
            if (polarity) {
                for (matchItr = 0; matchItr < matchLen; ++matchItr) {
                    if (c == matchSet[matchItr]) {
                        return strItr - 1; /* one matches */
                    }
                }
            } else {
                for (matchItr = 0; matchItr < matchLen; ++matchItr) {
                    if (c == matchSet[matchItr]) {
                        goto endloop;
                    }
                }
                return strItr - 1; /* none matches */
            }
        } else {
            /*
             * No need to check for string length before U16_IS_TRAIL
             * because c2 could at worst be the terminating NUL.
             */
            if (U16_IS_SURROGATE_LEAD(c) && U16_IS_TRAIL(c2 = string[strItr])) {
                ++strItr;
                stringCh = U16_GET_SUPPLEMENTARY(c, c2);
            } else {
                stringCh = c; /* unpaired trail surrogate */
            }

            if (polarity) {
                for (matchItr = matchBMPLen; matchItr < matchLen;) {
                    U16_NEXT(matchSet, matchItr, matchLen, matchCh);
                    if (stringCh == matchCh) {
                        return strItr - U16_LENGTH(stringCh); /* one matches */
                    }
                }
            } else {
                for (matchItr = matchBMPLen; matchItr < matchLen;) {
                    U16_NEXT(matchSet, matchItr, matchLen, matchCh);
                    if (stringCh == matchCh) {
                        goto endloop;
                    }
                }
                return strItr - U16_LENGTH(stringCh); /* none matches */
            }
        }
        endloop:
        /* wish C had continue with labels like Java... */;
    }

    /* Didn't find it. */
    return -strItr - 1;
}
Exemple #20
0
/* keep this in sync with utf8tst.c's TestNulTerminated() */
static void TestNulTerminated() {
    static const UChar input[]={
        /*  0 */  0x61,
        /*  1 */  0xd801, 0xdc01,
        /*  3 */  0xdc01,
        /*  4 */  0x62,
        /*  5 */  0xd801,
        /*  6 */  0x00
        /*  7 */
    };
    static const UChar32 result[]={
        0x61,
        0x10401,
        0xdc01,
        0x62,
        0xd801,
        0
    };

    UChar32 c, c2;
    int32_t i0, i=0, j, k, expectedIndex;
    int32_t cpIndex=0;
    do {
        i0=i;
        U16_NEXT(input, i, -1, c);
        if(c!=result[cpIndex]) {
            log_err("U16_NEXT(from %d)=U+%04x != U+%04x\n", i0, c, result[cpIndex]);
        }
        j=i0;
        U16_FWD_1(input, j, -1);
        if(j!=i) {
            log_err("U16_FWD_1() moved to index %d but U16_NEXT() moved to %d\n", j, i);
        }
        ++cpIndex;
        /*
         * Move by this many code points from the start.
         * U16_FWD_N() stops at the end of the string, that is, at the NUL if necessary.
         */
        expectedIndex= (c==0) ? i-1 : i;
        k=0;
        U16_FWD_N(input, k, -1, cpIndex);
        if(k!=expectedIndex) {
            log_err("U16_FWD_N(code points from 0) moved to index %d but expected %d\n", k, expectedIndex);
        }
    } while(c!=0);

    i=0;
    do {
        j=i0=i;
        U16_NEXT(input, i, -1, c);
        do {
            U16_GET(input, 0, j, -1, c2);
            if(c2!=c) {
                log_err("U16_NEXT(from %d)=U+%04x != U+%04x=U16_GET(at %d)\n", i0, c, c2, j);
            }
            /* U16_SET_CP_LIMIT moves from a non-lead byte to the limit of the code point */
            k=j+1;
            U16_SET_CP_LIMIT(input, 0, k, -1);
            if(k!=i) {
                log_err("U16_NEXT() moved to %d but U16_SET_CP_LIMIT(%d) moved to %d\n", i, j+1, k);
            }
        } while(++j<i);
    } while(c!=0);
}