int ICUTransService::compareNIString(const XMLCh* const comp1 , const XMLCh* const comp2 , const unsigned int maxChars) { if (maxChars > 0) { // Note that this function has somewhat broken semantics, as it's // possible for two strings of different lengths to compare as equal // in a case-insensitive manner, since one character could be // represented as a surrogate pair. size_t i = 0; size_t j = 0; for(;;) { UChar32 ch1; UChar32 ch2; U16_NEXT_UNSAFE(comp1, i, ch1); U16_NEXT_UNSAFE(comp2, j, ch2); const UChar32 folded1 = u_foldCase(ch1, U_FOLD_CASE_DEFAULT); const UChar32 folded2 = u_foldCase(ch2, U_FOLD_CASE_DEFAULT); if (folded1 != folded2) { return folded1 - folded2; } else if (i == maxChars) { // If we're at the end of both strings, return 0. // Otherwise, we've run out of characters in the // left string, so return -1. return j == maxChars ? 0 : -1; } else if (j == maxChars) { // We've run out of characters in the right string, // but not the left, so return 1. return 1; } } } return 0; }
/*
 * Looks up the string s (length code units) in the reverse case folding
 * ("unfold") table of csp. On a match, every code point stored in the
 * matching row, together with that code point's own case closure, is added
 * through sa.
 *
 * The table starts with a header row holding the number of rows, the row
 * width and the string width (all in code units); the data rows follow.
 * Each data row starts with the string key (string-width units) followed by
 * the code points it unfolds to, ended by a 0 unit or by the end of the row.
 *
 * Returns TRUE if the string was found, FALSE otherwise.
 */
U_CFUNC UBool U_EXPORT2
ucase_addStringCaseClosure(const UCaseProps *csp, const UChar *s, int32_t length, const USetAdder *sa) {
    const UChar *table, *row;
    int32_t rowCount, rowWidth, keyWidth;
    int32_t lo, hi;

    /* no reverse case folding data, or no string */
    if(csp->unfold==NULL || s==NULL) {
        return FALSE;
    }

    /*
     * The string is too short to find any match.
     * More precise would be:
     *     if(!u_strHasMoreChar32Than(s, length, 1))
     * but this does not make much practical difference because
     * a single supplementary code point would just not be found.
     */
    if(length<=1) {
        return FALSE;
    }

    /* read the header row */
    table=csp->unfold;
    rowCount=table[UCASE_UNFOLD_ROWS];
    rowWidth=table[UCASE_UNFOLD_ROW_WIDTH];
    keyWidth=table[UCASE_UNFOLD_STRING_WIDTH];

    /* the string is too long to find any match */
    if(length>keyWidth) {
        return FALSE;
    }

    /* the data rows start right behind the header row */
    table+=rowWidth;

    /* binary search for the string over the half-open range [lo, hi) */
    for(lo=0, hi=rowCount; lo<hi;) {
        int32_t mid=(lo+hi)/2;
        int32_t order;

        row=table+(mid*rowWidth);
        order=strcmpMax(s, length, row, keyWidth);

        if(order<0) {
            hi=mid;
            continue;
        }
        if(order>0) {
            lo=mid+1;
            continue;
        }

        /* found the string: add each code point, and its case closure */
        {
            int32_t pos=keyWidth;

            while(pos<rowWidth && row[pos]!=0) {
                UChar32 c;

                U16_NEXT_UNSAFE(row, pos, c);
                sa->add(sa->set, c);
                ucase_addCaseClosure(csp, c, sa);
            }
        }
        return TRUE;
    }

    return FALSE; /* string not found */
}
// --------------------------------------------------------------------------- // ICUTransService: The virtual transcoding service API // --------------------------------------------------------------------------- int ICUTransService::compareIString(const XMLCh* const comp1 , const XMLCh* const comp2) { size_t i = 0; size_t j = 0; for(;;) { UChar32 ch1; UChar32 ch2; U16_NEXT_UNSAFE(comp1, i, ch1); U16_NEXT_UNSAFE(comp2, j, ch2); const UChar32 folded1 = u_foldCase(ch1, U_FOLD_CASE_DEFAULT); const UChar32 folded2 = u_foldCase(ch2, U_FOLD_CASE_DEFAULT); if (folded1 != folded2) { return folded1 - folded2; } else if (ch1 == 0) { // If ch1 is 0, the ch2 must also be // 0. Otherwise, the previous if // would have failed. break; } } return 0; }
// Converts the case of a null-terminated UTF-16 string in place.
//
//   convertString  the buffer to convert; it is rewritten and re-terminated
//   caseFunction   callable mapping one code point (UChar32) to its
//                  case-converted code point
//
// Note the semantics of this function are broken, since it's possible that
// changing the case of a string could increase its length, but there's no
// way to handle such a situation: conversion simply stops at the first code
// point that would not fit, and the string is terminated at that position.
static void doCaseConvert(XMLCh* convertString, FunctionType caseFunction)
{
    const unsigned int  unitCount = XMLString::stringLen(convertString);

    size_t  src = 0;    // next code unit to read
    size_t  dst = 0;    // next code unit to write

    for (; src < unitCount; )
    {
        // Get the next Unicode code point and convert it.
        UChar32 codePoint;
        U16_NEXT_UNSAFE(convertString, src, codePoint);

        const UChar32   mapped = caseFunction(codePoint);

        // A single-unit (BMP) code point that maps to a two-unit
        // (supplementary) one needs an extra slot in the buffer.
        const bool  growsToPair = U_IS_BMP(codePoint) && !U_IS_BMP(mapped);

        if (growsToPair && src - dst == 1)
        {
            // Only the slot just read is free, so writing the pair would
            // overwrite the next unread character. Just stop.
            break;
        }

        U16_APPEND_UNSAFE(convertString, dst, mapped);
    }

    convertString[dst] = 0;
}
static void TestNextPrevChar(){ static UChar input[]={0x0061, 0xd800, 0xdc00, 0xdbff, 0xdfff, 0x0062, 0xd841, 0xd7ff, 0xd841, 0xdc41, 0xdc00, 0x0000}; static UChar32 result[]={ /*next_unsafe next_safe_ns next_safe_s prev_unsafe prev_safe_ns prev_safe_s*/ 0x0061, 0x0061, 0x0061, 0x0000, 0x0000, 0x0000, 0x10000, 0x10000, 0x10000, 0x120400, 0xdc00, UTF_ERROR_VALUE, 0xdc00, 0xdc00, UTF_ERROR_VALUE, 0x20441, 0x20441, 0x20441, 0x10ffff, 0x10ffff, 0x10ffff, 0xd841, 0xd841, UTF_ERROR_VALUE, 0xdfff, 0xdfff, UTF_ERROR_VALUE, 0xd7ff, 0xd7ff, 0xd7ff, 0x0062, 0x0062, 0x0062, 0xd841, 0xd841, UTF_ERROR_VALUE, 0x1ffff, 0xd841, UTF_ERROR_VALUE, 0x0062, 0x0062, 0x0062, 0xd7ff, 0xd7ff, 0xd7ff, 0x10ffff, 0x10ffff, 0x10ffff, 0x20441, 0x20441, 0x20441, 0xdbff, 0xdbff, UTF_ERROR_VALUE, 0xdc41, 0xdc41, UTF_ERROR_VALUE, 0x10000, 0x10000, 0x10000, 0xdc00, 0xdc00, UTF_ERROR_VALUE, 0xd800, 0xd800, UTF_ERROR_VALUE, 0x0000, 0x0000, 0x0000, 0x0061, 0x0061, 0x0061 }; static uint16_t movedOffset[]={ /*next_unsafe next_safe_ns next_safe_s prev_unsafe prev_safe_ns prev_safe_s*/ 1, 1, 1, 11, 11, 11, 3, 3, 3, 9, 10 , 10, 3, 3, 3, 8, 8, 8, 5, 5, 4, 8, 8, 8, 5, 5, 5, 7, 7, 7, 6, 6, 6, 6, 6, 6, 8, 7, 7, 5, 5, 5, 8, 8, 8, 3, 3, 3, 10, 10, 10, 3, 3, 3, 10, 10, 10, 1, 1, 1, 11, 11, 11, 1, 1, 1, 12, 12, 12, 0, 0, 0, }; UChar32 c=0x0000; uint16_t i=0; uint16_t offset=0, setOffset=0; for(offset=0; offset<sizeof(input)/U_SIZEOF_UCHAR; offset++){ setOffset=offset; UTF16_NEXT_CHAR_UNSAFE(input, setOffset, c); if(setOffset != movedOffset[i]){ log_err("ERROR: UTF16_NEXT_CHAR_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", offset, movedOffset[i], setOffset); } if(c != result[i]){ log_err("ERROR: UTF16_NEXT_CHAR_UNSAFE failed for offset=%ld. 
Expected:%lx Got:%lx\n", offset, result[i], c); } setOffset=offset; U16_NEXT_UNSAFE(input, setOffset, c); if(setOffset != movedOffset[i]){ log_err("ERROR: U16_NEXT_CHAR_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", offset, movedOffset[i], setOffset); } if(c != result[i]){ log_err("ERROR: U16_NEXT_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c); } setOffset=offset; UTF16_NEXT_CHAR_SAFE(input, setOffset, sizeof(input)/U_SIZEOF_UCHAR, c, FALSE); if(setOffset != movedOffset[i+1]){ log_err("ERROR: UTF16_NEXT_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", offset, movedOffset[i+1], setOffset); } if(c != result[i+1]){ log_err("ERROR: UTF16_NEXT_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c); } setOffset=offset; U16_NEXT(input, setOffset, sizeof(input)/U_SIZEOF_UCHAR, c); if(setOffset != movedOffset[i+1]){ log_err("ERROR: U16_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", offset, movedOffset[i+1], setOffset); } if(c != result[i+1]){ log_err("ERROR: U16_NEXT failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c); } setOffset=offset; UTF16_NEXT_CHAR_SAFE(input, setOffset, sizeof(input)/U_SIZEOF_UCHAR, c, TRUE); if(setOffset != movedOffset[i+1]){ log_err("ERROR: UTF16_NEXT_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", offset, movedOffset[i+2], setOffset); } if(c != result[i+2]){ log_err("ERROR: UTF16_NEXT_CHAR_SAFE(strict) failed for input=%ld. 
Expected:%lx Got:%lx\n", offset, result[i+2], c); } i=(uint16_t)(i+6); } i=0; for(offset=(uint16_t)sizeof(input)/U_SIZEOF_UCHAR; offset > 0; --offset){ setOffset=offset; UTF16_PREV_CHAR_UNSAFE(input, setOffset, c); if(setOffset != movedOffset[i+3]){ log_err("ERROR: UTF16_PREV_CHAR_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", offset, movedOffset[i+3], setOffset); } if(c != result[i+3]){ log_err("ERROR: UTF16_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+3], c); } setOffset=offset; U16_PREV_UNSAFE(input, setOffset, c); if(setOffset != movedOffset[i+3]){ log_err("ERROR: U16_PREV_CHAR_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", offset, movedOffset[i+3], setOffset); } if(c != result[i+3]){ log_err("ERROR: U16_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+3], c); } setOffset=offset; UTF16_PREV_CHAR_SAFE(input, 0, setOffset, c, FALSE); if(setOffset != movedOffset[i+4]){ log_err("ERROR: UTF16_PREV_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", offset, movedOffset[i+4], setOffset); } if(c != result[i+4]){ log_err("ERROR: UTF16_PREV_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+4], c); } setOffset=offset; U16_PREV(input, 0, setOffset, c); if(setOffset != movedOffset[i+4]){ log_err("ERROR: U16_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", offset, movedOffset[i+4], setOffset); } if(c != result[i+4]){ log_err("ERROR: U16_PREV failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+4], c); } setOffset=offset; UTF16_PREV_CHAR_SAFE(input, 0, setOffset, c, TRUE); if(setOffset != movedOffset[i+5]){ log_err("ERROR: UTF16_PREV_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", offset, movedOffset[i+5], setOffset); } if(c != result[i+5]){ log_err("ERROR: UTF16_PREV_CHAR_SAFE(strict) failed for input=%ld. 
Expected:%lx Got:%lx\n", offset, result[i+5], c); } i=(uint16_t)(i+6); } }
/*
 * Adds to the set behind sa the code points and strings that are
 * case-equivalent to c: its simple case mappings, its full case folding
 * string and the contents of its closure string, as far as the properties
 * data of csp provides them.
 */
U_CFUNC void U_EXPORT2
ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa) {
    const uint16_t *slotBase, *cursor;
    const UChar *closure;
    uint16_t props, excWord;
    int32_t slot, closureLength, fullLength, foldLength, pos;
    UChar32 mapped;

    /*
     * Hardcode the case closure of i and its relatives and ignore the
     * data file data for these characters.
     * The Turkic dotless i and dotted I with their case mapping conditions
     * and case folding option make the related characters behave specially.
     * This code matches their closure behavior to their case folding behavior.
     */
    if(c==0x49) {
        /* regular i and I are in one equivalence class */
        sa->add(sa->set, 0x69);
        return;
    }
    if(c==0x69) {
        sa->add(sa->set, 0x49);
        return;
    }
    if(c==0x130) {
        /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
        sa->addString(sa->set, iDot, 2);
        return;
    }
    if(c==0x131) {
        /* dotless i is in a class by itself */
        return;
    }

    /* otherwise use the data file data */
    props=UTRIE2_GET16(&csp->trie, c);

    if(!PROPS_HAS_EXCEPTION(props)) {
        if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
            /* add the one simple case mapping, no matter what type it is */
            int32_t delta=UCASE_GET_DELTA(props);
            if(delta!=0) {
                sa->add(sa->set, c+delta);
            }
        }
        return;
    }

    /*
     * c has exceptions, so there may be multiple simple and/or
     * full case mappings. Add them all.
     */
    cursor=GET_EXCEPTIONS(csp, props);
    excWord=*cursor++;
    slotBase=cursor; /* every slot lookup restarts from here */

    /* add all simple case mappings */
    for(slot=UCASE_EXC_LOWER; slot<=UCASE_EXC_TITLE; ++slot) {
        if(!HAS_SLOT(excWord, slot)) {
            continue;
        }
        cursor=slotBase;
        GET_SLOT_VALUE(excWord, slot, cursor, mapped);
        sa->add(sa->set, mapped);
    }

    /* get the closure string pointer & length */
    closureLength=0;
    closure=NULL;
    if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
        cursor=slotBase;
        GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, cursor, closureLength);
        closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
        closure=(const UChar *)cursor+1; /* behind this slot, unless there are full case mappings */
    }

    /* add the full case folding */
    if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
        cursor=slotBase;
        GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, cursor, fullLength);

        /* start of full case mapping strings */
        ++cursor;

        fullLength&=0xffff; /* bits 16 and higher are reserved */

        /* skip the lowercase result string */
        cursor+=fullLength&UCASE_FULL_LOWER;
        fullLength>>=4;

        /* add the full case folding string */
        foldLength=fullLength&0xf;
        if(foldLength!=0) {
            sa->addString(sa->set, (const UChar *)cursor, foldLength);
            cursor+=foldLength;
        }

        /* skip the uppercase and titlecase strings */
        fullLength>>=4;
        cursor+=fullLength&0xf;
        fullLength>>=4;
        cursor+=fullLength;

        closure=(const UChar *)cursor; /* behind full case mappings */
    }

    /* add each code point in the closure string */
    pos=0;
    while(pos<closureLength) {
        U16_NEXT_UNSAFE(closure, pos, mapped);
        sa->add(sa->set, mapped);
    }
}