/* * Compare two strings as presented by UCharIterators. * Use code unit or code point order. * When the function returns, it is undefined where the iterators * have stopped. */ U_CAPI int32_t U_EXPORT2 u_strCompareIter(UCharIterator* iter1, UCharIterator* iter2, UBool codePointOrder) { UChar32 c1, c2; /* argument checking */ if (iter1 == NULL || iter2 == NULL) { return 0; /* bad arguments */ } if (iter1 == iter2) { return 0; /* identical iterators */ } /* reset iterators to start? */ iter1->move(iter1, 0, UITER_START); iter2->move(iter2, 0, UITER_START); /* compare identical prefixes - they do not need to be fixed up */ for (; ;) { c1 = iter1->next(iter1); c2 = iter2->next(iter2); if (c1 != c2) { break; } if (c1 == -1) { return 0; } } /* if both values are in or above the surrogate range, fix them up */ if (c1 >= 0xd800 && c2 >= 0xd800 && codePointOrder) { /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */ if ( (c1 <= 0xdbff && UTF_IS_TRAIL(iter1->current(iter1))) || (UTF_IS_TRAIL(c1) && (iter1->previous(iter1), UTF_IS_LEAD(iter1->previous(iter1)))) ) { /* part of a surrogate pair, leave >=d800 */ } else { /* BMP code point - may be surrogate code point - make <d800 */ c1 -= 0x2800; } if ( (c2 <= 0xdbff && UTF_IS_TRAIL(iter2->current(iter2))) || (UTF_IS_TRAIL(c2) && (iter2->previous(iter2), UTF_IS_LEAD(iter2->previous(iter2)))) ) { /* part of a surrogate pair, leave >=d800 */ } else { /* BMP code point - may be surrogate code point - make <d800 */ c2 -= 0x2800; } } /* now c1 and c2 are in the requested (code unit or code point) order */ return (int32_t) c1 - (int32_t) c2; }
/*
 * Count the code points in a UTF-16 string.
 * A lead surrogate immediately followed by a trail surrogate counts as one
 * code point; every other code unit counts as one code point by itself.
 *
 * s       the string; NULL yields 0
 * length  number of code units, or -1 if s is NUL-terminated;
 *         values below -1 yield 0
 */
U_CAPI int32_t U_EXPORT2
u_countChar32(const UChar *s, int32_t length) {
    int32_t total=0;

    if(s==NULL || length<-1) {
        return 0;
    }

    if(length<0) {
        /* length==-1: NUL-terminated string */
        while(*s!=0) {
            /*
             * Looking ahead by one unit is sufficient because of UTF-16,
             * and safe because at worst that unit is the terminating NUL.
             */
            if(UTF_IS_LEAD(*s) && UTF_IS_TRAIL(s[1])) {
                s+=2;
            } else {
                ++s;
            }
            ++total;
        }
    } else {
        /* explicit length: never look beyond the limit */
        const UChar *limit=s+length;

        while(s!=limit) {
            if(UTF_IS_LEAD(*s) && (s+1)!=limit && UTF_IS_TRAIL(s[1])) {
                s+=2;
            } else {
                ++s;
            }
            ++total;
        }
    }

    return total;
}
/*
 * Get the next code point (UChar32) from the stream.
 *
 * buf    the buffered stream to read from
 * error  in/out ICU error code; nothing is read if it is NULL or already
 *        indicates a failure
 *
 * Returns the code point and advances the current position past it, or
 * U_EOF if the input is exhausted, the buffer could not be refilled, or
 * error was NULL/failing on entry.
 *
 * A lead surrogate is combined with the following unit only if that unit is
 * available in the buffer and is a trail surrogate; otherwise the single
 * code unit is returned as is.
 *
 * NOTE(review): this assumes that bufLimit points just past the last valid
 * unit in the buffer and that remaining==0 means there is no more input to
 * load - confirm against the UCHARBUF definition and ucbuf_fillucbuf().
 */
U_CAPI int32_t U_EXPORT2
ucbuf_getc32(UCHARBUF* buf,UErrorCode* error){
    int32_t retVal = (int32_t)U_EOF;

    /* report "no character" rather than 0, which would look like U+0000 */
    if(error==NULL || U_FAILURE(*error)){
        return retVal;
    }

    /*
     * Fewer than two units are buffered: try to load more so that a
     * surrogate pair that straddles the buffer end can be read as a whole.
     * If nothing is left to load we continue with what the buffer still holds.
     */
    if(buf->currentPos+1>=buf->bufLimit && buf->remaining!=0){
        buf=ucbuf_fillucbuf(buf,error);
        if(U_FAILURE(*error)){
            return retVal;
        }
    }

    /* really nothing left */
    if(buf->currentPos>=buf->bufLimit){
        return retVal;
    }

    if(UTF_IS_LEAD(*(buf->currentPos)) &&
       buf->currentPos+1<buf->bufLimit &&
       UTF_IS_TRAIL(buf->currentPos[1])
    ){
        /* well-formed surrogate pair */
        retVal=UTF16_GET_PAIR_VALUE(buf->currentPos[0],buf->currentPos[1]);
        buf->currentPos+=2;
    }else{
        /* BMP code point, or an unpaired surrogate which is passed through */
        retVal = *(buf->currentPos++);
    }
    return retVal;
}
/*
 * Compare two UTF-16 strings in code unit order or in code point order.
 *
 * Three modes, selected by the arguments:
 * - length1<0 and length2<0: strcmp style, both strings are NUL-terminated
 * - otherwise, strncmpStyle!=0: strncmp style; length1 is used as the limit
 *   for both strings (length1==length2>=0 is assumed) and the comparison
 *   also stops at a NUL
 * - otherwise: memcmp/UnicodeString style with explicit lengths
 *   (a negative length is replaced with u_strlen() of that string);
 *   if one string is a prefix of the other, the result is -1, 0 or 1
 *   according to the lengths
 *
 * Returns 0 for equal strings, otherwise the difference between the first
 * two differing code units (after the code point order fix-up, if requested).
 */
U_CAPI int32_t U_EXPORT2
uprv_strCompare(const UChar *s1, int32_t length1,
                const UChar *s2, int32_t length2,
                UBool strncmpStyle, UBool codePointOrder) {
    /* the start and limit pointers are needed for the fix-up below */
    const UChar *start1=s1, *start2=s2;
    const UChar *limit1, *limit2;
    UChar c1, c2;

    /* skip the identical prefixes - they do not need to be fixed up */
    if(length1<0 && length2<0) {
        /* strcmp style, both NUL-terminated */
        if(s1==s2) {
            return 0;
        }

        c1=*s1;
        c2=*s2;
        while(c1==c2) {
            if(c1==0) {
                return 0;
            }
            c1=*++s1;
            c2=*++s2;
        }

        /* no limits for the fix-up */
        limit1=limit2=NULL;
    } else if(strncmpStyle) {
        /* strncmp style: assume length1==length2>=0 but also stop at a NUL */
        if(s1==s2) {
            return 0;
        }

        /* both lengths are the same, so only one limit needs to be checked */
        limit1=start1+length1;
        for(; s1!=limit1; ++s1, ++s2) {
            c1=*s1;
            c2=*s2;
            if(c1!=c2) {
                break;
            }
            if(c1==0) {
                return 0;
            }
        }
        if(s1==limit1) {
            return 0;
        }

        /* use length1 here, too, to enforce the assumption */
        limit2=start2+length1;
    } else {
        /* memcmp/UnicodeString style, both length-specified */
        int32_t lengthResult, minLength;

        if(length1<0) {
            length1=u_strlen(s1);
        }
        if(length2<0) {
            length2=u_strlen(s2);
        }

        /* result for the case that the shorter string is a prefix of the other */
        if(length1<length2) {
            lengthResult=-1;
            minLength=length1;
        } else {
            lengthResult= length1==length2 ? 0 : 1;
            minLength=length2;
        }

        /* pseudo-limit: limit1=start1+min(length1, length2) */
        limit1=start1+minLength;

        if(s1==s2) {
            return lengthResult;
        }

        for(; s1!=limit1; ++s1, ++s2) {
            c1=*s1;
            c2=*s2;
            if(c1!=c2) {
                break;
            }
        }
        if(s1==limit1) {
            return lengthResult;
        }

        /* real limits for the fix-up */
        limit1=start1+length1;
        limit2=start2+length2;
    }

    /* fix-up is only needed if both values are in or above the surrogate range */
    if(codePointOrder && c1>=0xd800 && c2>=0xd800) {
        /*
         * Subtract 0x2800 from BMP code points (including unpaired surrogates)
         * to make them smaller than units that are part of a surrogate pair.
         */
        if(c1<=0xdbff && (s1+1)!=limit1 && UTF_IS_TRAIL(s1[1])) {
            /* lead surrogate of a pair, leave >=d800 */
        } else if(UTF_IS_TRAIL(c1) && s1!=start1 && UTF_IS_LEAD(s1[-1])) {
            /* trail surrogate of a pair, leave >=d800 */
        } else {
            c1-=0x2800;
        }

        if(c2<=0xdbff && (s2+1)!=limit2 && UTF_IS_TRAIL(s2[1])) {
            /* lead surrogate of a pair, leave >=d800 */
        } else if(UTF_IS_TRAIL(c2) && s2!=start2 && UTF_IS_LEAD(s2[-1])) {
            /* trail surrogate of a pair, leave >=d800 */
        } else {
            c2-=0x2800;
        }
    }

    /* now c1 and c2 are in the requested (code unit or code point) order */
    return (int32_t)c1-(int32_t)c2;
}
void Transliterator::_transliterate(Replaceable& text, UTransPosition& index, const UnicodeString* insertion, UErrorCode &status) const { if (U_FAILURE(status)) { return; } if (!positionIsValid(index, text.length())) { status = U_ILLEGAL_ARGUMENT_ERROR; return; } // int32_t originalStart = index.contextStart; if (insertion != 0) { text.handleReplaceBetween(index.limit, index.limit, *insertion); index.limit += insertion->length(); index.contextLimit += insertion->length(); } if (index.limit > 0 && UTF_IS_LEAD(text.charAt(index.limit - 1))) { // Oops, there is a dangling lead surrogate in the buffer. // This will break most transliterators, since they will // assume it is part of a pair. Don't transliterate until // more text comes in. return; } filteredTransliterate(text, index, TRUE, TRUE); #if 0 // TODO // I CAN'T DO what I'm attempting below now that the Kleene star // operator is supported. For example, in the rule // ([:Lu:]+) { x } > $1; // what is the maximum context length? getMaximumContextLength() // will return 1, but this is just the length of the ante context // part of the pattern string -- 1 character, which is a standin // for a Quantifier, which contains a StringMatcher, which // contains a UnicodeSet. // There is a complicated way to make this work again, and that's // to add a "maximum left context" protocol into the // UnicodeMatcher hierarchy. At present I'm not convinced this is // worth it. // --- // The purpose of the code below is to keep the context small // while doing incremental transliteration. When part of the left // context (between contextStart and start) is no longer needed, // we try to advance contextStart past that portion. We use the // maximum context length to do so. int32_t newCS = index.start; int32_t n = getMaximumContextLength(); while (newCS > originalStart && n-- > 0) { --newCS; newCS -= UTF_CHAR_LENGTH(text.char32At(newCS)) - 1; } index.contextStart = uprv_max(newCS, originalStart); #endif }
static void _Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, UErrorCode *pErrorCode) { UConverter *cnv; const UChar *source, *sourceLimit; uint8_t *target; int32_t targetCapacity; int32_t *offsets; int32_t prev, c, diff; int32_t sourceIndex, nextSourceIndex; U_ALIGN_CODE(16) /* set up the local pointers */ cnv=pArgs->converter; source=pArgs->source; sourceLimit=pArgs->sourceLimit; target=(uint8_t *)pArgs->target; targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); offsets=pArgs->offsets; /* get the converter state from UConverter */ c=cnv->fromUChar32; prev=(int32_t)cnv->fromUnicodeStatus; if(prev==0) { prev=BOCU1_ASCII_PREV; } /* sourceIndex=-1 if the current character began in the previous buffer */ sourceIndex= c==0 ? 0 : -1; nextSourceIndex=0; /* conversion loop */ if(c!=0 && targetCapacity>0) { goto getTrail; } fastSingle: /* fast loop for single-byte differences */ /* use only one loop counter variable, targetCapacity, not also source */ diff=(int32_t)(sourceLimit-source); if(targetCapacity>diff) { targetCapacity=diff; } while(targetCapacity>0 && (c=*source)<0x3000) { if(c<=0x20) { if(c!=0x20) { prev=BOCU1_ASCII_PREV; } *target++=(uint8_t)c; *offsets++=nextSourceIndex++; ++source; --targetCapacity; } else { diff=c-prev; if(DIFF_IS_SINGLE(diff)) { prev=BOCU1_SIMPLE_PREV(c); *target++=(uint8_t)PACK_SINGLE_DIFF(diff); *offsets++=nextSourceIndex++; ++source; --targetCapacity; } else { break; } } } /* restore real values */ targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target); sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */ /* regular loop for all cases */ while(source<sourceLimit) { if(targetCapacity>0) { c=*source++; ++nextSourceIndex; if(c<=0x20) { /* * ISO C0 control & space: * Encode directly for MIME compatibility, * and reset state except for space, to not disrupt compression. 
*/ if(c!=0x20) { prev=BOCU1_ASCII_PREV; } *target++=(uint8_t)c; *offsets++=sourceIndex; --targetCapacity; sourceIndex=nextSourceIndex; continue; } if(UTF_IS_LEAD(c)) { getTrail: if(source<sourceLimit) { /* test the following code unit */ UChar trail=*source; if(UTF_IS_SECOND_SURROGATE(trail)) { ++source; ++nextSourceIndex; c=UTF16_GET_PAIR_VALUE(c, trail); } } else { /* no more input */ c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */ break; } } /* * all other Unicode code points c==U+0021..U+10ffff * are encoded with the difference c-prev * * a new prev is computed from c, * placed in the middle of a 0x80-block (for most small scripts) or * in the middle of the Unihan and Hangul blocks * to statistically minimize the following difference */ diff=c-prev; prev=BOCU1_PREV(c); if(DIFF_IS_SINGLE(diff)) { *target++=(uint8_t)PACK_SINGLE_DIFF(diff); *offsets++=sourceIndex; --targetCapacity; sourceIndex=nextSourceIndex; if(c<0x3000) { goto fastSingle; } } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) { /* optimize 2-byte case */ int32_t m; if(diff>=0) { diff-=BOCU1_REACH_POS_1+1; m=diff%BOCU1_TRAIL_COUNT; diff/=BOCU1_TRAIL_COUNT; diff+=BOCU1_START_POS_2; } else { diff-=BOCU1_REACH_NEG_1; NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); diff+=BOCU1_START_NEG_2; } *target++=(uint8_t)diff; *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m); *offsets++=sourceIndex; *offsets++=sourceIndex; targetCapacity-=2; sourceIndex=nextSourceIndex; } else { int32_t length; /* will be 2..4 */ diff=packDiff(diff); length=BOCU1_LENGTH_FROM_PACKED(diff); /* write the output character bytes from diff and length */ /* from the first if in the loop we know that targetCapacity>0 */ if(length<=targetCapacity) { switch(length) { /* each branch falls through to the next one */ case 4: *target++=(uint8_t)(diff>>24); *offsets++=sourceIndex; case 3: *target++=(uint8_t)(diff>>16); *offsets++=sourceIndex; case 2: *target++=(uint8_t)(diff>>8); *offsets++=sourceIndex; /* case 1: 
handled above */ *target++=(uint8_t)diff; *offsets++=sourceIndex; default: /* will never occur */ break; } targetCapacity-=length; sourceIndex=nextSourceIndex; } else { uint8_t *charErrorBuffer; /* * We actually do this backwards here: * In order to save an intermediate variable, we output * first to the overflow buffer what does not fit into the * regular target. */ /* we know that 1<=targetCapacity<length<=4 */ length-=targetCapacity; charErrorBuffer=(uint8_t *)cnv->charErrorBuffer; switch(length) { /* each branch falls through to the next one */ case 3: *charErrorBuffer++=(uint8_t)(diff>>16); case 2: *charErrorBuffer++=(uint8_t)(diff>>8); case 1: *charErrorBuffer=(uint8_t)diff; default: /* will never occur */ break; } cnv->charErrorBufferLength=(int8_t)length; /* now output what fits into the regular target */ diff>>=8*length; /* length was reduced by targetCapacity */ switch(targetCapacity) { /* each branch falls through to the next one */ case 3: *target++=(uint8_t)(diff>>16); *offsets++=sourceIndex; case 2: *target++=(uint8_t)(diff>>8); *offsets++=sourceIndex; case 1: *target++=(uint8_t)diff; *offsets++=sourceIndex; default: /* will never occur */ break; } /* target overflow */ targetCapacity=0; *pErrorCode=U_BUFFER_OVERFLOW_ERROR; break; } } } else {
/*
 * Convert a UTF-16 string to UTF-32.
 *
 * dest          output buffer; may be NULL only if destCapacity==0
 *               (preflighting)
 * destCapacity  capacity of dest in UChar32 units, must be >=0
 * pDestLength   if not NULL, receives the number of UChar32 units that the
 *               complete output needs, even if it did not all fit into dest
 * src           input string, must not be NULL
 * srcLength     number of UChars in src, or -1 if src is NUL-terminated
 * pErrorCode    in/out ICU error code; nothing is done if it is NULL or
 *               already indicates a failure
 *
 * A lead surrogate directly followed by a trail surrogate is written as one
 * supplementary code point; any other code unit, including an unpaired
 * surrogate, is copied as is.
 *
 * Returns dest, or NULL if pErrorCode is NULL/failing on entry or if the
 * arguments are illegal (in which case *pErrorCode is set to
 * U_ILLEGAL_ARGUMENT_ERROR). Termination of the output and the related
 * status reporting are delegated to u_terminateUChar32s().
 */
U_CAPI UChar32* U_EXPORT2
u_strToUTF32(UChar32 *dest,
             int32_t destCapacity,
             int32_t *pDestLength,
             const UChar *src,
             int32_t srcLength,
             UErrorCode *pErrorCode) {
    const UChar* pSrc = src;
    const UChar* pSrcLimit;
    int32_t reqLength=0;
    uint32_t ch=0;
    uint32_t *pDest = (uint32_t *)dest;
    uint32_t *pDestLimit;
    UChar ch2=0;

    /* args check */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
        return NULL;
    }

    if((src==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    /*
     * Form the output limit only after the arguments have been validated,
     * and never do pointer arithmetic on a NULL dest (preflighting, where
     * destCapacity is known to be 0 at this point).
     */
    pDestLimit = (pDest!=NULL) ? pDest+destCapacity : pDest;

    if(srcLength==-1) {
        /* NUL-terminated input: convert as long as there is room in dest */
        while((ch=*pSrc)!=0 && pDest!=pDestLimit) {
            ++pSrc;
            /*need not check for NUL because NUL fails UTF_IS_TRAIL() anyway*/
            if(UTF_IS_LEAD(ch) && UTF_IS_TRAIL(ch2=*pSrc)) {
                ++pSrc;
                ch=UTF16_GET_PAIR_VALUE(ch, ch2);
            }
            *(pDest++)= ch;
        }
        /* preflight: count the code points that did not fit */
        while((ch=*pSrc++)!=0) {
            if(UTF_IS_LEAD(ch) && UTF_IS_TRAIL(*pSrc)) {
                ++pSrc;
            }
            ++reqLength;
        }
    } else {
        /* length-specified input: convert as long as there is room in dest */
        pSrcLimit = pSrc+srcLength;
        while(pSrc<pSrcLimit && pDest<pDestLimit) {
            ch=*pSrc++;
            if(UTF_IS_LEAD(ch) && pSrc<pSrcLimit && UTF_IS_TRAIL(ch2=*pSrc)) {
                ++pSrc;
                ch=UTF16_GET_PAIR_VALUE(ch, ch2);
            }
            *(pDest++)= ch;
        }
        /* preflight: count the code points that did not fit */
        while(pSrc!=pSrcLimit) {
            ch=*pSrc++;
            if(UTF_IS_LEAD(ch) && pSrc<pSrcLimit && UTF_IS_TRAIL(*pSrc)) {
                ++pSrc;
            }
            ++reqLength;
        }
    }

    /* add what was actually written; nothing was written if dest is NULL */
    if(pDest!=NULL) {
        reqLength+=(int32_t)(pDest - (uint32_t *)dest);
    }
    if(pDestLength){
        *pDestLength = reqLength;
    }

    /* Terminate the buffer */
    u_terminateUChar32s(dest,destCapacity,reqLength,pErrorCode);

    return dest;
}