/** * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes * and return a packed integer with them. * * The encoding favors small absolute differences with short encodings * to compress runs of same-script characters. * * Optimized version with unrolled loops and fewer floating-point operations * than the standard packDiff(). * * @param diff difference value -0x10ffff..0x10ffff * @return * 0x010000zz for 1-byte sequence zz * 0x0200yyzz for 2-byte sequence yy zz * 0x03xxyyzz for 3-byte sequence xx yy zz * 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03) */ static int32_t packDiff(int32_t diff) { int32_t result, m; U_ASSERT(!DIFF_IS_SINGLE(diff)); /* assume we won't be called where diff==BOCU1_REACH_NEG_1=-64 */ if(diff>=BOCU1_REACH_NEG_1) { /* mostly positive differences, and single-byte negative ones */ #if 0 /* single-byte case handled in macros, see below */ if(diff<=BOCU1_REACH_POS_1) { /* single byte */ return 0x01000000|(BOCU1_MIDDLE+diff); } else #endif if(diff<=BOCU1_REACH_POS_2) { /* two bytes */ diff-=BOCU1_REACH_POS_1+1; result=0x02000000; m=diff%BOCU1_TRAIL_COUNT; diff/=BOCU1_TRAIL_COUNT; result|=BOCU1_TRAIL_TO_BYTE(m); result|=(BOCU1_START_POS_2+diff)<<8; } else if(diff<=BOCU1_REACH_POS_3) { /* three bytes */ diff-=BOCU1_REACH_POS_2+1; result=0x03000000; m=diff%BOCU1_TRAIL_COUNT; diff/=BOCU1_TRAIL_COUNT; result|=BOCU1_TRAIL_TO_BYTE(m); m=diff%BOCU1_TRAIL_COUNT; diff/=BOCU1_TRAIL_COUNT; result|=BOCU1_TRAIL_TO_BYTE(m)<<8; result|=(BOCU1_START_POS_3+diff)<<16; } else { /* four bytes */ diff-=BOCU1_REACH_POS_3+1; m=diff%BOCU1_TRAIL_COUNT; diff/=BOCU1_TRAIL_COUNT; result=BOCU1_TRAIL_TO_BYTE(m); m=diff%BOCU1_TRAIL_COUNT; diff/=BOCU1_TRAIL_COUNT; result|=BOCU1_TRAIL_TO_BYTE(m)<<8; /* * We know that / and % would deliver quotient 0 and rest=diff. * Avoid division and modulo for performance. 
*/ result|=BOCU1_TRAIL_TO_BYTE(diff)<<16; result|=((uint32_t)BOCU1_START_POS_4)<<24; } } else { /* two- to four-byte negative differences */ if(diff>=BOCU1_REACH_NEG_2) { /* two bytes */ diff-=BOCU1_REACH_NEG_1; result=0x02000000; NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); result|=BOCU1_TRAIL_TO_BYTE(m); result|=(BOCU1_START_NEG_2+diff)<<8; } else if(diff>=BOCU1_REACH_NEG_3) { /* three bytes */ diff-=BOCU1_REACH_NEG_2; result=0x03000000; NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); result|=BOCU1_TRAIL_TO_BYTE(m); NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); result|=BOCU1_TRAIL_TO_BYTE(m)<<8; result|=(BOCU1_START_NEG_3+diff)<<16; } else { /* four bytes */ diff-=BOCU1_REACH_NEG_3; NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); result=BOCU1_TRAIL_TO_BYTE(m); NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); result|=BOCU1_TRAIL_TO_BYTE(m)<<8; /* * We know that NEGDIVMOD would deliver * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT. * Avoid division and modulo for performance. */ m=diff+BOCU1_TRAIL_COUNT; result|=BOCU1_TRAIL_TO_BYTE(m)<<16; result|=BOCU1_MIN<<24; } } return result; }
static void U_CALLCONV _Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, UErrorCode *pErrorCode) { UConverter *cnv; const UChar *source, *sourceLimit; uint8_t *target; int32_t targetCapacity; int32_t *offsets; int32_t prev, c, diff; int32_t sourceIndex, nextSourceIndex; /* set up the local pointers */ cnv=pArgs->converter; source=pArgs->source; sourceLimit=pArgs->sourceLimit; target=(uint8_t *)pArgs->target; targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); offsets=pArgs->offsets; /* get the converter state from UConverter */ c=cnv->fromUChar32; prev=(int32_t)cnv->fromUnicodeStatus; if(prev==0) { prev=BOCU1_ASCII_PREV; } /* sourceIndex=-1 if the current character began in the previous buffer */ sourceIndex= c==0 ? 0 : -1; nextSourceIndex=0; /* conversion loop */ if(c!=0 && targetCapacity>0) { goto getTrail; } fastSingle: /* fast loop for single-byte differences */ /* use only one loop counter variable, targetCapacity, not also source */ diff=(int32_t)(sourceLimit-source); if(targetCapacity>diff) { targetCapacity=diff; } while(targetCapacity>0 && (c=*source)<0x3000) { if(c<=0x20) { if(c!=0x20) { prev=BOCU1_ASCII_PREV; } *target++=(uint8_t)c; *offsets++=nextSourceIndex++; ++source; --targetCapacity; } else { diff=c-prev; if(DIFF_IS_SINGLE(diff)) { prev=BOCU1_SIMPLE_PREV(c); *target++=(uint8_t)PACK_SINGLE_DIFF(diff); *offsets++=nextSourceIndex++; ++source; --targetCapacity; } else { break; } } } /* restore real values */ targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target); sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */ /* regular loop for all cases */ while(source<sourceLimit) { if(targetCapacity>0) { c=*source++; ++nextSourceIndex; if(c<=0x20) { /* * ISO C0 control & space: * Encode directly for MIME compatibility, * and reset state except for space, to not disrupt compression. 
*/ if(c!=0x20) { prev=BOCU1_ASCII_PREV; } *target++=(uint8_t)c; *offsets++=sourceIndex; --targetCapacity; sourceIndex=nextSourceIndex; continue; } if(U16_IS_LEAD(c)) { getTrail: if(source<sourceLimit) { /* test the following code unit */ UChar trail=*source; if(U16_IS_TRAIL(trail)) { ++source; ++nextSourceIndex; c=U16_GET_SUPPLEMENTARY(c, trail); } } else { /* no more input */ c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */ break; } } /* * all other Unicode code points c==U+0021..U+10ffff * are encoded with the difference c-prev * * a new prev is computed from c, * placed in the middle of a 0x80-block (for most small scripts) or * in the middle of the Unihan and Hangul blocks * to statistically minimize the following difference */ diff=c-prev; prev=BOCU1_PREV(c); if(DIFF_IS_SINGLE(diff)) { *target++=(uint8_t)PACK_SINGLE_DIFF(diff); *offsets++=sourceIndex; --targetCapacity; sourceIndex=nextSourceIndex; if(c<0x3000) { goto fastSingle; } } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) { /* optimize 2-byte case */ int32_t m; if(diff>=0) { diff-=BOCU1_REACH_POS_1+1; m=diff%BOCU1_TRAIL_COUNT; diff/=BOCU1_TRAIL_COUNT; diff+=BOCU1_START_POS_2; } else { diff-=BOCU1_REACH_NEG_1; NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); diff+=BOCU1_START_NEG_2; } *target++=(uint8_t)diff; *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m); *offsets++=sourceIndex; *offsets++=sourceIndex; targetCapacity-=2; sourceIndex=nextSourceIndex; } else { int32_t length; /* will be 2..4 */ diff=packDiff(diff); length=BOCU1_LENGTH_FROM_PACKED(diff); /* write the output character bytes from diff and length */ /* from the first if in the loop we know that targetCapacity>0 */ if(length<=targetCapacity) { switch(length) { /* each branch falls through to the next one */ case 4: *target++=(uint8_t)(diff>>24); *offsets++=sourceIndex; U_FALLTHROUGH; case 3: *target++=(uint8_t)(diff>>16); *offsets++=sourceIndex; U_FALLTHROUGH; case 2: *target++=(uint8_t)(diff>>8); 
*offsets++=sourceIndex; /* case 1: handled above */ *target++=(uint8_t)diff; *offsets++=sourceIndex; U_FALLTHROUGH; default: /* will never occur */ break; } targetCapacity-=length; sourceIndex=nextSourceIndex; } else { uint8_t *charErrorBuffer; /* * We actually do this backwards here: * In order to save an intermediate variable, we output * first to the overflow buffer what does not fit into the * regular target. */ /* we know that 1<=targetCapacity<length<=4 */ length-=targetCapacity; charErrorBuffer=(uint8_t *)cnv->charErrorBuffer; switch(length) { /* each branch falls through to the next one */ case 3: *charErrorBuffer++=(uint8_t)(diff>>16); U_FALLTHROUGH; case 2: *charErrorBuffer++=(uint8_t)(diff>>8); U_FALLTHROUGH; case 1: *charErrorBuffer=(uint8_t)diff; U_FALLTHROUGH; default: /* will never occur */ break; } cnv->charErrorBufferLength=(int8_t)length; /* now output what fits into the regular target */ diff>>=8*length; /* length was reduced by targetCapacity */ switch(targetCapacity) { /* each branch falls through to the next one */ case 3: *target++=(uint8_t)(diff>>16); *offsets++=sourceIndex; U_FALLTHROUGH; case 2: *target++=(uint8_t)(diff>>8); *offsets++=sourceIndex; U_FALLTHROUGH; case 1: *target++=(uint8_t)diff; *offsets++=sourceIndex; U_FALLTHROUGH; default: /* will never occur */ break; } /* target overflow */ targetCapacity=0; *pErrorCode=U_BUFFER_OVERFLOW_ERROR; break; } } } else {
/**
 * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
 * and return a packed integer with them.
 *
 * The encoding favors small absolute differences with short encodings
 * to compress runs of same-script characters.
 *
 * The trail bytes are produced like digits in itoa(), least significant
 * first, and the lead byte is placed above them.
 *
 * @param diff difference value -0x10ffff..0x10ffff
 * @return
 *      0x010000zz for 1-byte sequence zz
 *      0x0200yyzz for 2-byte sequence yy zz
 *      0x03xxyyzz for 3-byte sequence xx yy zz
 *      0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
 */
U_CFUNC int32_t
packDiff(int32_t diff) {
    int32_t result, m, lead, count, shift;

    /*
     * Select the lead-byte base and the number of trail bytes (count),
     * and rebase diff so that it is relative to the start of its range.
     */
    if(diff>=BOCU1_REACH_NEG_1) {
        /* mostly positive differences, and single-byte negative ones */
        if(diff<=BOCU1_REACH_POS_1) {
            /* single byte */
            return 0x01000000|(BOCU1_MIDDLE+diff);
        } else if(diff<=BOCU1_REACH_POS_2) {
            /* two bytes */
            diff-=BOCU1_REACH_POS_1+1;
            lead=BOCU1_START_POS_2;
            count=1;
        } else if(diff<=BOCU1_REACH_POS_3) {
            /* three bytes */
            diff-=BOCU1_REACH_POS_2+1;
            lead=BOCU1_START_POS_3;
            count=2;
        } else {
            /* four bytes */
            diff-=BOCU1_REACH_POS_3+1;
            lead=BOCU1_START_POS_4;
            count=3;
        }
    } else {
        /* two- to four-byte negative differences */
        if(diff>=BOCU1_REACH_NEG_2) {
            /* two bytes */
            diff-=BOCU1_REACH_NEG_1;
            lead=BOCU1_START_NEG_2;
            count=1;
        } else if(diff>=BOCU1_REACH_NEG_3) {
            /* three bytes */
            diff-=BOCU1_REACH_NEG_2;
            lead=BOCU1_START_NEG_3;
            count=2;
        } else {
            /* four bytes */
            diff-=BOCU1_REACH_NEG_3;
            lead=BOCU1_START_NEG_4;
            count=3;
        }
    }

    /* encode the length of the packed result */
    if(count<3) {
        result=(count+1)<<24;
    } else /* count==3, MSB used for the lead byte */ {
        result=0;
    }

    /*
     * calculate trail bytes like digits in itoa()
     * NOTE(review): NEGDIVMOD is defined elsewhere; it is expected to leave
     * the quotient in diff and a non-negative remainder in m.
     */
    shift=0;
    do {
        NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
        result|=BOCU1_TRAIL_TO_BYTE(m)<<shift;
        shift+=8;
    } while(--count>0);

    /*
     * add lead byte
     *
     * For four-byte sequences shift is 24, so the lead byte goes into the
     * most significant byte. Shifting a signed int32_t so that a bit reaches
     * the sign position is undefined behavior in C, which happens for any
     * lead byte >=0x80. Shift as unsigned and convert back instead.
     */
    result|=(int32_t)((uint32_t)(lead+diff)<<shift);

    return result;
}