/**
 * Encode one code point with BOCU-1, updating the caller-held state.
 *
 * The state is the "previous code point" value that differences are taken
 * against. A stored value of 0 is accepted leniently and replaced with
 * BOCU1_ASCII_PREV before anything else happens.
 *
 * @param pPrev pointer to the integer holding the "previous code point"
 *              state; it should initially contain 0, which this function
 *              replaces with the real BOCU-1 initial state value
 * @param c     the code point to encode, 0..0x10ffff
 * @return the packed 1/2/3/4-byte encoding, see packDiff(),
 *         or 0 if pPrev is NULL or c is outside 0..0x10ffff
 *
 * @see packDiff
 */
U_CFUNC int32_t
encodeBocu1(int32_t *pPrev, int32_t c) {
    int32_t base;

    /* illegal arguments: no state to work with, or not a code point */
    if(pPrev==NULL) {
        return 0;
    }
    if(c<0 || 0x10ffff<c) {
        return 0;
    }

    /* fetch the state; a stored 0 means "not yet initialized" */
    base=*pPrev;
    if(base==0) {
        base=BOCU1_ASCII_PREV;
        *pPrev=base;
    }

    if(c>0x20) {
        /*
         * U+0021..U+10ffff: encode the difference from the old state.
         *
         * The new state is derived from c by bocu1Prev(): the middle of a
         * 0x80-block for most small scripts, or the middle of the Unihan
         * and Hangul blocks, to statistically minimize the next difference.
         * It is stored before the difference against the old state is packed.
         */
        *pPrev=bocu1Prev(c);
        return packDiff(c-base);
    }

    /*
     * ISO C0 controls and space are encoded directly for MIME
     * compatibility. Controls reset the state; space leaves it
     * untouched so that it does not disrupt compression.
     */
    if(c<0x20) {
        *pPrev=BOCU1_ASCII_PREV;
    }
    return 0x01000000|c;
}
/**
 * Encode one difference value -0x10ffff..+0x10ffff in 1..4 bytes,
 * preserving lexical order.
 * Also checks for roundtripping of the difference encoding:
 * the packed value is fed back through unpackDiff() and a mismatch
 * is reported with log_err().
 * Test function.
 *
 * @param diff difference value to test, -0x10ffff..0x10ffff
 * @param p pointer to output byte array
 * @return p advanced by number of bytes output
 *
 * @see unpackDiff
 */
static uint8_t *
writeDiff(int32_t diff, uint8_t *p) {
    /* generate the difference as a packed value and serialize it */
    int32_t packed, initialPrev, roundtrip;

    packed=packDiff(diff);

    /*
     * bogus initial "prev" to work around
     * code point range check in decodeBocu1()
     */
    if(diff<=0) {
        initialPrev=0x10ffff;
    } else {
        initialPrev=-1;
    }

    /* unpack once and reuse the value for both the check and the message */
    roundtrip=unpackDiff(initialPrev, packed);
    if(diff!=roundtrip) {
        /*
         * The format uses %ld/%lx, so the int32_t values are converted
         * explicitly to long/unsigned long; passing them unconverted does
         * not match the specifiers where long is wider than 32 bits.
         * packed goes through uint32_t first so that a value with the top
         * bit set still prints as 8 hex digits.
         */
        log_err("error: unpackDiff(packDiff(diff=%ld)=0x%08lx)=%ld!=diff\n",
                (long)diff, (unsigned long)(uint32_t)packed, (long)roundtrip);
    }
    return p+writePacked(packed, p);
}
static void U_CALLCONV _Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, UErrorCode *pErrorCode) { UConverter *cnv; const UChar *source, *sourceLimit; uint8_t *target; int32_t targetCapacity; int32_t *offsets; int32_t prev, c, diff; int32_t sourceIndex, nextSourceIndex; /* set up the local pointers */ cnv=pArgs->converter; source=pArgs->source; sourceLimit=pArgs->sourceLimit; target=(uint8_t *)pArgs->target; targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); offsets=pArgs->offsets; /* get the converter state from UConverter */ c=cnv->fromUChar32; prev=(int32_t)cnv->fromUnicodeStatus; if(prev==0) { prev=BOCU1_ASCII_PREV; } /* sourceIndex=-1 if the current character began in the previous buffer */ sourceIndex= c==0 ? 0 : -1; nextSourceIndex=0; /* conversion loop */ if(c!=0 && targetCapacity>0) { goto getTrail; } fastSingle: /* fast loop for single-byte differences */ /* use only one loop counter variable, targetCapacity, not also source */ diff=(int32_t)(sourceLimit-source); if(targetCapacity>diff) { targetCapacity=diff; } while(targetCapacity>0 && (c=*source)<0x3000) { if(c<=0x20) { if(c!=0x20) { prev=BOCU1_ASCII_PREV; } *target++=(uint8_t)c; *offsets++=nextSourceIndex++; ++source; --targetCapacity; } else { diff=c-prev; if(DIFF_IS_SINGLE(diff)) { prev=BOCU1_SIMPLE_PREV(c); *target++=(uint8_t)PACK_SINGLE_DIFF(diff); *offsets++=nextSourceIndex++; ++source; --targetCapacity; } else { break; } } } /* restore real values */ targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target); sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */ /* regular loop for all cases */ while(source<sourceLimit) { if(targetCapacity>0) { c=*source++; ++nextSourceIndex; if(c<=0x20) { /* * ISO C0 control & space: * Encode directly for MIME compatibility, * and reset state except for space, to not disrupt compression. 
*/ if(c!=0x20) { prev=BOCU1_ASCII_PREV; } *target++=(uint8_t)c; *offsets++=sourceIndex; --targetCapacity; sourceIndex=nextSourceIndex; continue; } if(U16_IS_LEAD(c)) { getTrail: if(source<sourceLimit) { /* test the following code unit */ UChar trail=*source; if(U16_IS_TRAIL(trail)) { ++source; ++nextSourceIndex; c=U16_GET_SUPPLEMENTARY(c, trail); } } else { /* no more input */ c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */ break; } } /* * all other Unicode code points c==U+0021..U+10ffff * are encoded with the difference c-prev * * a new prev is computed from c, * placed in the middle of a 0x80-block (for most small scripts) or * in the middle of the Unihan and Hangul blocks * to statistically minimize the following difference */ diff=c-prev; prev=BOCU1_PREV(c); if(DIFF_IS_SINGLE(diff)) { *target++=(uint8_t)PACK_SINGLE_DIFF(diff); *offsets++=sourceIndex; --targetCapacity; sourceIndex=nextSourceIndex; if(c<0x3000) { goto fastSingle; } } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) { /* optimize 2-byte case */ int32_t m; if(diff>=0) { diff-=BOCU1_REACH_POS_1+1; m=diff%BOCU1_TRAIL_COUNT; diff/=BOCU1_TRAIL_COUNT; diff+=BOCU1_START_POS_2; } else { diff-=BOCU1_REACH_NEG_1; NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); diff+=BOCU1_START_NEG_2; } *target++=(uint8_t)diff; *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m); *offsets++=sourceIndex; *offsets++=sourceIndex; targetCapacity-=2; sourceIndex=nextSourceIndex; } else { int32_t length; /* will be 2..4 */ diff=packDiff(diff); length=BOCU1_LENGTH_FROM_PACKED(diff); /* write the output character bytes from diff and length */ /* from the first if in the loop we know that targetCapacity>0 */ if(length<=targetCapacity) { switch(length) { /* each branch falls through to the next one */ case 4: *target++=(uint8_t)(diff>>24); *offsets++=sourceIndex; U_FALLTHROUGH; case 3: *target++=(uint8_t)(diff>>16); *offsets++=sourceIndex; U_FALLTHROUGH; case 2: *target++=(uint8_t)(diff>>8); 
*offsets++=sourceIndex; /* case 1: handled above */ *target++=(uint8_t)diff; *offsets++=sourceIndex; U_FALLTHROUGH; default: /* will never occur */ break; } targetCapacity-=length; sourceIndex=nextSourceIndex; } else { uint8_t *charErrorBuffer; /* * We actually do this backwards here: * In order to save an intermediate variable, we output * first to the overflow buffer what does not fit into the * regular target. */ /* we know that 1<=targetCapacity<length<=4 */ length-=targetCapacity; charErrorBuffer=(uint8_t *)cnv->charErrorBuffer; switch(length) { /* each branch falls through to the next one */ case 3: *charErrorBuffer++=(uint8_t)(diff>>16); U_FALLTHROUGH; case 2: *charErrorBuffer++=(uint8_t)(diff>>8); U_FALLTHROUGH; case 1: *charErrorBuffer=(uint8_t)diff; U_FALLTHROUGH; default: /* will never occur */ break; } cnv->charErrorBufferLength=(int8_t)length; /* now output what fits into the regular target */ diff>>=8*length; /* length was reduced by targetCapacity */ switch(targetCapacity) { /* each branch falls through to the next one */ case 3: *target++=(uint8_t)(diff>>16); *offsets++=sourceIndex; U_FALLTHROUGH; case 2: *target++=(uint8_t)(diff>>8); *offsets++=sourceIndex; U_FALLTHROUGH; case 1: *target++=(uint8_t)diff; *offsets++=sourceIndex; U_FALLTHROUGH; default: /* will never occur */ break; } /* target overflow */ targetCapacity=0; *pErrorCode=U_BUFFER_OVERFLOW_ERROR; break; } } } else {