/**
 * Read the next code point (UChar32) from the buffered stream.
 *
 * A lead surrogate immediately followed by a trail surrogate is returned
 * as one supplementary code point and both units are consumed.  Any other
 * code unit, including an unpaired surrogate, is returned unchanged and
 * only that one unit is consumed.
 *
 * @param buf   Buffered reader.  Its currentPos, bufLimit and remaining
 *              fields are read, and currentPos is advanced past the units
 *              that were consumed.
 * @param error In/out status.  If it is NULL or already indicates a
 *              failure, nothing is read.
 * @return The code point, or U_EOF when the status is unusable, when a
 *         refill fails, or when no more code units are available.
 */
U_CAPI int32_t U_EXPORT2
ucbuf_getc32(UCHARBUF* buf,UErrorCode* error){
    int32_t lead;
    int32_t trail;

    /* U_EOF, not FALSE: 0 would be indistinguishable from reading U+0000 */
    if(error==NULL || U_FAILURE(*error)){
        return (int32_t)U_EOF;
    }

    /*
     * Fewer than two code units are buffered, so a surrogate pair could be
     * split.  Refill only while the stream still has data; otherwise fall
     * through and hand out whatever single unit is left.
     */
    if(buf->currentPos+1>=buf->bufLimit && buf->remaining!=0){
        buf=ucbuf_fillucbuf(buf,error);
        if(U_FAILURE(*error)){
            return (int32_t)U_EOF;
        }
    }

    /* nothing buffered at all: genuine end of input */
    if(buf->currentPos>=buf->bufLimit){
        return (int32_t)U_EOF;
    }

    lead = *(buf->currentPos++);
    if(UTF_IS_LEAD(lead) && buf->currentPos<buf->bufLimit){
        trail = *(buf->currentPos);
        /* 0xDC00..0xDFFF is the trail-surrogate range */
        if((trail & 0xFC00) == 0xDC00){
            ++(buf->currentPos);
            return UTF16_GET_PAIR_VALUE(lead, trail);
        }
    }
    return lead;
}
/*
 * Parse a single escape sequence.  Although this function deals in
 * UChars, it does not use C++ or UnicodeString.  This allows it to
 * be used from C contexts.
 *
 * On entry *offset indexes the first code unit AFTER the backslash.
 * Forms are tried in this order:
 *   u + exactly 4 hex digits
 *   U + exactly 8 hex digits
 *   x + 1-2 hex digits, or x{ + 1-8 hex digits + }
 *   1-3 octal digits
 *   a key found in UNESCAPE_MAP (key/value pairs, searched in ascending
 *     key order; the search stops at the first larger key)
 *   c + X, giving control-X (X & 0x1F)
 *   otherwise the escaped character itself (surrogate pairs are joined)
 *
 * If a numeric escape yields a lead surrogate and a trail surrogate
 * follows, either literally or as another escape, the two are joined
 * into one supplementary code point.
 *
 * @param charAt  callback returning the code unit at a given index
 * @param offset  in/out index; advanced past the escape on success,
 *                restored to its entry value on failure
 * @param length  number of code units available through charAt
 * @param context opaque pointer passed through to charAt
 * @return the code point, or (UChar32)0xFFFFFFFF for an invalid escape
 */
U_CAPI UChar32 U_EXPORT2
u_unescapeAt(UNESCAPE_CHAR_AT charAt,
             int32_t* offset,
             int32_t length,
             void* context) {

    int32_t start = *offset;
    UChar c;
    /* unsigned: shifting in eight hex digits must not overflow a signed int */
    uint32_t result = 0;
    int8_t n = 0;
    int8_t minDig = 0;
    int8_t maxDig = 0;
    int8_t bitsPerDigit = 4;
    int8_t dig;
    int32_t i;
    UBool braces = FALSE;

    /* Check that offset is in range */
    if (*offset < 0 || *offset >= length) {
        goto err;
    }

    /* Fetch first UChar after the backslash */
    c = charAt((*offset)++, context);

    /* Convert hexadecimal and octal escapes */
    switch (c) {
    case 0x0075 /*'u'*/:
        minDig = maxDig = 4;
        break;
    case 0x0055 /*'U'*/:
        minDig = maxDig = 8;
        break;
    case 0x0078 /*'x'*/:
        minDig = 1;
        if (*offset < length && charAt(*offset, context) == 0x7B /*{*/) {
            ++(*offset);
            braces = TRUE;
            maxDig = 8;
        } else {
            maxDig = 2;
        }
        break;
    default:
        dig = _digit8(c);
        if (dig >= 0) {
            minDig = 1;
            maxDig = 3;
            n = 1; /* Already have first octal digit */
            bitsPerDigit = 3;
            result = (uint32_t)dig;
        }
        break;
    }

    if (minDig != 0) {
        while (*offset < length && n < maxDig) {
            c = charAt(*offset, context);
            dig = (int8_t) ((bitsPerDigit == 3) ? _digit8(c) : _digit16(c));
            if (dig < 0) {
                break;
            }
            result = (result << bitsPerDigit) | (uint32_t)dig;
            ++(*offset);
            ++n;
        }
        if (n < minDig) {
            goto err;
        }
        if (braces) {
            /*
             * Fetch the terminator afresh.  When the loop above stops
             * because maxDig digits were read, c still holds the last
             * digit, not the code unit at *offset.
             */
            if (*offset >= length || charAt(*offset, context) != 0x7D /*}*/) {
                goto err;
            }
            ++(*offset);
        }
        /* also rejects values that used to show up as negative */
        if (result >= 0x110000) {
            goto err;
        }
        /* If an escape sequence specifies a lead surrogate, see if
         * there is a trail surrogate after it, either as an escape or
         * as a literal.  If so, join them up into a supplementary.
         */
        if (*offset < length && U16_IS_LEAD(result)) {
            int32_t ahead = *offset + 1;
            /* full-width so a nested supplementary is not truncated into a trail */
            UChar32 trail = charAt(*offset, context);
            if (trail == 0x5C /*'\\'*/ && ahead < length) {
                /*
                 * The longest escape that can yield a trail surrogate is
                 * "x{0000DFFF}" (11 code units).  Capping the window
                 * bounds the recursion depth on long runs of escaped
                 * lead surrogates.
                 */
                int32_t tailLimit = (length - ahead > 11) ? ahead + 11 : length;
                trail = u_unescapeAt(charAt, &ahead, tailLimit, context);
            }
            if (U16_IS_TRAIL(trail)) {
                *offset = ahead;
                result = (uint32_t)U16_GET_SUPPLEMENTARY(result, trail);
            }
        }
        return (UChar32)result;
    }

    /* Convert C-style escapes in table */
    for (i = 0; i < UNESCAPE_MAP_LENGTH; i += 2) {
        if (c == UNESCAPE_MAP[i]) {
            return UNESCAPE_MAP[i + 1];
        } else if (c < UNESCAPE_MAP[i]) {
            break;
        }
    }

    /* Map cX to control-X: X & 0x1F */
    if (c == 0x0063 /*'c'*/ && *offset < length) {
        c = charAt((*offset)++, context);
        if (UTF_IS_FIRST_SURROGATE(c) && *offset < length) {
            UChar c2 = charAt(*offset, context);
            if (UTF_IS_SECOND_SURROGATE(c2)) {
                ++(*offset);
                c = (UChar) UTF16_GET_PAIR_VALUE(c, c2); /* [sic] */
            }
        }
        return 0x1F & c;
    }

    /* If no special forms are recognized, then consider
     * the backslash to generically escape the next character.
     * Deal with surrogate pairs. */
    if (UTF_IS_FIRST_SURROGATE(c) && *offset < length) {
        UChar c2 = charAt(*offset, context);
        if (UTF_IS_SECOND_SURROGATE(c2)) {
            ++(*offset);
            return UTF16_GET_PAIR_VALUE(c, c2);
        }
    }
    return c;

 err:
    /* Invalid escape sequence */
    *offset = start; /* Reset to initial value */
    return (UChar32) 0xFFFFFFFF;
}
/**
 * Convert a UTF-16 string to UTF-32.
 *
 * Each well-formed surrogate pair becomes one code point; every other
 * code unit, including an unpaired surrogate, is copied through as is.
 * Once dest is full the rest of the source is only counted, so the
 * reported length is always the length needed for the whole input.
 *
 * @param dest         output buffer; may be NULL only if destCapacity is 0
 * @param destCapacity number of UChar32 slots in dest; must be >= 0
 * @param pDestLength  if non-NULL, receives the number of code points
 *                     required for the complete conversion
 * @param src          UTF-16 input; must not be NULL
 * @param srcLength    number of UChars in src, or -1 if NUL-terminated
 * @param pErrorCode   in/out status; set to U_ILLEGAL_ARGUMENT_ERROR for
 *                     bad arguments.  Termination and overflow reporting
 *                     are delegated to u_terminateUChar32s (defined
 *                     elsewhere).
 * @return dest, or NULL if the status was unusable or the arguments bad
 */
U_CAPI UChar32* U_EXPORT2
u_strToUTF32(UChar32 *dest,
             int32_t destCapacity,
             int32_t *pDestLength,
             const UChar *src,
             int32_t srcLength,
             UErrorCode *pErrorCode) {
    const UChar *pSrc = src;
    const UChar *pSrcLimit;
    uint32_t *pDest = (uint32_t *)dest;
    uint32_t *pDestLimit;
    int32_t reqLength = 0;
    uint32_t ch = 0;
    UChar ch2 = 0;

    /* args check */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
        return NULL;
    }

    if((src==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    /*
     * Form the end-of-buffer pointer only after validation: arithmetic on
     * a NULL pointer or with a negative capacity is undefined.
     */
    pDestLimit = (dest != NULL) ? pDest + destCapacity : pDest;

    if(srcLength==-1) {
        /* NUL-terminated input: convert while there is room */
        while((ch=*pSrc)!=0 && pDest!=pDestLimit) {
            ++pSrc;
            /* need not check for NUL because NUL fails UTF_IS_TRAIL() anyway */
            if(UTF_IS_LEAD(ch) && UTF_IS_TRAIL(ch2=*pSrc)) {
                ++pSrc;
                ch=UTF16_GET_PAIR_VALUE(ch, ch2);
            }
            *(pDest++)= ch;
        }
        /* preflight: count the code points that did not fit */
        while((ch=*pSrc++)!=0) {
            if(UTF_IS_LEAD(ch) && UTF_IS_TRAIL(*pSrc)) {
                ++pSrc;
            }
            ++reqLength;
        }
    } else {
        pSrcLimit = pSrc+srcLength;
        /* counted input: convert while there is room */
        while(pSrc<pSrcLimit && pDest<pDestLimit) {
            ch=*pSrc++;
            if(UTF_IS_LEAD(ch) && pSrc<pSrcLimit && UTF_IS_TRAIL(ch2=*pSrc)) {
                ++pSrc;
                ch=UTF16_GET_PAIR_VALUE(ch, ch2);
            }
            *(pDest++)= ch;
        }
        /* preflight: count the code points that did not fit */
        while(pSrc!=pSrcLimit) {
            ch=*pSrc++;
            if(UTF_IS_LEAD(ch) && pSrc<pSrcLimit && UTF_IS_TRAIL(*pSrc)) {
                ++pSrc;
            }
            ++reqLength;
        }
    }

    /* total = code points actually written + code points only counted */
    reqLength+=(int32_t)(pDest - (uint32_t *)dest);
    if(pDestLength){
        *pDestLength = reqLength;
    }

    /* Terminate the buffer */
    u_terminateUChar32s(dest,destCapacity,reqLength,pErrorCode);

    return dest;
}
static void _Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, UErrorCode *pErrorCode) { UConverter *cnv; const UChar *source, *sourceLimit; uint8_t *target; int32_t targetCapacity; int32_t *offsets; int32_t prev, c, diff; int32_t sourceIndex, nextSourceIndex; U_ALIGN_CODE(16) /* set up the local pointers */ cnv=pArgs->converter; source=pArgs->source; sourceLimit=pArgs->sourceLimit; target=(uint8_t *)pArgs->target; targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); offsets=pArgs->offsets; /* get the converter state from UConverter */ c=cnv->fromUChar32; prev=(int32_t)cnv->fromUnicodeStatus; if(prev==0) { prev=BOCU1_ASCII_PREV; } /* sourceIndex=-1 if the current character began in the previous buffer */ sourceIndex= c==0 ? 0 : -1; nextSourceIndex=0; /* conversion loop */ if(c!=0 && targetCapacity>0) { goto getTrail; } fastSingle: /* fast loop for single-byte differences */ /* use only one loop counter variable, targetCapacity, not also source */ diff=(int32_t)(sourceLimit-source); if(targetCapacity>diff) { targetCapacity=diff; } while(targetCapacity>0 && (c=*source)<0x3000) { if(c<=0x20) { if(c!=0x20) { prev=BOCU1_ASCII_PREV; } *target++=(uint8_t)c; *offsets++=nextSourceIndex++; ++source; --targetCapacity; } else { diff=c-prev; if(DIFF_IS_SINGLE(diff)) { prev=BOCU1_SIMPLE_PREV(c); *target++=(uint8_t)PACK_SINGLE_DIFF(diff); *offsets++=nextSourceIndex++; ++source; --targetCapacity; } else { break; } } } /* restore real values */ targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target); sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */ /* regular loop for all cases */ while(source<sourceLimit) { if(targetCapacity>0) { c=*source++; ++nextSourceIndex; if(c<=0x20) { /* * ISO C0 control & space: * Encode directly for MIME compatibility, * and reset state except for space, to not disrupt compression. 
*/ if(c!=0x20) { prev=BOCU1_ASCII_PREV; } *target++=(uint8_t)c; *offsets++=sourceIndex; --targetCapacity; sourceIndex=nextSourceIndex; continue; } if(UTF_IS_LEAD(c)) { getTrail: if(source<sourceLimit) { /* test the following code unit */ UChar trail=*source; if(UTF_IS_SECOND_SURROGATE(trail)) { ++source; ++nextSourceIndex; c=UTF16_GET_PAIR_VALUE(c, trail); } } else { /* no more input */ c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */ break; } } /* * all other Unicode code points c==U+0021..U+10ffff * are encoded with the difference c-prev * * a new prev is computed from c, * placed in the middle of a 0x80-block (for most small scripts) or * in the middle of the Unihan and Hangul blocks * to statistically minimize the following difference */ diff=c-prev; prev=BOCU1_PREV(c); if(DIFF_IS_SINGLE(diff)) { *target++=(uint8_t)PACK_SINGLE_DIFF(diff); *offsets++=sourceIndex; --targetCapacity; sourceIndex=nextSourceIndex; if(c<0x3000) { goto fastSingle; } } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) { /* optimize 2-byte case */ int32_t m; if(diff>=0) { diff-=BOCU1_REACH_POS_1+1; m=diff%BOCU1_TRAIL_COUNT; diff/=BOCU1_TRAIL_COUNT; diff+=BOCU1_START_POS_2; } else { diff-=BOCU1_REACH_NEG_1; NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); diff+=BOCU1_START_NEG_2; } *target++=(uint8_t)diff; *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m); *offsets++=sourceIndex; *offsets++=sourceIndex; targetCapacity-=2; sourceIndex=nextSourceIndex; } else { int32_t length; /* will be 2..4 */ diff=packDiff(diff); length=BOCU1_LENGTH_FROM_PACKED(diff); /* write the output character bytes from diff and length */ /* from the first if in the loop we know that targetCapacity>0 */ if(length<=targetCapacity) { switch(length) { /* each branch falls through to the next one */ case 4: *target++=(uint8_t)(diff>>24); *offsets++=sourceIndex; case 3: *target++=(uint8_t)(diff>>16); *offsets++=sourceIndex; case 2: *target++=(uint8_t)(diff>>8); *offsets++=sourceIndex; /* case 1: 
handled above */ *target++=(uint8_t)diff; *offsets++=sourceIndex; default: /* will never occur */ break; } targetCapacity-=length; sourceIndex=nextSourceIndex; } else { uint8_t *charErrorBuffer; /* * We actually do this backwards here: * In order to save an intermediate variable, we output * first to the overflow buffer what does not fit into the * regular target. */ /* we know that 1<=targetCapacity<length<=4 */ length-=targetCapacity; charErrorBuffer=(uint8_t *)cnv->charErrorBuffer; switch(length) { /* each branch falls through to the next one */ case 3: *charErrorBuffer++=(uint8_t)(diff>>16); case 2: *charErrorBuffer++=(uint8_t)(diff>>8); case 1: *charErrorBuffer=(uint8_t)diff; default: /* will never occur */ break; } cnv->charErrorBufferLength=(int8_t)length; /* now output what fits into the regular target */ diff>>=8*length; /* length was reduced by targetCapacity */ switch(targetCapacity) { /* each branch falls through to the next one */ case 3: *target++=(uint8_t)(diff>>16); *offsets++=sourceIndex; case 2: *target++=(uint8_t)(diff>>8); *offsets++=sourceIndex; case 1: *target++=(uint8_t)diff; *offsets++=sourceIndex; default: /* will never occur */ break; } /* target overflow */ targetCapacity=0; *pErrorCode=U_BUFFER_OVERFLOW_ERROR; break; } } } else {
static void UConverter_fromUnicode_CompoundText_OFFSETS(UConverterFromUnicodeArgs* args, UErrorCode* err){ UConverter *cnv = args->converter; uint8_t *target = (uint8_t *) args->target; const uint8_t *targetLimit = (const uint8_t *) args->targetLimit; const UChar* source = args->source; const UChar* sourceLimit = args->sourceLimit; /* int32_t* offsets = args->offsets; */ UChar32 sourceChar; UBool useFallback = cnv->useFallback; uint8_t tmpTargetBuffer[7]; int32_t tmpTargetBufferLength = 0; COMPOUND_TEXT_CONVERTERS currentState, tmpState; uint32_t pValue; int32_t pValueLength = 0; int32_t i, n; UConverterDataCompoundText *myConverterData = (UConverterDataCompoundText *) cnv->extraInfo; currentState = myConverterData->state; /* check if the last codepoint of previous buffer was a lead surrogate*/ if((sourceChar = cnv->fromUChar32)!=0 && target< targetLimit) { goto getTrail; } while( source < sourceLimit){ if(target < targetLimit){ sourceChar = *(source++); /*check if the char is a First surrogate*/ if(UTF_IS_SURROGATE(sourceChar)) { if(UTF_IS_SURROGATE_FIRST(sourceChar)) { getTrail: /*look ahead to find the trail surrogate*/ if(source < sourceLimit) { /* test the following code unit */ UChar trail=(UChar) *source; if(UTF_IS_SECOND_SURROGATE(trail)) { source++; sourceChar=UTF16_GET_PAIR_VALUE(sourceChar, trail); cnv->fromUChar32=0x00; /* convert this supplementary code point */ /* exit this condition tree */ } else { /* this is an unmatched lead code unit (1st surrogate) */ /* callback(illegal) */ *err=U_ILLEGAL_CHAR_FOUND; cnv->fromUChar32=sourceChar; break; } } else { /* no more input */ cnv->fromUChar32=sourceChar; break; } } else { /* this is an unmatched trail code unit (2nd surrogate) */ /* callback(illegal) */ *err=U_ILLEGAL_CHAR_FOUND; cnv->fromUChar32=sourceChar; break; } } tmpTargetBufferLength = 0; tmpState = getState(sourceChar); if (tmpState != DO_SEARCH && currentState != tmpState) { /* Get escape sequence if necessary */ currentState = tmpState; for (i 
= 0; escSeqCompoundText[currentState][i] != 0; i++) { tmpTargetBuffer[tmpTargetBufferLength++] = escSeqCompoundText[currentState][i]; } } if (tmpState == DO_SEARCH) { /* Test all available converters */ for (i = 1; i < SEARCH_LENGTH; i++) { pValueLength = ucnv_MBCSFromUChar32(myConverterData->myConverterArray[i], sourceChar, &pValue, useFallback); if (pValueLength > 0) { tmpState = (COMPOUND_TEXT_CONVERTERS)i; if (currentState != tmpState) { currentState = tmpState; for (i = 0; escSeqCompoundText[currentState][i] != 0; i++) { tmpTargetBuffer[tmpTargetBufferLength++] = escSeqCompoundText[currentState][i]; } } for (n = (pValueLength - 1); n >= 0; n--) { tmpTargetBuffer[tmpTargetBufferLength++] = (uint8_t)(pValue >> (n * 8)); } break; } } } else if (tmpState == COMPOUND_TEXT_SINGLE_0) {