/**
 * Parse a list of hexadecimal code points separated by spaces/tabs and
 * store them as a UTF-16 string in dest[destCapacity].
 * Parsing stops at the first ';' or at the terminating NUL of s.
 *
 * @param s            Input text; must not be NULL.
 * @param dest         Output buffer; may be NULL only if destCapacity==0.
 * @param destCapacity Capacity of dest in UChars; must not be negative.
 * @param pFirst       If not NULL, receives the first parsed code point,
 *                     or 0xffffffff if no code point was parsed.
 * @param pErrorCode   In/out error code. Set to
 *                     U_ILLEGAL_ARGUMENT_ERROR for bad arguments,
 *                     U_PARSE_ERROR for malformed input,
 *                     U_STRING_NOT_TERMINATED_WARNING if the string exactly
 *                     fills dest, and U_BUFFER_OVERFLOW_ERROR if it does not fit.
 * @return The length of the string in numbers of UChars (the required
 *         length in the overflow case); 0 on argument or parse errors.
 */
U_CAPI int32_t U_EXPORT2
u_parseString(const char *s,
              UChar *dest, int32_t destCapacity,
              uint32_t *pFirst,
              UErrorCode *pErrorCode) {
    char *end;
    unsigned long parsed;
    uint32_t value;
    int32_t destLength;

    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }

    /* "nothing parsed yet" marker; set before validation so it is defined on every exit below */
    if(pFirst!=NULL) {
        *pFirst=0xffffffff;
    }

    if(s==NULL || destCapacity<0 || (destCapacity>0 && dest==NULL)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        /* must not fall through: s would be dereferenced and dest written to */
        return 0;
    }

    destLength=0;
    for(;;) {
        s=u_skipWhitespace(s);
        if(*s==';' || *s==0) {
            /* end of the list: terminate if possible and report the fit */
            if(destLength<destCapacity) {
                dest[destLength]=0;
            } else if(destLength==destCapacity) {
                *pErrorCode=U_STRING_NOT_TERMINATED_WARNING;
            } else {
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            }
            return destLength;
        }

        /*
         * read one code point
         * Keep the full-width result for the range check: narrowing to
         * uint32_t first would let an over-long number such as "100000041"
         * wrap around to a valid code point where long is 64 bits wide.
         */
        parsed=(unsigned long)uprv_strtoul(s, &end, 16);
        if(end<=s || (*end!=' ' && *end!='\t' && *end!=';' && *end!=0) || parsed>=0x110000) {
            *pErrorCode=U_PARSE_ERROR;
            return 0;
        }
        value=(uint32_t)parsed;

        /* store the first code point */
        if(destLength==0 && pFirst!=NULL) {
            *pFirst=value;
        }

        /* append it to the destination array, or only count it once the buffer is full */
        if((destLength+UTF_CHAR_LENGTH(value))<=destCapacity) {
            UTF_APPEND_CHAR_UNSAFE(dest, destLength, value);
        } else {
            destLength+=UTF_CHAR_LENGTH(value);
        }

        /* go to the following characters */
        s=end;
    }
}
/**
 * Invariant-character conversion of a char* string to UChars that also
 * interprets backslash escape sequences.
 *
 * Runs of ordinary characters are copied with _appendUChars(); each escape
 * is decoded with u_unescapeAt(). The output is only written while it fits
 * into dest, but the length keeps being counted.
 *
 * @param src          NUL-terminated input.
 * @param dest         Output buffer, or NULL to only compute the length.
 * @param destCapacity Capacity of dest in UChars.
 * @return The length of the unescaped string in UChars,
 *         or 0 if an escape sequence could not be parsed
 *         (dest is then set to the empty string if it has room).
 */
U_CAPI int32_t U_EXPORT2
u_unescape(const char* src, UChar* dest, int32_t destCapacity) {
    const char* runStart = src; /* first not-yet-copied literal character */
    int32_t outLen = 0;

    for (;;) {
        const char ch = *src;
        int32_t escLen;
        int32_t unitCount;
        UChar32 cp;

        if (ch == 0) {
            break;
        }

        /* '\\' is deliberately a compiler-specific character constant
         * so that it matches the compiler-specific char* constants. */
        if (ch != '\\') {
            ++src;
            continue;
        }

        /* flush the literal run that precedes the backslash */
        if (src != runStart) {
            const int32_t runLen = (int32_t)(src - runStart);
            if (dest != NULL) {
                _appendUChars(dest + outLen, destCapacity - outLen, runStart, runLen);
            }
            outLen += runLen;
        }

        /* step over the backslash and decode what follows it */
        ++src;
        escLen = 0;
        cp = u_unescapeAt(_charPtr_charAt, &escLen, uprv_strlen(src), (void*) src);
        if (escLen == 0) {
            /* malformed escape: hand back an empty string */
            if (dest != NULL && destCapacity > 0) {
                *dest = 0;
            }
            return 0;
        }
        src += escLen;

        /* write the code point if it fits, otherwise just count it */
        unitCount = UTF_CHAR_LENGTH(cp);
        if (dest != NULL && unitCount <= (destCapacity - outLen)) {
            UTF_APPEND_CHAR_UNSAFE(dest, outLen, cp);
        } else {
            outLen += unitCount;
        }

        runStart = src;
    }

    /* flush the trailing literal run, if any */
    if (src != runStart) {
        const int32_t runLen = (int32_t)(src - runStart);
        if (dest != NULL) {
            _appendUChars(dest + outLen, destCapacity - outLen, runStart, runLen);
        }
        outLen += runLen;
    }

    /* NUL-terminate only when there is room for the terminator */
    if (dest != NULL && outLen < destCapacity) {
        dest[outLen] = 0;
    }
    return outLen;
}
/**
 * Decode a BOCU-1 byte sequence to a UTF-16 string.
 * Does not check for overflows of the output array, but otherwise useful function.
 *
 * Each input byte is fed to decodeBocu1(); a result >=0 is appended to s as
 * a code point, -1 produces no output, and anything below -1 is treated as
 * an encoding error.
 *
 * @param p pointer to input BOCU-1 bytes
 * @param length number of input bytes
 * @param s pointer to output UTF-16 string array
 * @return number of UChar code units output, or -1 if an encoding error was detected
 */
static int32_t
readString(const uint8_t *p, int32_t length, UChar *s) {
    Bocu1Rx rx={ 0, 0, 0 };
    int32_t c, i, sLength;

    i=sLength=0;
    while(i<length) {
        c=decodeBocu1(&rx, p[i++]);
        if(c<-1) {
            /*
             * i was already advanced, so it is the index just past the offending byte.
             * %ld requires a long argument; passing a bare int32_t is a
             * varargs type mismatch, hence the explicit conversion.
             */
            log_err("error: readString detects encoding error at string index %ld\n", (long)i);
            return -1;
        }
        if(c>=0) {
            UTF_APPEND_CHAR_UNSAFE(s, sLength, c);
        }
    }
    return sLength;
}
/*
 * Write one run in forward (source) order.
 *
 * With UBIDI_OUTPUT_REVERSE set on ubidi_writeReordered(), RTL runs are
 * semantically written in reverse and then reversed again later. Rather
 * than reversing twice, such runs are written forward right away.
 * If the RTL run is to be mirrored, the mirroring has to happen here,
 * because the implicit second reversal must not do it.
 * Mirroring in "LTR" output looks odd, but it is only a consequence of
 * writing RTL output in reverse.
 *
 * Returns the number of UChars written, or - together with
 * U_BUFFER_OVERFLOW_ERROR - the number of UChars that would be needed.
 */
static int32_t
doWriteForward(const UChar *src, int32_t srcLength,
               UChar *dest, int32_t destSize,
               uint16_t options,
               UErrorCode *pErrorCode) {
    /* only these two option bits select the copying strategy */
    const uint16_t mode=(uint16_t)(options&(UBIDI_REMOVE_BIDI_CONTROLS|UBIDI_DO_MIRRORING));

    if(mode==0) {
        /* plain copy of the run */
        int32_t k=0;

        if(srcLength>destSize) {
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            return srcLength;
        }
        do {
            dest[k]=src[k];
        } while(++k<srcLength);
        return srcLength;
    } else if(mode==UBIDI_DO_MIRRORING) {
        /* copy code point by code point, replacing each with its mirror image */
        int32_t readPos=0, writePos=0;
        UChar32 cp;

        if(srcLength>destSize) {
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            return srcLength;
        }
        do {
            UTF_NEXT_CHAR(src, readPos, srcLength, cp);
            cp=u_charMirror(cp);
            UTF_APPEND_CHAR_UNSAFE(dest, writePos, cp);
        } while(readPos<srcLength);
        return srcLength;
    } else if(mode==UBIDI_REMOVE_BIDI_CONTROLS) {
        /* copy code units, dropping BiDi control characters */
        int32_t needed=0; /* output units counted so far */
        UChar unit;

        do {
            unit=*src++;
            if(IS_BIDI_CONTROL_CHAR(unit)) {
                continue;
            }
            if(++needed>destSize) {
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

                /* stop writing; only count what the rest of the run needs */
                while(--srcLength>0) {
                    unit=*src++;
                    if(!IS_BIDI_CONTROL_CHAR(unit)) {
                        ++needed;
                    }
                }
                return needed;
            }
            *dest++=unit;
        } while(--srcLength>0);
        return needed;
    } else {
        /* both options: drop BiDi controls and mirror everything else */
        int32_t needed=0;   /* source units of all kept code points */
        int32_t written=0;  /* write index into dest */
        int32_t cpLength;
        UChar32 cp;

        do {
            /* consume exactly one code point from the front of the run */
            cpLength=0;
            UTF_NEXT_CHAR(src, cpLength, srcLength, cp);
            src+=cpLength;
            srcLength-=cpLength;

            if(IS_BIDI_CONTROL_CHAR(cp)) {
                continue;
            }

            needed+=cpLength;
            if(needed>destSize) {
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

                /* stop writing; count the remaining code units one by one */
                while(srcLength>0) {
                    cp=*src++;
                    if(!IS_BIDI_CONTROL_CHAR(cp)) {
                        ++needed;
                    }
                    --srcLength;
                }
                return needed;
            }

            cp=u_charMirror(cp);
            UTF_APPEND_CHAR_UNSAFE(dest, written, cp);
        } while(srcLength>0);
        return written;
    }
}
/*
 * Write one RTL run to dest in reverse order.
 *
 * The run is reversed by code points, not by code units, so that Unicode
 * characters stay intact: the source is read backwards, all code units of
 * one code point (optionally together with the combining characters that
 * follow it) are collected, and that group is copied to the destination
 * in ascending order.
 *
 * The options select whether combining characters stay behind their base
 * characters, whether BiDi control characters are dropped, and whether
 * characters are replaced by their mirror-image equivalents.
 *
 * Returns the output length of the run; with U_BUFFER_OVERFLOW_ERROR this
 * is the length that would be needed.
 */
static int32_t
doWriteReverse(const UChar *src, int32_t srcLength,
               UChar *dest, int32_t destSize,
               uint16_t options,
               UErrorCode *pErrorCode) {
    const uint16_t mode=(uint16_t)(options&(UBIDI_REMOVE_BIDI_CONTROLS|UBIDI_DO_MIRRORING|UBIDI_KEEP_BASE_COMBINING));
    int32_t segEnd;     /* index just past the last code unit of the current segment */
    int32_t pos;        /* running copy index inside the current segment */
    int32_t outLength;  /* length of the output for this run */
    UChar32 cp;

    if(mode==0) {
        /*
         * None of the "complicated" options: same length as the source,
         * no mirroring, no grouping of combining characters.
         */
        if(srcLength>destSize) {
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            return srcLength;
        }
        outLength=srcLength;

        do {
            segEnd=srcLength;

            /* step back over one code point */
            UTF_BACK_1(src, 0, srcLength);

            /* copy its code units in ascending order */
            pos=srcLength;
            do {
                *dest=src[pos];
                ++dest;
                ++pos;
            } while(pos<segEnd);
        } while(srcLength>0);
        return outLength;
    }

    if(mode==UBIDI_KEEP_BASE_COMBINING) {
        /*
         * Still the same length as the source and no mirroring, but
         * combining characters must stay after their base character.
         */
        if(srcLength>destSize) {
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            return srcLength;
        }
        outLength=srcLength;

        do {
            segEnd=srcLength;

            /* step back until a non-combining character (or the run start) is reached */
            for(;;) {
                UTF_PREV_CHAR(src, 0, srcLength, cp);
                if(srcLength<=0 || !IS_COMBINING(u_charType(cp))) {
                    break;
                }
            }

            /* copy this "user character" */
            pos=srcLength;
            do {
                *dest=src[pos];
                ++dest;
                ++pos;
            } while(pos<segEnd);
        } while(srcLength>0);
        return outLength;
    }

    /*
     * General and slowest path for any other option combination:
     * mirror, remove BiDi controls and keep combining characters with
     * their base characters, each as requested.
     */
    {
        const int keepCombining=(options&UBIDI_KEEP_BASE_COMBINING)!=0;
        const int removeControls=(options&UBIDI_REMOVE_BIDI_CONTROLS)!=0;
        const int doMirroring=(options&UBIDI_DO_MIRRORING)!=0;

        if(!removeControls) {
            outLength=srcLength;
        } else {
            /* the output excludes BiDi controls, so count what remains first */
            int32_t left=srcLength;
            UChar unit;

            outLength=0;
            do {
                unit=*src++;
                if(!IS_BIDI_CONTROL_CHAR(unit)) {
                    ++outLength;
                }
            } while(--left>0);
            src-=srcLength;
        }

        if(outLength>destSize) {
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            return outLength;
        }

        do {
            segEnd=srcLength;

            /* step back over one code point ... */
            UTF_PREV_CHAR(src, 0, srcLength, cp);
            if(keepCombining) {
                /* ... and further back to the base character of any combining sequence */
                while(srcLength>0 && IS_COMBINING(u_charType(cp))) {
                    UTF_PREV_CHAR(src, 0, srcLength, cp);
                }
            }

            /* a BiDi control character is dropped instead of copied */
            if(!(removeControls && IS_BIDI_CONTROL_CHAR(cp))) {
                pos=srcLength;
                if(doMirroring) {
                    /* mirror only the base character, then skip its source units */
                    int32_t mirroredLength=0;
                    cp=u_charMirror(cp);
                    UTF_APPEND_CHAR_UNSAFE(dest, mirroredLength, cp);
                    dest+=mirroredLength;
                    pos+=mirroredLength;
                }
                /* copy the rest of this "user character" unchanged */
                while(pos<segEnd) {
                    *dest=src[pos];
                    ++dest;
                    ++pos;
                }
            }
        } while(srcLength>0);
    }

    return outLength;
}