/*
 * Order two mappings by their Unicode code point sequences.
 *
 * The sequences are compared element by element; if one is a prefix of the
 * other, the shorter one sorts first.
 *
 * Returns a negative value, zero, or a positive value when l sorts before,
 * equal to, or after r.
 */
static int32_t
compareUnicode(UCMTable *lTable, const UCMapping *l,
               UCMTable *rTable, const UCMapping *r) {
    const UChar32 *left, *right;
    int32_t common, k;

    /* shortcut: both sides hold exactly one code point, stored inline in u */
    if(l->uLen==1 && r->uLen==1) {
        return l->u-r->u;
    }

    left=UCM_GET_CODE_POINTS(lTable, l);
    right=UCM_GET_CODE_POINTS(rTable, r);

    /* only the part that both sequences have can be compared pairwise */
    common= l->uLen<=r->uLen ? l->uLen : r->uLen;

    for(k=0; k<common; ++k) {
        if(left[k]!=right[k]) {
            return left[k]-right[k];
        }
    }

    /* identical common part: the length decides */
    return l->uLen-r->uLen;
}
/*
 * Take every mapping with a non-zero moveFlag out of the base table.
 *
 * A flagged mapping that carries UCM_MOVE_TO_EXT is first added to ext
 * (if ext is not NULL); any other flagged mapping is simply dropped.
 * A removed entry is replaced by the current last entry, so the order of the
 * base table is not preserved and base->isSorted is cleared.
 */
U_CAPI void U_EXPORT2
ucm_moveMappings(UCMTable *base, UCMTable *ext) {
    UCMapping *cur, *end;
    int8_t pending;

    cur=base->mappings;
    end=cur+base->mappingsLength;

    while(cur<end) {
        pending=cur->moveFlag;
        if(pending==0) {
            /* nothing to do for this mapping */
            ++cur;
            continue;
        }

        /* the flag has been consumed */
        cur->moveFlag=0;

        if(ext!=NULL && (pending&UCM_MOVE_TO_EXT)) {
            ucm_addMapping(ext, cur, UCM_GET_CODE_POINTS(base, cur), UCM_GET_BYTES(base, cur));
        }

        /*
         * shrink the table by one and fill the hole with the former last
         * entry; cur is not advanced so that the moved-in entry is
         * examined next
         */
        --end;
        if(cur<end) {
            uprv_memcpy(cur, end, sizeof(UCMapping));
        }
        --base->mappingsLength;
        base->isSorted=FALSE;
    }
}
/*
 * Classify every mapping of ucm->base and split the table accordingly.
 *
 * - For SI/SO-stateful tables (isSISO), single-byte mappings for 0x0e and
 *   0x0f are reported as a warning and flagged UCM_REMOVE_MAPPING.
 * - Otherwise ucm_mappingType() decides: a negative type is an illegal byte
 *   sequence (printed to stderr, the function will return FALSE), a positive
 *   type flags the mapping with UCM_MOVE_TO_EXT.
 *
 * If any mapping was illegal, FALSE is returned without moving anything.
 * If mappings were flagged, they are moved with ucm_moveMappings() and the
 * result of ucm_checkBaseExt() is returned; otherwise the base table is
 * sorted and TRUE is returned.
 */
U_CAPI UBool U_EXPORT2
ucm_separateMappings(UCMFile *ucm, UBool isSISO) {
    UCMTable *baseTable;
    UCMapping *cur, *end;
    int32_t kind;
    UBool anyFlagged, allLegal;

    baseTable=ucm->base;
    cur=baseTable->mappings;
    end=cur+baseTable->mappingsLength;

    anyFlagged=FALSE;
    allLegal=TRUE;

    for(; cur<end; ++cur) {
        if(isSISO && cur->bLen==1 && (cur->b.bytes[0]==0xe || cur->b.bytes[0]==0xf)) {
            /* SI and SO themselves must not be mapped in an SI/SO table */
            fprintf(stderr, "warning: removing illegal mapping from an SI/SO-stateful table\n");
            ucm_printMapping(baseTable, cur, stderr);
            cur->moveFlag|=UCM_REMOVE_MAPPING;
            anyFlagged=TRUE;
        } else {
            kind=ucm_mappingType(
                    &ucm->states, cur,
                    UCM_GET_CODE_POINTS(baseTable, cur), UCM_GET_BYTES(baseTable, cur));
            if(kind<0) {
                /* illegal byte sequence: report it and keep scanning */
                printMapping(cur, UCM_GET_CODE_POINTS(baseTable, cur), UCM_GET_BYTES(baseTable, cur), stderr);
                allLegal=FALSE;
            } else if(kind>0) {
                cur->moveFlag|=UCM_MOVE_TO_EXT;
                anyFlagged=TRUE;
            }
        }
    }

    if(!allLegal) {
        return FALSE;
    }
    if(!anyFlagged) {
        ucm_sortTable(ucm->base);
        return TRUE;
    }

    ucm_moveMappings(ucm->base, ucm->ext);
    return ucm_checkBaseExt(&ucm->states, ucm->base, ucm->ext, ucm->ext, FALSE);
}
/*
 * Build the toUnicode result word for mapping m and update the toUnicode
 * statistics in extData.
 *
 * - Single code point result: the value is UCNV_EXT_TO_U_MIN_CODE_POINT+m->u.
 * - Multi-code-point result: the code points are converted to a 16-bit
 *   string that is appended to extData->toUUChars; the value holds the
 *   string's length in 16-bit units (biased by UCNV_EXT_TO_U_LENGTH_OFFSET
 *   and shifted by UCNV_EXT_TO_U_LENGTH_SHIFT) together with the string's
 *   start index in toUUChars.
 *
 * UCNV_EXT_TO_U_ROUNDTRIP_FLAG is set when m->f==0.
 *
 * Side effects: may grow extData->toUUChars; updates maxInBytes,
 * maxOutUChars and maxUCharsPerByte. Calls exit() with the ICU error code if
 * u_strFromUTF32() fails with anything other than U_BUFFER_OVERFLOW_ERROR.
 *
 * Returns the encoded result word.
 */
static uint32_t
getToUnicodeValue(CnvExtData *extData, UCMTable *table, UCMapping *m) {
    UChar32 *u32;
    UChar *u;
    uint32_t value;
    int32_t u16Length, ratio;
    UErrorCode errorCode;

    /* write the Unicode result code point or string index */
    if(m->uLen==1) {
        u16Length=U16_LENGTH(m->u);
        value=(uint32_t)(UCNV_EXT_TO_U_MIN_CODE_POINT+m->u);
    } else {
        /* the parser enforces m->uLen<=UCNV_EXT_MAX_UCHARS */

        /* get the result code point string and its 16-bit string length */
        u32=UCM_GET_CODE_POINTS(table, m);
        errorCode=U_ZERO_ERROR;
        /* length-only call: no output buffer, so a buffer overflow status is tolerated */
        u_strFromUTF32(NULL, 0, &u16Length, u32, m->uLen, &errorCode);
        if(U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) {
            exit(errorCode);
        }

        /*
         * allocate it and put its length and index into the value
         *
         * The length must be the number of 16-bit units actually written to
         * toUUChars (u16Length), not the number of code points (m->uLen):
         * the two differ as soon as the result contains a supplementary
         * code point, and the index refers to the 16-bit string.
         */
        value=
            (((uint32_t)u16Length+UCNV_EXT_TO_U_LENGTH_OFFSET)<<UCNV_EXT_TO_U_LENGTH_SHIFT)|
            ((uint32_t)utm_countItems(extData->toUUChars));
        u=utm_allocN(extData->toUUChars, u16Length);

        /* write the result 16-bit string */
        errorCode=U_ZERO_ERROR;
        u_strFromUTF32(u, u16Length, NULL, u32, m->uLen, &errorCode);
        if(U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) {
            exit(errorCode);
        }
    }
    if(m->f==0) {
        value|=UCNV_EXT_TO_U_ROUNDTRIP_FLAG;
    }

    /* update statistics */
    if(m->bLen>extData->maxInBytes) {
        extData->maxInBytes=m->bLen;
    }
    if(u16Length>extData->maxOutUChars) {
        extData->maxOutUChars=u16Length;
    }

    /* 16-bit units per input byte, rounded up */
    ratio=(u16Length+(m->bLen-1))/m->bLen;
    if(ratio>extData->maxUCharsPerByte) {
        extData->maxUCharsPerByte=ratio;
    }

    return value;
}
/*
 * Prepare a sorted table for generating the fromUnicode extension data.
 *
 * - Non-negative flags are reduced to their MBCS_FROM_U_EXT_MASK bits and
 *   written back to the mapping.
 * - Only mappings with flag 0, 1, 4, or flag 2 with a single byte
 *   (<subchar1> SUB mappings) are relevant; their indexes are written, in
 *   table order, to the start of table->reverseMap. Everything else
 *   (for example toUnicode fallbacks) is left out.
 * - For relevant mappings with more than one code point, the code point
 *   array is rewritten in place: the first code point stays as it is and the
 *   following ones are re-encoded as 16-bit units directly after it.
 *   m->uLen is then set to the resulting 16-bit index, which always counts
 *   the first code point as 2 units.
 *
 * The table must be sorted. Previous contents of the reverseMap are lost.
 *
 * Returns the number of indexes written to the reverseMap.
 */
static int32_t
prepareFromUMappings(UCMTable *table) {
    UCMapping *m;
    int32_t *map;
    int32_t src, kept, count;
    int8_t flag;

    map=table->reverseMap;
    count=table->mappingsLength;

    /*
     * walk the mappings array directly rather than via the map:
     * the mappings are already in lexical order
     */
    kept=0;
    for(src=0, m=table->mappings; src<count; ++src, ++m) {
        flag=m->f;
        if(flag>=0) {
            flag&=MBCS_FROM_U_EXT_MASK;
            m->f=flag;
        }

        if(!(flag==0 || flag==1 || (flag==2 && m->bLen==1) || flag==4)) {
            /* irrelevant for the fromUnicode table */
            continue;
        }
        map[kept++]=src;

        if(m->uLen>1) {
            UChar32 *cp;
            UChar *units;
            int32_t in, out;

            cp=UCM_GET_CODE_POINTS(table, m);
            units=(UChar *)cp; /* same storage, viewed as 16-bit units */

            /*
             * 16-bit index 2 is where the second 32-bit element begins;
             * each code point is read before the units for it are written
             */
            out=2;
            for(in=1; in<m->uLen; ++in) {
                UChar32 c=cp[in];
                U16_APPEND_UNSAFE(units, out, c);
            }

            m->uLen=(int8_t)out;
        }
    }

    return kept;
}
/*
 * Print mapping m of the given table to the stream f.
 *
 * Looks up the mapping's code points and bytes via the table and hands
 * them to printMapping().
 */
U_CAPI void U_EXPORT2
ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f) {
    uint8_t *bytes;

    bytes=UCM_GET_BYTES(table, m);
    printMapping(m, UCM_GET_CODE_POINTS(table, m), bytes, f);
}
/*
 * Merge a toUnicode table into a fromUnicode table and set the fallback
 * flags of the mappings.
 *
 * Both tables are sorted first and then walked in parallel using
 * compareMappings():
 * - a mapping present in both tables is a roundtrip and keeps its flag;
 * - a mapping only in fromUTable becomes a SUB mapping (f=2) if its bytes
 *   equal subchar (subcharLength bytes) or, when subchar1 is not 0, the
 *   single byte subchar1; otherwise it becomes a normal fallback (f=1);
 * - a mapping only in toUTable becomes a reverse fallback (f=3) and is
 *   appended to fromUTable, unless it maps to the single code point U+FFFD
 *   or U+001A, in which case it is ignored.
 *
 * Only the mappings that were in fromUTable on entry take part in the walk;
 * appended mappings are not revisited. fromUTable->isSorted is cleared.
 */
U_CAPI void U_EXPORT2
ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable,
                const uint8_t *subchar, int32_t subcharLength,
                uint8_t subchar1) {
    UCMapping *fromU, *toU;
    int32_t fi, ti, fromUCount, toUCount, order;
    int isSub;

    ucm_sortTable(fromUTable);
    ucm_sortTable(toUTable);

    /* appended mappings must not extend the walk, so fix the counts now */
    fromUCount=fromUTable->mappingsLength;
    toUCount=toUTable->mappingsLength;

    fi=ti=0;
    while(fi<fromUCount || ti<toUCount) {
        /*
         * once one side is exhausted, everything left on the other side
         * is handled like an unmatched mapping
         */
        if(fi<fromUCount && ti<toUCount) {
            order=compareMappings(
                    fromUTable, fromUTable->mappings+fi,
                    toUTable, toUTable->mappings+ti, TRUE);
        } else if(fi<fromUCount) {
            order=-1;
        } else {
            order=1;
        }

        if(order==0) {
            /* roundtrip: flags are initially 0, nothing to change */
            ++fi;
            ++ti;
        } else if(order<0) {
            /* fromU mapping without toU counterpart: fallback Unicode->codepage */

            /* fetch through the table each time: it may have been reallocated */
            fromU=fromUTable->mappings+fi;

            isSub=
                (fromU->bLen==subcharLength &&
                 0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromU), subchar, subcharLength)) ||
                (subchar1!=0 && fromU->bLen==1 && fromU->b.bytes[0]==subchar1);
            if(isSub) {
                fromU->f=2; /* SUB mapping */
            } else {
                fromU->f=1; /* normal fallback */
            }

            ++fi;
        } else {
            /* toU mapping without fromU counterpart: reverse fallback codepage->Unicode */
            toU=toUTable->mappings+ti;

            /* reverse fallbacks to the Unicode SUB characters are dropped */
            if(!(toU->uLen==1 && (toU->u==0xfffd || toU->u==0x1a))) {
                toU->f=3; /* reverse fallback */
                ucm_addMapping(fromUTable, toU, UCM_GET_CODE_POINTS(toUTable, toU), UCM_GET_BYTES(toUTable, toU));
            }

            ++ti;
        }
    }

    fromUTable->isSorted=FALSE;
}
/*
 * Reconcile the byte (input) sequences of a base table with those of an
 * extension table.
 *
 * Both tables are walked in parallel through their reverseMap index arrays,
 * using compareBytes() for ordering. Only mappings with f==0 or f==3 are
 * considered; with intersectBase==2, single-byte base mappings are skipped
 * as well (base compared against a DBCS extension).
 *
 * For a base mapping that sorts before the current extension mapping:
 * - with intersectBase, it is flagged UCM_MOVE_TO_EXT;
 * - otherwise, if its bytes are a proper prefix of the extension mapping's
 *   bytes (never the case for a single byte in an SI/SO table), it is
 *   flagged UCM_MOVE_TO_EXT when moveToExt is set, or reported as an error.
 *
 * For a base mapping with the same bytes as the extension mapping:
 * - same flag and same code points: the extension mapping is flagged
 *   UCM_REMOVE_MAPPING;
 * - otherwise, with intersectBase the base mapping is flagged
 *   UCM_MOVE_TO_EXT, else an error is reported.
 *
 * Returns a combination of NEEDS_MOVE and HAS_ERRORS (0 if neither applies).
 */
static uint8_t
checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
                  UBool moveToExt, UBool intersectBase) {
    UCMapping *mb, *me;
    int32_t *baseOrder, *extOrder;
    int32_t bi, ei, baseCount, extCount, order;
    uint8_t status;
    UBool isSISO;

    baseOrder=base->reverseMap;
    extOrder=ext->reverseMap;

    baseCount=base->mappingsLength;
    extCount=ext->mappingsLength;
    bi=ei=0;

    status=0;

    isSISO=(UBool)(baseStates->outputType==MBCS_OUTPUT_2_SISO);

    for(;;) {
        /* advance to the next relevant base mapping */
        for(; bi!=baseCount; ++bi) {
            mb=base->mappings+baseOrder[bi];

            if(intersectBase==2 && mb->bLen==1) {
                /* base against a DBCS extension: SBCS base mappings stay put */
                continue;
            }
            if(mb->f==0 || mb->f==3) {
                break;
            }
        }
        if(bi==baseCount) {
            return status;
        }

        /* advance to the next relevant extension mapping */
        for(; ei!=extCount; ++ei) {
            me=ext->mappings+extOrder[ei];

            if(me->f==0 || me->f==3) {
                break;
            }
        }
        if(ei==extCount) {
            return status;
        }

        order=compareBytes(base, mb, ext, me, TRUE);
        if(order>0) {
            /* the extension mapping sorts first: nothing to check for it */
            ++ei;
            continue;
        }

        if(order<0) {
            if(intersectBase) {
                /* mapping in base but not in ext, move it */
                mb->moveFlag|=UCM_MOVE_TO_EXT;
                status|=NEEDS_MOVE;
            } else if(
                /*
                 * is mb's input a proper prefix of me's input?
                 * in SI/SO tables a single byte lives in its own
                 * single-byte state and is therefore never a prefix
                 */
                mb->bLen<me->bLen &&
                (!isSISO || mb->bLen>1) &&
                0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
            ) {
                if(moveToExt) {
                    /* resolve the conflict by moving the base mapping */
                    mb->moveFlag|=UCM_MOVE_TO_EXT;
                    status|=NEEDS_MOVE;
                } else {
                    fprintf(stderr,
                            "ucm error: the base table contains a mapping whose input sequence\n"
                            " is a prefix of the input sequence of an extension mapping\n");
                    ucm_printMapping(base, mb, stderr);
                    ucm_printMapping(ext, me, stderr);
                    status|=HAS_ERRORS;
                }
            }
        } else /* order==0 */ {
            if( mb->f==me->f && mb->uLen==me->uLen &&
                0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
            ) {
                /* exact duplicate: the extension copy is redundant */
                me->moveFlag|=UCM_REMOVE_MAPPING;
                status|=NEEDS_MOVE;
            } else if(intersectBase) {
                /* mapping in base but not in ext, move it */
                mb->moveFlag|=UCM_MOVE_TO_EXT;
                status|=NEEDS_MOVE;
            } else {
                fprintf(stderr,
                        "ucm error: the base table contains a mapping whose input sequence\n"
                        " is the same as the input sequence of an extension mapping\n"
                        " but it maps differently\n");
                ucm_printMapping(base, mb, stderr);
                ucm_printMapping(ext, me, stderr);
                status|=HAS_ERRORS;
            }
        }

        /* in both remaining cases the base mapping has been dealt with */
        ++bi;
    }
}
/*
 * Reconcile the Unicode (input) sequences of a base table with those of an
 * extension table.
 *
 * Both mappings arrays are walked in parallel in array order, using
 * compareUnicode() for ordering. Only mappings with f in 0..2 or f==4 are
 * considered. baseStates is not used.
 *
 * For a base mapping that sorts before the current extension mapping:
 * - with intersectBase (and, if intersectBase==2, only for multi-byte
 *   mappings), it is flagged UCM_MOVE_TO_EXT;
 * - otherwise, if its code points are a proper prefix of the extension
 *   mapping's code points, it is flagged UCM_MOVE_TO_EXT when moveToExt is
 *   set, or reported as an error.
 *
 * For a base mapping with the same code points as the extension mapping:
 * - same flag and same bytes: the extension mapping is flagged
 *   UCM_REMOVE_MAPPING;
 * - otherwise, with intersectBase the base mapping is flagged
 *   UCM_MOVE_TO_EXT, else an error is reported.
 *
 * Returns a combination of NEEDS_MOVE and HAS_ERRORS (0 if neither applies).
 */
static uint8_t
checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
                    UBool moveToExt, UBool intersectBase) {
    (void)baseStates;

    UCMapping *mb, *me, *baseEnd, *extEnd;
    int32_t order;
    uint8_t status;

    mb=base->mappings;
    baseEnd=mb+base->mappingsLength;

    me=ext->mappings;
    extEnd=me+ext->mappingsLength;

    status=0;

    for(;;) {
        /* advance to the next relevant base mapping */
        for(; mb!=baseEnd; ++mb) {
            if((0<=mb->f && mb->f<=2) || mb->f==4) {
                break;
            }
        }
        if(mb==baseEnd) {
            return status;
        }

        /* advance to the next relevant extension mapping */
        for(; me!=extEnd; ++me) {
            if((0<=me->f && me->f<=2) || me->f==4) {
                break;
            }
        }
        if(me==extEnd) {
            return status;
        }

        order=compareUnicode(base, mb, ext, me);
        if(order>0) {
            /* the extension mapping sorts first: nothing to check for it */
            ++me;
            continue;
        }

        if(order<0) {
            if(intersectBase && (intersectBase!=2 || mb->bLen>1)) {
                /*
                 * mapping in base but not in ext, move it
                 *
                 * for a DBCS ext, only DBCS mappings are moved here;
                 * SBCS ones go through the prefix check below
                 */
                mb->moveFlag|=UCM_MOVE_TO_EXT;
                status|=NEEDS_MOVE;
            } else if(
                /* is mb's input a proper prefix of me's input? */
                mb->uLen<me->uLen &&
                0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
            ) {
                if(moveToExt) {
                    /* resolve the conflict by moving the base mapping */
                    mb->moveFlag|=UCM_MOVE_TO_EXT;
                    status|=NEEDS_MOVE;
                } else {
                    fprintf(stderr,
                            "ucm error: the base table contains a mapping whose input sequence\n"
                            " is a prefix of the input sequence of an extension mapping\n");
                    ucm_printMapping(base, mb, stderr);
                    ucm_printMapping(ext, me, stderr);
                    status|=HAS_ERRORS;
                }
            }
        } else /* order==0 */ {
            if( mb->f==me->f && mb->bLen==me->bLen &&
                0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
            ) {
                /* exact duplicate: the extension copy is redundant */
                me->moveFlag|=UCM_REMOVE_MAPPING;
                status|=NEEDS_MOVE;
            } else if(intersectBase) {
                /* mapping in base but not in ext, move it */
                mb->moveFlag|=UCM_MOVE_TO_EXT;
                status|=NEEDS_MOVE;
            } else {
                fprintf(stderr,
                        "ucm error: the base table contains a mapping whose input sequence\n"
                        " is the same as the input sequence of an extension mapping\n"
                        " but it maps differently\n");
                ucm_printMapping(base, mb, stderr);
                ucm_printMapping(ext, me, stderr);
                status|=HAS_ERRORS;
            }
        }

        /* in both remaining cases the base mapping has been dealt with */
        ++mb;
    }
}
/*
 * Recursively write one section of the fromUnicode table for the mappings
 * map[start..limit[ at 16-bit input unit position unitIndex.
 *
 * Works like generateToUTable(), except that a section consists of two
 * parallel arrays (input UChars in extData->fromUTableUChars, result values
 * in extData->fromUTableValues), and that sections are always stored in
 * compact form for access via binary search.
 *
 * Section layout: one header entry (number of items / defaultValue)
 * followed by one entry per distinct input unit. Each item's value is
 * either the result word for an input sequence that ends at this unit or
 * the index of the subsection that continues it.
 *
 * Returns FALSE (after printing an error to stderr) if two mappings have
 * the same input sequence, otherwise TRUE.
 */
static UBool
generateFromUTable(CnvExtData *extData, UCMTable *table,
                   int32_t start, int32_t limit, int32_t unitIndex,
                   uint32_t defaultValue) {
    UCMapping *mappings, *m;
    int32_t *map;
    int32_t row, slot, slotCount, subStart, subLimit;

    UChar *units;
    UChar32 firstUnit, unit, lastUnit;

    UChar *sectionUChars;
    uint32_t *sectionValues;

    mappings=table->mappings;
    map=table->reverseMap;

    /* pass 1: count the distinct units at unitIndex (equal units are adjacent) */
    m=mappings+map[start];
    units=(UChar *)UCM_GET_CODE_POINTS(table, m);
    firstUnit=units[unitIndex];
    lastUnit=firstUnit;
    slotCount=1;

    for(row=start+1; row<limit; ++row) {
        m=mappings+map[row];
        units=(UChar *)UCM_GET_CODE_POINTS(table, m);
        unit=units[unitIndex];

        if(unit!=lastUnit) {
            lastUnit=unit;
            ++slotCount;
        }
    }

    /* pass 2: allocate header + items in both arrays and write the header */
    sectionUChars=(UChar *)utm_allocN(extData->fromUTableUChars, 1+slotCount);
    sectionValues=(uint32_t *)utm_allocN(extData->fromUTableValues, 1+slotCount);

    sectionUChars[0]=(UChar)slotCount;
    sectionValues[0]=defaultValue;

    /* from here on, index 0 is the first item rather than the header */
    ++sectionUChars;
    ++sectionValues;

    /*
     * pass 3: write each distinct unit and, temporarily, the row at which
     * its run of mappings starts
     */
    lastUnit=firstUnit-1; /* differs from firstUnit so that the first row opens a slot */
    slot=0;
    for(row=start; row<limit; ++row) {
        m=mappings+map[row];
        units=(UChar *)UCM_GET_CODE_POINTS(table, m);
        unit=units[unitIndex];

        if(unit!=lastUnit) {
            lastUnit=unit;

            sectionUChars[slot]=(UChar)unit;
            sectionValues[slot]=(uint32_t)row;
            ++slot;
        }
    }
    /* assert(slot==slotCount) */

    /* pass 4: replace the temporary row numbers with results or subsection indexes */
    subLimit=(int32_t)(sectionValues[0]);
    for(slot=0; slot<slotCount; ++slot) {
        /* rows [subStart, subLimit[ share the unit of this slot */
        subStart=subLimit;
        if((slot+1)<slotCount) {
            /* read the next start before this slot's value is overwritten */
            subLimit=(int32_t)(sectionValues[slot+1]);
        } else {
            subLimit=limit;
        }

        /* is there a mapping whose input sequence ends exactly at this unit? */
        defaultValue=0;
        m=mappings+map[subStart];
        if(m->uLen==unitIndex+1) {
            /* it is handled here and not passed down into the recursion */
            ++subStart;

            if(subStart<subLimit && mappings[map[subStart]].uLen==unitIndex+1) {
                /* a second mapping with the very same input sequence */
                fprintf(stderr, "error: multiple mappings from same Unicode code points\n");
                ucm_printMapping(table, m, stderr);
                ucm_printMapping(table, mappings+map[subStart], stderr);
                return FALSE;
            }

            defaultValue=getFromUBytesValue(extData, table, m);
        }

        if(subStart==subLimit) {
            /* no longer sequences: store the result itself */
            sectionValues[slot]=defaultValue;
        } else {
            /* longer sequences follow: store where their subsection will start */
            sectionValues[slot]=(uint32_t)utm_countItems(extData->fromUTableValues);

            if(!generateFromUTable(extData, table, subStart, subLimit, unitIndex+1, defaultValue)) {
                return FALSE;
            }
        }
    }
    return TRUE;
}
/*
 * Build the fromUnicode result word for mapping m and update the
 * fromUnicode statistics in extData.
 *
 * - m->f==2: returns UCNV_EXT_FROM_U_SUBCHAR1 immediately, without touching
 *   the statistics.
 * - 1..3 result bytes: the bytes are stored directly in the low bits of the
 *   value, first byte most significant.
 * - any other length: the bytes are appended to extData->fromUBytes and the
 *   value holds their start index.
 *
 * The byte length is stored at UCNV_EXT_FROM_U_LENGTH_SHIFT.
 * UCNV_EXT_FROM_U_ROUNDTRIP_FLAG is set for m->f==0, and
 * UCNV_EXT_FROM_U_GOOD_ONE_WAY_FLAG for m->f==4 (when the header defines it).
 *
 * Side effects: may grow extData->fromUBytes; updates maxInUChars,
 * maxOutBytes and maxBytesPerUChar.
 *
 * Returns the encoded result word.
 */
static uint32_t
getFromUBytesValue(CnvExtData *extData, UCMTable *table, UCMapping *m) {
    uint8_t *bytes, *resultBytes;
    uint32_t value;
    int32_t u16Length, ratio;

    if(m->f==2) {
        /*
         * no mapping, <subchar1> preferred
         *
         * no need to count in statistics because the subchars are already
         * counted for maxOutBytes and maxBytesPerUChar in UConverterStaticData,
         * and this non-mapping does not count for maxInUChars which are always
         * trivially at least two if counting unmappable supplementary code points
         */
        return UCNV_EXT_FROM_U_SUBCHAR1;
    }

    bytes=UCM_GET_BYTES(table, m);
    value=0;
    switch(m->bLen) {
        /* 1..3: store the bytes in the value word */
    case 3:
        value=((uint32_t)*bytes++)<<16;
        /* fall through */
    case 2:
        value|=((uint32_t)*bytes++)<<8;
        /* fall through */
    case 1:
        value|=*bytes;
        break;
    default:
        /* the parser enforces m->bLen<=UCNV_EXT_MAX_BYTES */
        /* store the bytes in fromUBytes[] and the index in the value word */
        value=(uint32_t)utm_countItems(extData->fromUBytes);
        resultBytes=utm_allocN(extData->fromUBytes, m->bLen);
        uprv_memcpy(resultBytes, bytes, m->bLen);
        break;
    }
    value|=(uint32_t)m->bLen<<UCNV_EXT_FROM_U_LENGTH_SHIFT;
    if(m->f==0) {
        value|=UCNV_EXT_FROM_U_ROUNDTRIP_FLAG;
    }
#ifdef UCNV_EXT_FROM_U_GOOD_ONE_WAY_FLAG
    /*
     * prepareFromUMappings() keeps flag-4 mappings as a category of their
     * own; without a status bit they would be written exactly like flag-1
     * fallbacks and could not be told apart in the generated table.
     * Guarded because the flag's definition is not visible from this file;
     * NOTE(review): confirm that the included ucnv_ext.h defines it.
     */
    else if(m->f==4) {
        value|=UCNV_EXT_FROM_U_GOOD_ONE_WAY_FLAG;
    }
#endif

    /* calculate the real UTF-16 length (see recoding in prepareFromUMappings()) */
    if(m->uLen==1) {
        u16Length=U16_LENGTH(m->u);
    } else {
        /* recoded form: uLen counts the first code point as 2 units */
        u16Length=U16_LENGTH(UCM_GET_CODE_POINTS(table, m)[0])+(m->uLen-2);
    }

    /* update statistics */
    if(u16Length>extData->maxInUChars) {
        extData->maxInUChars=u16Length;
    }
    if(m->bLen>extData->maxOutBytes) {
        extData->maxOutBytes=m->bLen;
    }

    /* output bytes per input 16-bit unit, rounded up */
    ratio=(m->bLen+(u16Length-1))/u16Length;
    if(ratio>extData->maxBytesPerUChar) {
        extData->maxBytesPerUChar=ratio;
    }

    return value;
}