/*
 * Take every mapping whose moveFlag is non-zero out of the base table.
 *
 * A flagged mapping that carries UCM_MOVE_TO_EXT is first appended to the
 * extension table, provided that ext is not NULL; any other flagged mapping
 * is simply dropped. Removal is done by copying the last base mapping over
 * the removed slot, so the base table is marked as unsorted whenever
 * anything is removed.
 *
 * @param base table to remove flagged mappings from
 * @param ext  table that receives UCM_MOVE_TO_EXT mappings; may be NULL
 */
U_CAPI void U_EXPORT2
ucm_moveMappings(UCMTable *base, UCMTable *ext) {
    UCMapping *cur=base->mappings;
    UCMapping *end=cur+base->mappingsLength;

    while(cur<end) {
        int8_t moveFlag=cur->moveFlag;

        if(moveFlag==0) {
            /* this mapping stays where it is */
            ++cur;
            continue;
        }

        /* clear the flag before the mapping is copied anywhere */
        cur->moveFlag=0;

        if(ext!=NULL && (moveFlag&UCM_MOVE_TO_EXT)) {
            ucm_addMapping(ext, cur, UCM_GET_CODE_POINTS(base, cur), UCM_GET_BYTES(base, cur));
        }

        /*
         * Shrink the table by one and fill the hole with the former last
         * mapping. cur is not advanced so that the mapping that was just
         * moved into this slot is examined on the next iteration.
         */
        --end;
        if(cur<end) {
            uprv_memcpy(cur, end, sizeof(UCMapping));
        }
        --base->mappingsLength;
        base->isSorted=FALSE;
    }
}
/*
 * Walk the base table of a UCMFile and decide, per mapping, whether it stays
 * in the base table, moves to the extension table, or is removed.
 *
 * - For an SI/SO-stateful table (isSISO), single-byte mappings for 0x0e or
 *   0x0f are reported as a warning and flagged with UCM_REMOVE_MAPPING.
 * - Otherwise ucm_mappingType() is consulted: a negative result is an
 *   illegal byte sequence (printed, and the function will fail), a positive
 *   result flags the mapping with UCM_MOVE_TO_EXT.
 *
 * @return FALSE if any illegal byte sequence was found; otherwise the result
 *         of ucm_checkBaseExt() when mappings had to be moved, or TRUE after
 *         sorting the base table when nothing was flagged
 */
U_CAPI UBool U_EXPORT2
ucm_separateMappings(UCMFile *ucm, UBool isSISO) {
    UCMTable *baseTable=ucm->base;
    UCMapping *cur=baseTable->mappings;
    UCMapping *end=cur+baseTable->mappingsLength;
    UBool anyFlagged=FALSE;
    UBool allLegal=TRUE;
    int32_t kind;

    while(cur<end) {
        if(isSISO && cur->bLen==1 && (cur->b.bytes[0]==0xe || cur->b.bytes[0]==0xf)) {
            /* SI and SO themselves cannot be mapped in an SI/SO table */
            fprintf(stderr, "warning: removing illegal mapping from an SI/SO-stateful table\n");
            ucm_printMapping(baseTable, cur, stderr);
            cur->moveFlag|=UCM_REMOVE_MAPPING;
            anyFlagged=TRUE;
        } else {
            kind=ucm_mappingType(
                    &ucm->states, cur,
                    UCM_GET_CODE_POINTS(baseTable, cur), UCM_GET_BYTES(baseTable, cur));
            if(kind<0) {
                /* illegal byte sequence */
                printMapping(cur, UCM_GET_CODE_POINTS(baseTable, cur), UCM_GET_BYTES(baseTable, cur), stderr);
                allLegal=FALSE;
            } else if(kind>0) {
                cur->moveFlag|=UCM_MOVE_TO_EXT;
                anyFlagged=TRUE;
            }
        }
        ++cur;
    }

    if(!allLegal) {
        return FALSE;
    }
    if(!anyFlagged) {
        /* nothing to move: just make sure the base table is sorted */
        ucm_sortTable(ucm->base);
        return TRUE;
    }

    ucm_moveMappings(ucm->base, ucm->ext);
    return ucm_checkBaseExt(&ucm->states, ucm->base, ucm->ext, ucm->ext, FALSE);
}
/*
 * Compare the byte sequences of two mappings.
 *
 * With lexical!=0 the bytes are compared position by position over the
 * shorter of the two lengths, and the lengths only break a tie. This is the
 * order used for sorting in the builder, to allow an efficient search for a
 * byte sequence that could be a prefix of a previously entered one.
 *
 * With lexical==0 the lengths are compared first and the bytes are compared
 * only for equal lengths, for compatibility with old .ucm tools like
 * canonucm and rptp2ucm.
 *
 * @return negative, zero or positive for l<r, l==r, l>r
 */
static int32_t
compareBytes(UCMTable *lTable, const UCMapping *l,
             UCMTable *rTable, const UCMapping *r,
             UBool lexical) {
    const uint8_t *left, *right;
    int32_t diff, pos, common;

    if(!lexical) {
        /* lengths decide first; bytes matter only for equal lengths */
        diff=l->bLen-r->bLen;
        if(diff!=0) {
            return diff;
        }
        common=l->bLen;
    } else {
        /* compare over the shorter of the two sequences */
        common= l->bLen<=r->bLen ? l->bLen : r->bLen;
    }

    left=UCM_GET_BYTES(lTable, l);
    right=UCM_GET_BYTES(rTable, r);

    for(pos=0; pos<common; ++pos) {
        diff=left[pos]-right[pos];
        if(diff!=0) {
            return diff;
        }
    }

    /* all compared bytes are equal: the shorter sequence sorts first */
    return l->bLen-r->bLen;
}
/*
 * Check every mapping of a table against the given state table.
 *
 * For each mapping, ucm_countChars() is called on its byte sequence; a
 * result below 1 makes the mapping invalid, in which case it is printed to
 * stderr. All mappings are checked even after the first failure.
 *
 * @return TRUE if no mapping was reported, FALSE otherwise
 */
U_CAPI UBool U_EXPORT2
ucm_checkValidity(UCMTable *table, UCMStates *baseStates) {
    UCMapping *cur=table->mappings;
    UCMapping *end=cur+table->mappingsLength;
    UBool allValid=TRUE;
    int32_t chars;

    for(; cur<end; ++cur) {
        chars=ucm_countChars(baseStates, UCM_GET_BYTES(table, cur), cur->bLen);
        if(chars<1) {
            ucm_printMapping(table, cur, stderr);
            allValid=FALSE;
        }
    }

    return allValid;
}
/*
 * Print one mapping of a table to the given stream.
 *
 * Convenience wrapper: looks up the mapping's code points and bytes in the
 * table and hands them to printMapping().
 */
U_CAPI void U_EXPORT2
ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f) {
    uint8_t *mappingBytes=UCM_GET_BYTES(table, m);

    printMapping(m, UCM_GET_CODE_POINTS(table, m), mappingBytes, f);
}
U_CAPI void U_EXPORT2 ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable, const uint8_t *subchar, int32_t subcharLength, uint8_t subchar1) { UCMapping *fromUMapping, *toUMapping; int32_t fromUIndex, toUIndex, fromUTop, toUTop, cmp; ucm_sortTable(fromUTable); ucm_sortTable(toUTable); fromUMapping=fromUTable->mappings; toUMapping=toUTable->mappings; fromUTop=fromUTable->mappingsLength; toUTop=toUTable->mappingsLength; fromUIndex=toUIndex=0; while(fromUIndex<fromUTop && toUIndex<toUTop) { cmp=compareMappings(fromUTable, fromUMapping, toUTable, toUMapping, TRUE); if(cmp==0) { /* equal: roundtrip, nothing to do (flags are initially 0) */ ++fromUMapping; ++toUMapping; ++fromUIndex; ++toUIndex; } else if(cmp<0) { /* * the fromU mapping does not have a toU counterpart: * fallback Unicode->codepage */ if( (fromUMapping->bLen==subcharLength && 0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) || (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1) ) { fromUMapping->f=2; /* SUB mapping */ } else { fromUMapping->f=1; /* normal fallback */ } ++fromUMapping; ++fromUIndex; } else { /* * the toU mapping does not have a fromU counterpart: * (reverse) fallback codepage->Unicode, copy it to the fromU table */ /* ignore reverse fallbacks to Unicode SUB */ if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) { toUMapping->f=3; /* reverse fallback */ ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping)); /* the table may have been reallocated */ fromUMapping=fromUTable->mappings+fromUIndex; } ++toUMapping; ++toUIndex; } } /* either one or both tables are exhausted */ while(fromUIndex<fromUTop) { /* leftover fromU mappings are fallbacks */ if( (fromUMapping->bLen==subcharLength && 0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) || (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1) ) 
{ fromUMapping->f=2; /* SUB mapping */ } else { fromUMapping->f=1; /* normal fallback */ } ++fromUMapping; ++fromUIndex; } while(toUIndex<toUTop) { /* leftover toU mappings are reverse fallbacks */ /* ignore reverse fallbacks to Unicode SUB */ if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) { toUMapping->f=3; /* reverse fallback */ ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping)); } ++toUMapping; ++toUIndex; } fromUTable->isSorted=FALSE; }
/*
 * Compare the byte sequences of base and extension mappings, walking both
 * tables in parallel through their reverseMap arrays.
 *
 * Only mappings with f==0 or f==3 are considered on either side. When
 * intersectBase==2 (base compared against a DBCS extension), single-byte
 * base mappings are skipped as well.
 *
 * For each pair:
 * - base bytes sort before ext bytes: with intersectBase the base mapping is
 *   flagged UCM_MOVE_TO_EXT; otherwise, if the base bytes are a proper prefix
 *   of the ext bytes (never a single byte in an SI/SO table), the base
 *   mapping is flagged UCM_MOVE_TO_EXT when moveToExt is set, or an error is
 *   reported
 * - same bytes: if f and the code points match too, the ext mapping is
 *   flagged UCM_REMOVE_MAPPING; otherwise the base mapping is moved
 *   (intersectBase) or an error is reported
 *
 * @return a combination of NEEDS_MOVE and HAS_ERRORS bits, or 0
 */
static uint8_t
checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
                  UBool moveToExt, UBool intersectBase) {
    UCMapping *baseM, *extM;
    int32_t *baseOrder=base->reverseMap;
    int32_t *extOrder=ext->reverseMap;
    int32_t bi=0, ei=0;
    int32_t baseCount=base->mappingsLength;
    int32_t extCount=ext->mappingsLength;
    int32_t order;
    uint8_t status=0;
    UBool isSISO=(UBool)(baseStates->outputType==MBCS_OUTPUT_2_SISO);

    for(;;) {
        /* advance to the next relevant base mapping */
        for(;;) {
            if(bi==baseCount) {
                return status;
            }
            baseM=base->mappings+baseOrder[bi];

            /*
             * comparing a base against a DBCS extension (intersectBase==2):
             * leave SBCS base mappings alone
             */
            if(!(intersectBase==2 && baseM->bLen==1) && (baseM->f==0 || baseM->f==3)) {
                break;
            }
            ++bi;
        }

        /* advance to the next relevant extension mapping */
        for(;;) {
            if(ei==extCount) {
                return status;
            }
            extM=ext->mappings+extOrder[ei];
            if(extM->f==0 || extM->f==3) {
                break;
            }
            ++ei;
        }

        order=compareBytes(base, baseM, ext, extM, TRUE);
        if(order>0) {
            /* the extension mapping sorts first: try the next one */
            ++ei;
            continue;
        }

        if(order<0) {
            if(intersectBase) {
                /* mapping in base but not in ext, move it */
                baseM->moveFlag|=UCM_MOVE_TO_EXT;
                status|=NEEDS_MOVE;
            } else if(
                /*
                 * does baseM map from an input sequence that is a prefix of extM's?
                 * for SI/SO tables, a single byte is never a prefix because it
                 * occurs in a separate single-byte state
                 */
                baseM->bLen<extM->bLen &&
                (!isSISO || baseM->bLen>1) &&
                0==uprv_memcmp(UCM_GET_BYTES(base, baseM), UCM_GET_BYTES(ext, extM), baseM->bLen)
            ) {
                if(moveToExt) {
                    /* mark this mapping to be moved to the extension table */
                    baseM->moveFlag|=UCM_MOVE_TO_EXT;
                    status|=NEEDS_MOVE;
                } else {
                    fprintf(stderr,
                            "ucm error: the base table contains a mapping whose input sequence\n"
                            " is a prefix of the input sequence of an extension mapping\n");
                    ucm_printMapping(base, baseM, stderr);
                    ucm_printMapping(ext, extM, stderr);
                    status|=HAS_ERRORS;
                }
            }
        } else {
            /*
             * same input bytes.
             * same output: remove the extension mapping,
             * otherwise treat as an error
             */
            if( baseM->f==extM->f && baseM->uLen==extM->uLen &&
                0==uprv_memcmp(UCM_GET_CODE_POINTS(base, baseM), UCM_GET_CODE_POINTS(ext, extM), 4*baseM->uLen)
            ) {
                extM->moveFlag|=UCM_REMOVE_MAPPING;
                status|=NEEDS_MOVE;
            } else if(intersectBase) {
                /* mapping in base but not in ext, move it */
                baseM->moveFlag|=UCM_MOVE_TO_EXT;
                status|=NEEDS_MOVE;
            } else {
                fprintf(stderr,
                        "ucm error: the base table contains a mapping whose input sequence\n"
                        " is the same as the input sequence of an extension mapping\n"
                        " but it maps differently\n");
                ucm_printMapping(base, baseM, stderr);
                ucm_printMapping(ext, extM, stderr);
                status|=HAS_ERRORS;
            }
        }

        /* order<=0: this base mapping has been dealt with */
        ++bi;
    }
}
/*
 * Compare the Unicode input sequences of base and extension mappings,
 * walking both mapping arrays in parallel.
 * (Presumably both tables are already sorted in compareUnicode() order;
 * verify against the caller.)
 *
 * Only mappings with f in 0..2 or f==4 are considered on either side.
 *
 * For each pair:
 * - base sorts before ext: with intersectBase (and, for intersectBase==2,
 *   only for multi-byte base mappings) the base mapping is flagged
 *   UCM_MOVE_TO_EXT; otherwise, if the base code points are a proper prefix
 *   of the ext code points, the base mapping is flagged UCM_MOVE_TO_EXT when
 *   moveToExt is set, or an error is reported
 * - same code points: if f and the bytes match too, the ext mapping is
 *   flagged UCM_REMOVE_MAPPING; otherwise the base mapping is moved
 *   (intersectBase) or an error is reported
 *
 * @param baseStates unused
 * @return a combination of NEEDS_MOVE and HAS_ERRORS bits, or 0
 */
static uint8_t
checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
                    UBool moveToExt, UBool intersectBase) {
    (void)baseStates;

    UCMapping *mb, *me, *mbLimit, *meLimit;
    int32_t cmp;
    uint8_t result;

    mb=base->mappings;
    mbLimit=mb+base->mappingsLength;

    me=ext->mappings;
    meLimit=me+ext->mappingsLength;

    result=0;

    for(;;) {
        /* skip irrelevant mappings on both sides */
        for(;;) {
            if(mb==mbLimit) {
                return result;
            }

            if((0<=mb->f && mb->f<=2) || mb->f==4) {
                break;
            }

            ++mb;
        }

        for(;;) {
            if(me==meLimit) {
                return result;
            }

            if((0<=me->f && me->f<=2) || me->f==4) {
                break;
            }

            ++me;
        }

        /* compare the base and extension mappings */
        cmp=compareUnicode(base, mb, ext, me);
        if(cmp<0) {
            if(intersectBase && (intersectBase!=2 || mb->bLen>1)) {
                /*
                 * mapping in base but not in ext, move it
                 *
                 * if ext is DBCS, move DBCS mappings here
                 * and check SBCS ones for Unicode prefix below
                 */
                mb->moveFlag|=UCM_MOVE_TO_EXT;
                result|=NEEDS_MOVE;

            /* does mb map from an input sequence that is a prefix of me's? */
            } else if( mb->uLen<me->uLen &&
                0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
            ) {
                if(moveToExt) {
                    /* mark this mapping to be moved to the extension table */
                    mb->moveFlag|=UCM_MOVE_TO_EXT;
                    result|=NEEDS_MOVE;
                } else {
                    fprintf(stderr,
                            "ucm error: the base table contains a mapping whose input sequence\n"
                            " is a prefix of the input sequence of an extension mapping\n");
                    ucm_printMapping(base, mb, stderr);
                    ucm_printMapping(ext, me, stderr);
                    result|=HAS_ERRORS;
                }
            }

            ++mb;
        } else if(cmp==0) {
            /*
             * same output: remove the extension mapping,
             * otherwise treat as an error
             */
            if( mb->f==me->f && mb->bLen==me->bLen &&
                0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
            ) {
                me->moveFlag|=UCM_REMOVE_MAPPING;
                result|=NEEDS_MOVE;
            } else if(intersectBase) {
                /* mapping in base but not in ext, move it */
                mb->moveFlag|=UCM_MOVE_TO_EXT;
                result|=NEEDS_MOVE;
            } else {
                /*
                 * The first line of this message must be one complete string
                 * literal; a literal cannot contain a raw line break.
                 */
                fprintf(stderr,
                        "ucm error: the base table contains a mapping whose input sequence\n"
                        " is the same as the input sequence of an extension mapping\n"
                        " but it maps differently\n");
                ucm_printMapping(base, mb, stderr);
                ucm_printMapping(ext, me, stderr);
                result|=HAS_ERRORS;
            }

            ++mb;
        } else /* cmp>0 */ {
            ++me;
        }
    }
}
/*
 * Build the fromUnicode result word for one mapping and update the
 * extension-table statistics.
 *
 * - f==2 yields UCNV_EXT_FROM_U_SUBCHAR1 and touches nothing else.
 * - Results of 1..3 bytes are packed big-endian into the low bits of the
 *   word; any other length is stored in extData->fromUBytes and the word
 *   holds the index of the stored bytes.
 * - The byte length is added at UCNV_EXT_FROM_U_LENGTH_SHIFT, and
 *   UCNV_EXT_FROM_U_ROUNDTRIP_FLAG is set for f==0.
 *
 * Side effects: may append to extData->fromUBytes; raises maxInUChars,
 * maxOutBytes and maxBytesPerUChar in extData if this mapping exceeds them.
 */
static uint32_t
getFromUBytesValue(CnvExtData *extData, UCMTable *table, UCMapping *m) {
    uint8_t *src, *stored;
    uint32_t word;
    int32_t utf16Units, bytesPerUnit;

    if(m->f==2) {
        /*
         * no mapping, <subchar1> preferred
         *
         * no need to count in statistics because the subchars are already
         * counted for maxOutBytes and maxBytesPerUChar in UConverterStaticData,
         * and this non-mapping does not count for maxInUChars which are always
         * trivially at least two if counting unmappable supplementary code points
         */
        return UCNV_EXT_FROM_U_SUBCHAR1;
    }

    src=UCM_GET_BYTES(table, m);
    switch(m->bLen) {
        /* 1..3: store the bytes in the value word, first byte most significant */
    case 1:
        word=src[0];
        break;
    case 2:
        word=((uint32_t)src[0]<<8)|src[1];
        break;
    case 3:
        word=((uint32_t)src[0]<<16)|((uint32_t)src[1]<<8)|src[2];
        break;
    default:
        /* the parser enforces m->bLen<=UCNV_EXT_MAX_BYTES */
        /* store the bytes in fromUBytes[] and the index in the value word */
        word=(uint32_t)utm_countItems(extData->fromUBytes);
        stored=utm_allocN(extData->fromUBytes, m->bLen);
        uprv_memcpy(stored, src, m->bLen);
        break;
    }

    word|=(uint32_t)m->bLen<<UCNV_EXT_FROM_U_LENGTH_SHIFT;
    if(m->f==0) {
        word|=UCNV_EXT_FROM_U_ROUNDTRIP_FLAG;
    }

    /* calculate the real UTF-16 length (see recoding in prepareFromUMappings()) */
    if(m->uLen!=1) {
        utf16Units=U16_LENGTH(UCM_GET_CODE_POINTS(table, m)[0])+(m->uLen-2);
    } else {
        utf16Units=U16_LENGTH(m->u);
    }

    /* update statistics */
    if(extData->maxInUChars<utf16Units) {
        extData->maxInUChars=utf16Units;
    }
    if(extData->maxOutBytes<m->bLen) {
        extData->maxOutBytes=m->bLen;
    }

    /* bytes per UTF-16 unit, rounded up */
    bytesPerUnit=(m->bLen+(utf16Units-1))/utf16Units;
    if(extData->maxBytesPerUChar<bytesPerUnit) {
        extData->maxBytesPerUChar=bytesPerUnit;
    }

    return word;
}
/*
 * Recursive toUTable generator core function.
 *
 * Preconditions:
 * - start<limit (There is at least one mapping.)
 * - The mappings are sorted lexically. (Access is through the reverseMap.)
 * - All mappings between start and limit have input sequences that share
 *   the same prefix of unitIndex length, and therefore all of these sequences
 *   are at least unitIndex+1 long.
 * - There are only relevant mappings available through the reverseMap,
 *   see reduceToUMappings().
 *
 * One function invocation generates one section table.
 *
 * Steps:
 * 1. Count the number of unique unit values and get the low/high unit values
 *    that occur at unitIndex.
 * 2. Allocate the section table with possible optimization for linear access.
 * 3. Write temporary version of the section table with start indexes of
 *    subsections, each corresponding to one unit value at unitIndex.
 * 4. Iterate through the table once more, and depending on the subsection length:
 *    0: write 0 as a result value (unused byte in linear-access section table)
 *   >0: if there is one mapping with an input unit sequence of unitIndex+1
 *       then defaultValue=compute the mapping result for this whole sequence
 *       else defaultValue=0
 *
 *       recurse into the subsection
 *
 * Parameters:
 *   extData      - owner of the toUTable that sections are appended to
 *   table        - mapping table; read through table->mappings and table->reverseMap
 *   start, limit - range [start, limit[ of reverseMap indexes handled by this call
 *   unitIndex    - index of the input byte that this section switches on
 *   defaultValue - value stored in the section header next to the entry count
 *
 * Returns FALSE if two mappings with the same input bytes are found (they are
 * printed to stderr) or if a recursive call fails; TRUE otherwise.
 */
static UBool
generateToUTable(CnvExtData *extData, UCMTable *table,
                 int32_t start, int32_t limit, int32_t unitIndex,
                 uint32_t defaultValue) {
    UCMapping *mappings, *m;
    int32_t *map;
    int32_t i, j, uniqueCount, count, subStart, subLimit;

    uint8_t *bytes;
    int32_t low, high, prev;

    uint32_t *section;

    mappings=table->mappings;
    map=table->reverseMap;

    /* step 1: examine the input units; set low, high, uniqueCount */
    m=mappings+map[start];
    bytes=UCM_GET_BYTES(table, m);
    low=bytes[unitIndex];
    uniqueCount=1;

    /* after the loop, high is the unit of the last mapping in the range */
    prev=high=low;
    for(i=start+1; i<limit; ++i) {
        m=mappings+map[i];
        bytes=UCM_GET_BYTES(table, m);
        high=bytes[unitIndex];

        if(high!=prev) {
            prev=high;
            ++uniqueCount;
        }
    }

    /* step 2: allocate the section; set count, section */
    count=(high-low)+1;
    if(unitIndex==0 || uniqueCount>=(3*count)/4) {
        /*
         * for the root table and for fairly full tables:
         * allocate for direct, linear array access
         * by keeping count, to write an entry for each unit value
         * from low to high
         */
    } else {
        /* sparse section: one entry per unit value that actually occurs */
        count=uniqueCount;
    }

    /* allocate the section: 1 entry for the header + count for the items */
    section=(uint32_t *)utm_allocN(extData->toUTable, 1+count);

    /* write the section header; section then points at the first item */
    *section++=((uint32_t)count<<UCNV_EXT_TO_U_BYTE_SHIFT)|defaultValue;

    /*
     * step 3: write temporary section table with subsection starts
     * (each entry temporarily holds its unit value plus the reverseMap index
     * where its subsection starts; step 4 replaces the index)
     */
    prev=low-1; /* just before low to prevent empty subsections before low */
    j=0; /* section table index */
    for(i=start; i<limit; ++i) {
        m=mappings+map[i];
        bytes=UCM_GET_BYTES(table, m);
        high=bytes[unitIndex];

        if(high!=prev) {
            /* start of a new subsection for unit high */
            if(count>uniqueCount) {
                /* write empty subsections for unused units in a linear table */
                while(++prev<high) {
                    section[j++]=((uint32_t)prev<<UCNV_EXT_TO_U_BYTE_SHIFT)|(uint32_t)i;
                }
            } else {
                prev=high;
            }

            /* write the entry with the subsection start */
            section[j++]=((uint32_t)high<<UCNV_EXT_TO_U_BYTE_SHIFT)|(uint32_t)i;
        }
    }
    /* assert(j==count) */

    /* step 4: recurse and write results */
    subLimit=UCNV_EXT_TO_U_GET_VALUE(section[0]);
    for(j=0; j<count; ++j) {
        /*
         * the next entry's temporary start index must be read before
         * this entry's own temporary value is cleared and overwritten
         */
        subStart=subLimit;
        subLimit= (j+1)<count ? UCNV_EXT_TO_U_GET_VALUE(section[j+1]) : limit;

        /* remove the subStart temporary value */
        section[j]&=~UCNV_EXT_TO_U_VALUE_MASK;

        if(subStart==subLimit) {
            /* leave the value zero: empty subsection for unused unit in a linear table */
            continue;
        }

        /* see if there is exactly one input unit sequence of length unitIndex+1 */
        defaultValue=0;
        m=mappings+map[subStart];
        if(m->bLen==unitIndex+1) {
            /* do not include this in generateToUTable() */
            ++subStart;

            if(subStart<subLimit && mappings[map[subStart]].bLen==unitIndex+1) {
                /* print error for multiple same-input-sequence mappings */
                fprintf(stderr, "error: multiple mappings from same bytes\n");
                ucm_printMapping(table, m, stderr);
                ucm_printMapping(table, mappings+map[subStart], stderr);
                return FALSE;
            }

            defaultValue=getToUnicodeValue(extData, table, m);
        }

        if(subStart==subLimit) {
            /* write the result for the input sequence ending here */
            section[j]|=defaultValue;
        } else {
            /*
             * write the index to the subsection table:
             * the current item count of toUTable, taken before the
             * recursive call allocates from it
             */
            section[j]|=(uint32_t)utm_countItems(extData->toUTable);

            /*
             * recurse
             *
             * NOTE(review): the recursive call allocates from the same
             * extData->toUTable that section points into. If utm_allocN()
             * can relocate its storage when growing, section would be stale
             * for the remaining iterations - confirm against utm_allocN().
             */
            if(!generateToUTable(extData, table, subStart, subLimit, unitIndex+1, defaultValue)) {
                return FALSE;
            }
        }
    }
    return TRUE;
}