示例#1
0
U_CAPI int32_t U_EXPORT2
ucm_mappingType(UCMStates *baseStates,
                UCMapping *m,
                UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
                uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
    (void)codePoints;
    /* check validity of the bytes and count the characters in them */
    int32_t count=ucm_countChars(baseStates, bytes, m->bLen);
    if(count<1) {
        /* illegal byte sequence */
        return -1;
    }

    /*
     * Suitable for an ICU conversion base table means:
     * - a 1:1 mapping (1 Unicode code point : 1 byte sequence)
     * - precision flag 0..3
     * - SBCS: any 1:1 mapping
     *         (the table stores additional bits to distinguish mapping types)
     * - MBCS: not a |2 SUB mapping for <subchar1>
     * - MBCS: not a |1 fallback to 0x00
     * - MBCS: not a multi-byte mapping with leading 0x00 bytes
     *
     * Further restrictions for fromUnicode tables
     * are enforced in makeconv (MBCSOkForBaseFromUnicode()).
     *
     * All of the MBCS fromUnicode specific tests could be removed from here,
     * but the ones above are for unusual mappings, and removing the tests
     * from here would change canonucm output which seems gratuitous.
     * (Markus Scherer 2006-nov-28)
     *
     * Exception: All implicit mappings (f<0) that need to be moved
     * because of fromUnicode restrictions _must_ be moved here because
     * makeconv uses a hack for moving mappings only for the fromUnicode table
     * that only works with non-negative values of f.
     */
    if( m->uLen==1 && count==1 && m->f<=3 &&
        (baseStates->maxCharLength==1 ||
            !((m->f==2 && m->bLen==1) ||
              (m->f==1 && bytes[0]==0) ||
              (m->f<=1 && m->bLen>1 && bytes[0]==0)))
    ) {
        return 0; /* suitable for a base table */
    } else {
        return 1; /* needs to go into an extension table */
    }
}
示例#2
0
U_CAPI UBool U_EXPORT2
ucm_checkValidity(UCMTable *table, UCMStates *baseStates) {
    UCMapping *m, *mLimit;
    int32_t count;
    UBool isOK;

    m=table->mappings;
    mLimit=m+table->mappingsLength;
    isOK=TRUE;

    while(m<mLimit) {
        count=ucm_countChars(baseStates, UCM_GET_BYTES(table, m), m->bLen);
        if(count<1) {
            ucm_printMapping(table, m, stderr);
            isOK=FALSE;
        }
        ++m;
    }

    return isOK;
}
示例#3
0
static void
createConverter(ConvData *data, const char *converterName, UErrorCode *pErrorCode) {
    ConvData baseData;
    UBool dataIsBase;

    UConverterStaticData *staticData;
    UCMStates *states, *baseStates;

    if(U_FAILURE(*pErrorCode)) {
        return;
    }

    initConvData(data);

    dataIsBase=readFile(data, converterName, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        return;
    }

    staticData=&data->staticData;
    states=&data->ucm->states;

    if(dataIsBase) {
        /*
         * Build a normal .cnv file with a base table
         * and an optional extension table.
         */
        data->cnvData=MBCSOpen(data->ucm);
        if(data->cnvData==NULL) {
            *pErrorCode=U_MEMORY_ALLOCATION_ERROR;

        } else if(!data->cnvData->isValid(data->cnvData,
                            staticData->subChar, staticData->subCharLen)
        ) {
            fprintf(stderr, "       the substitution character byte sequence is illegal in this codepage structure!\n");
            *pErrorCode=U_INVALID_TABLE_FORMAT;

        } else if(staticData->subChar1!=0 &&
                    !data->cnvData->isValid(data->cnvData, &staticData->subChar1, 1)
        ) {
            fprintf(stderr, "       the subchar1 byte is illegal in this codepage structure!\n");
            *pErrorCode=U_INVALID_TABLE_FORMAT;

        } else if(
            data->ucm->ext->mappingsLength>0 &&
            !ucm_checkBaseExt(states, data->ucm->base, data->ucm->ext, data->ucm->ext, FALSE)
        ) {
            *pErrorCode=U_INVALID_TABLE_FORMAT;
        } else if(data->ucm->base->flagsType&UCM_FLAGS_EXPLICIT) {
            /* sort the table so that it can be turned into UTF-8-friendly data */
            ucm_sortTable(data->ucm->base);
        }

        if(U_SUCCESS(*pErrorCode)) {
            if(
                /* add the base table after ucm_checkBaseExt()! */
                !data->cnvData->addTable(data->cnvData, data->ucm->base, &data->staticData)
            ) {
                *pErrorCode=U_INVALID_TABLE_FORMAT;
            } else {
                /*
                 * addTable() may have requested moving more mappings to the extension table
                 * if they fit into the base toUnicode table but not into the
                 * base fromUnicode table.
                 * (Especially for UTF-8-friendly fromUnicode tables.)
                 * Such mappings will have the MBCS_FROM_U_EXT_FLAG set, which causes them
                 * to be excluded from the extension toUnicode data.
                 * See MBCSOkForBaseFromUnicode() for which mappings do not fit into
                 * the base fromUnicode table.
                 */
                ucm_moveMappings(data->ucm->base, data->ucm->ext);
                ucm_sortTable(data->ucm->ext);
                if(data->ucm->ext->mappingsLength>0) {
                    /* prepare the extension table, if there is one */
                    data->extData=CnvExtOpen(data->ucm);
                    if(data->extData==NULL) {
                        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
                    } else if(
                        !data->extData->addTable(data->extData, data->ucm->ext, &data->staticData)
                    ) {
                        *pErrorCode=U_INVALID_TABLE_FORMAT;
                    }
                }
            }
        }
    } else {
        /* Build an extension-only .cnv file. */
        char baseFilename[500];
        char *basename;

        initConvData(&baseData);

        /* assemble a path/filename for data->ucm->baseName */
        uprv_strcpy(baseFilename, converterName);
        basename=(char *)findBasename(baseFilename);
        uprv_strcpy(basename, data->ucm->baseName);
        uprv_strcat(basename, ".ucm");

        /* read the base table */
        dataIsBase=readFile(&baseData, baseFilename, pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            return;
        } else if(!dataIsBase) {
            fprintf(stderr, "error: the <icu:base> file \"%s\" is not a base table file\n", baseFilename);
            *pErrorCode=U_INVALID_TABLE_FORMAT;
        } else {
            /* prepare the extension table */
            data->extData=CnvExtOpen(data->ucm);
            if(data->extData==NULL) {
                *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
            } else {
                /* fill in gaps in extension file header fields */
                UCMapping *m, *mLimit;
                uint8_t fallbackFlags;

                baseStates=&baseData.ucm->states;
                if(states->conversionType==UCNV_DBCS) {
                    staticData->minBytesPerChar=(int8_t)(states->minCharLength=2);
                } else if(states->minCharLength==0) {
                    staticData->minBytesPerChar=(int8_t)(states->minCharLength=baseStates->minCharLength);
                }
                if(states->maxCharLength<states->minCharLength) {
                    staticData->maxBytesPerChar=(int8_t)(states->maxCharLength=baseStates->maxCharLength);
                }

                if(staticData->subCharLen==0) {
                    uprv_memcpy(staticData->subChar, baseData.staticData.subChar, 4);
                    staticData->subCharLen=baseData.staticData.subCharLen;
                }
                /*
                 * do not copy subChar1 -
                 * only use what is explicitly specified
                 * because it cannot be unset in the extension file header
                 */

                /* get the fallback flags */
                fallbackFlags=0;
                for(m=baseData.ucm->base->mappings, mLimit=m+baseData.ucm->base->mappingsLength;
                    m<mLimit && fallbackFlags!=3;
                    ++m
                ) {
                    if(m->f==1) {
                        fallbackFlags|=1;
                    } else if(m->f==3) {
                        fallbackFlags|=2;
                    }
                }

                if(fallbackFlags&1) {
                    staticData->hasFromUnicodeFallback=TRUE;
                }
                if(fallbackFlags&2) {
                    staticData->hasToUnicodeFallback=TRUE;
                }

                if(1!=ucm_countChars(baseStates, staticData->subChar, staticData->subCharLen)) {
                    fprintf(stderr, "       the substitution character byte sequence is illegal in this codepage structure!\n");
                    *pErrorCode=U_INVALID_TABLE_FORMAT;

                } else if(staticData->subChar1!=0 && 1!=ucm_countChars(baseStates, &staticData->subChar1, 1)) {
                    fprintf(stderr, "       the subchar1 byte is illegal in this codepage structure!\n");
                    *pErrorCode=U_INVALID_TABLE_FORMAT;

                } else if(
                    !ucm_checkValidity(data->ucm->ext, baseStates) ||
                    !ucm_checkBaseExt(baseStates, baseData.ucm->base, data->ucm->ext, data->ucm->ext, FALSE)
                ) {
                    *pErrorCode=U_INVALID_TABLE_FORMAT;
                } else {
                    if(states->maxCharLength>1) {
                        /*
                         * When building a normal .cnv file with a base table
                         * for an MBCS (not SBCS) table with explicit precision flags,
                         * the MBCSAddTable() function marks some mappings for moving
                         * to the extension table.
                         * They fit into the base toUnicode table but not into the
                         * base fromUnicode table.
                         * (Note: We do have explicit precision flags because they are
                         * required for extension table generation, and
                         * ucm_checkBaseExt() verified it.)
                         *
                         * We do not call MBCSAddTable() here (we probably could)
                         * so we need to do the analysis before building the extension table.
                         * We assume that MBCSAddTable() will build a UTF-8-friendly table.
                         * Redundant mappings in the extension table are ok except they cost some size.
                         *
                         * Do this after ucm_checkBaseExt().
                         */
                        const MBCSData *mbcsData=MBCSGetDummy();
                        int32_t needsMove=0;
                        for(m=baseData.ucm->base->mappings, mLimit=m+baseData.ucm->base->mappingsLength;
                            m<mLimit;
                            ++m
                        ) {
                            if(!MBCSOkForBaseFromUnicode(mbcsData, m->b.bytes, m->bLen, m->u, m->f)) {
                                m->f|=MBCS_FROM_U_EXT_FLAG;
                                m->moveFlag=UCM_MOVE_TO_EXT;
                                ++needsMove;
                            }
                        }

                        if(needsMove!=0) {
                            ucm_moveMappings(baseData.ucm->base, data->ucm->ext);
                            ucm_sortTable(data->ucm->ext);
                        }
                    }
                    if(!data->extData->addTable(data->extData, data->ucm->ext, &data->staticData)) {
                        *pErrorCode=U_INVALID_TABLE_FORMAT;
                    }
                }
            }
        }

        cleanupConvData(&baseData);
    }
}