Ejemplo n.º 1
0
void TextCodecICU::createICUConverter() const
{
    ASSERT(!m_converterICU);

#if defined(USING_SYSTEM_ICU)
    const char* name = m_encoding.name();
    m_needsGBKFallbacks = name[0] == 'G' && name[1] == 'B' && name[2] == 'K' && !name[3];
#endif

    UErrorCode err;

    UConverter*& cachedConverter = cachedConverterICU();
    if (cachedConverter) {
        err = U_ZERO_ERROR;
        const char* cachedName = ucnv_getName(cachedConverter, &err);
        if (U_SUCCESS(err) && m_encoding == cachedName) {
            m_converterICU = cachedConverter;
            cachedConverter = 0;
            return;
        }
    }

    err = U_ZERO_ERROR;
    m_converterICU = ucnv_open(m_encoding.name(), &err);
#if !LOG_DISABLED
    if (err == U_AMBIGUOUS_ALIAS_WARNING)
        WTF_LOG_ERROR("ICU ambiguous alias warning for encoding: %s", m_encoding.name());
#endif
    if (m_converterICU)
        ucnv_setFallback(m_converterICU, TRUE);
}
void TextCodecICU::createICUConverter() const
{
    ASSERT(!m_converterICU);

    UErrorCode err;

    m_needsGBKFallbacks = !strcmp(m_encodingName, "GBK");

    UConverter*& cachedConverter = cachedConverterICU();
    if (cachedConverter) {
        err = U_ZERO_ERROR;
        const char* cachedConverterName = ucnv_getName(cachedConverter, &err);
        if (U_SUCCESS(err) && !strcmp(m_canonicalConverterName, cachedConverterName)) {
            m_converterICU = cachedConverter;
            cachedConverter = 0;
            return;
        }
    }

    err = U_ZERO_ERROR;
    m_converterICU = ucnv_open(m_canonicalConverterName, &err);
    ASSERT(U_SUCCESS(err));
    if (m_converterICU)
        ucnv_setFallback(m_converterICU, TRUE);
}
Ejemplo n.º 3
0
// Convert a file from one encoding to another
static UBool convertFile(const char *pname,
                         const char *fromcpage,
                         UConverterToUCallback toucallback,
                         const void *touctxt,
                         const char *tocpage,
                         UConverterFromUCallback fromucallback,
                         const void *fromuctxt,
                         int fallback,
                         size_t bufsz,
                         const char *translit,
                         const char *infilestr,
                         FILE * outfile, int verbose)
{
    FILE *infile;
    UBool ret = TRUE;
    UConverter *convfrom = 0;
    UConverter *convto = 0;
    UErrorCode err = U_ZERO_ERROR;
    UBool flush;
    const char *cbufp;
    char *bufp;
    char *buf = 0;

    uint32_t infoffset = 0, outfoffset = 0;   /* Where we are in the file, for error reporting. */

    const UChar *unibufbp;
    UChar *unibufp;
    UChar *unibuf = 0;
    int32_t *fromoffsets = 0, *tooffsets = 0;

    size_t rd, wr, tobufsz;

#if !UCONFIG_NO_TRANSLITERATION
    Transliterator *t = 0;      // Transliterator acting on Unicode data.
#endif
    UnicodeString u;            // String to do the transliteration.

    // Open the correct input file or connect to stdin for reading input

    if (infilestr != 0 && strcmp(infilestr, "-")) {
        infile = fopen(infilestr, "rb");
        if (infile == 0) {
            UnicodeString str1(infilestr, "");
            str1.append((UChar32) 0);
            UnicodeString str2(strerror(errno), "");
            str2.append((UChar32) 0);
            initMsg(pname);
            u_wmsg(stderr, "cantOpenInputF", str1.getBuffer(), str2.getBuffer());
            return FALSE;
        }
    } else {
        infilestr = "-";
        infile = stdin;
#ifdef WIN32
        if (setmode(fileno(stdin), O_BINARY) == -1) {
            initMsg(pname);
            u_wmsg(stderr, "cantSetInBinMode");
            return FALSE;
        }
#endif
    }

    if (verbose) {
        fprintf(stderr, "%s:\n", infilestr);
    }

#if !UCONFIG_NO_TRANSLITERATION
    // Create transliterator as needed.

    if (translit != NULL && *translit) {
        UParseError parse;
        UnicodeString str(translit), pestr;

        /* Create from rules or by ID as needed. */

        parse.line = -1;

        if (uprv_strchr(translit, ':') || uprv_strchr(translit, '>') || uprv_strchr(translit, '<') || uprv_strchr(translit, '>')) {
            t = Transliterator::createFromRules("Uconv", str, UTRANS_FORWARD, parse, err);
        } else {
            t = Transliterator::createInstance(translit, UTRANS_FORWARD, err);
        }

        if (U_FAILURE(err)) {
            str.append((UChar32) 0);
            initMsg(pname);

            if (parse.line >= 0) {
                UChar linebuf[20], offsetbuf[20];
                uprv_itou(linebuf, 20, parse.line, 10, 0);
                uprv_itou(offsetbuf, 20, parse.offset, 10, 0);
                u_wmsg(stderr, "cantCreateTranslitParseErr", str.getBuffer(),
                    u_wmsg_errorName(err), linebuf, offsetbuf);
            } else {
                u_wmsg(stderr, "cantCreateTranslit", str.getBuffer(),
                    u_wmsg_errorName(err));
            }

            if (t) {
                delete t;
                t = 0;
            }
            goto error_exit;
        }
    }
#endif

    // Create codepage converter. If the codepage or its aliases weren't
    // available, it returns NULL and a failure code. We also set the
    // callbacks, and return errors in the same way.

    convfrom = ucnv_open(fromcpage, &err);
    if (U_FAILURE(err)) {
        UnicodeString str(fromcpage, (int32_t)(uprv_strlen(fromcpage) + 1));
        initMsg(pname);
        u_wmsg(stderr, "cantOpenFromCodeset", str.getBuffer(),
            u_wmsg_errorName(err));
        goto error_exit;
    }
    ucnv_setToUCallBack(convfrom, toucallback, touctxt, 0, 0, &err);
    if (U_FAILURE(err)) {
        initMsg(pname);
        u_wmsg(stderr, "cantSetCallback", u_wmsg_errorName(err));
        goto error_exit;
    }

    convto = ucnv_open(tocpage, &err);
    if (U_FAILURE(err)) {
        UnicodeString str(tocpage, (int32_t)(uprv_strlen(tocpage) + 1));
        initMsg(pname);
        u_wmsg(stderr, "cantOpenToCodeset", str.getBuffer(),
            u_wmsg_errorName(err));
        goto error_exit;
    }
    ucnv_setFromUCallBack(convto, fromucallback, fromuctxt, 0, 0, &err);
    if (U_FAILURE(err)) {
        initMsg(pname);
        u_wmsg(stderr, "cantSetCallback", u_wmsg_errorName(err));
        goto error_exit;
    }
    ucnv_setFallback(convto, fallback);

    // To ensure that the buffer always is of enough size, we
    // must take the worst case scenario, that is the character in
    // the codepage that uses the most bytes and multiply it against
    // the buffer size.

    // use bufsz+1 to allow for additional BOM/signature character (U+FEFF)
    tobufsz = (bufsz+1) * ucnv_getMaxCharSize(convto);

    buf = new char[tobufsz];
    unibuf = new UChar[bufsz];

    fromoffsets = new int32_t[bufsz];
    tooffsets = new int32_t[tobufsz];

    // OK, we can convert now.

    do {
        char willexit = 0;

        rd = fread(buf, 1, bufsz, infile);
        if (ferror(infile) != 0) {
            UnicodeString str(strerror(errno));
            str.append((UChar32) 0);
            initMsg(pname);
            u_wmsg(stderr, "cantRead", str.getBuffer());
            goto error_exit;
        }

        // Convert the read buffer into the new coding
        // After the call 'unibufp' will be placed on the last
        // character that was converted in the 'unibuf'.
        // Also the 'cbufp' is positioned on the last converted
        // character.
        // At the last conversion in the file, flush should be set to
        // true so that we get all characters converted
        //
        // The converter must be flushed at the end of conversion so
        // that characters on hold also will be written.

        unibufp = unibuf;
        cbufp = buf;
        flush = rd != bufsz;
        ucnv_toUnicode(convfrom, &unibufp, unibufp + bufsz, &cbufp,
            cbufp + rd, fromoffsets, flush, &err);

        infoffset += (uint32_t)(cbufp - buf);

        if (U_FAILURE(err)) {
            char pos[32];
            sprintf(pos, "%u", infoffset - 1);
            UnicodeString str(pos, (int32_t)(uprv_strlen(pos) + 1));
            initMsg(pname);
            u_wmsg(stderr, "problemCvtToU", str.getBuffer(), u_wmsg_errorName(err));
            willexit = 1;
            err = U_ZERO_ERROR; /* reset the error for the rest of the conversion. */
        }

        // At the last conversion, the converted characters should be
        // equal to number of chars read.

        if (flush && !willexit && cbufp != (buf + rd)) {
            char pos[32];
            sprintf(pos, "%u", infoffset);
            UnicodeString str(pos, (int32_t)(uprv_strlen(pos) + 1));
            initMsg(pname);
            u_wmsg(stderr, "premEndInput", str.getBuffer());
            willexit = 1;
        }

        // Prepare to transliterate and convert. Transliterate if needed.

#if !UCONFIG_NO_TRANSLITERATION
        if (t) {
            u.setTo(unibuf, (int32_t)(unibufp - unibuf)); // Copy into string.
            t->transliterate(u);
        } else
#endif
        {
            u.setTo(unibuf, (int32_t)(unibufp - unibuf), (int32_t)(bufsz)); // Share the buffer.
        }

        int32_t ulen = u.length();

        // Convert the Unicode buffer into the destination codepage
        // Again 'bufp' will be placed on the last converted character
        // And 'unibufbp' will be placed on the last converted unicode character
        // At the last conversion flush should be set to true to ensure that
        // all characters left get converted

        const UChar *unibufu = unibufbp = u.getBuffer();

        do {
            int32_t len = ulen > (int32_t)bufsz ? (int32_t)bufsz : ulen;

            bufp = buf;
            unibufp = (UChar *) (unibufbp + len);

            ucnv_fromUnicode(convto, &bufp, bufp + tobufsz,
                             &unibufbp,
                             unibufp,
                             tooffsets, flush, &err);

            if (U_FAILURE(err)) {
                const char *errtag;
                char pos[32];

                uint32_t erroffset =
                    dataOffset((int32_t)(bufp - buf - 1), fromoffsets, (int32_t)(bufsz), tooffsets, (int32_t)(tobufsz));
                int32_t ferroffset = (int32_t)(infoffset - (unibufp - unibufu) + erroffset);

                if ((int32_t) ferroffset < 0) {
                    ferroffset = (int32_t)(outfoffset + (bufp - buf));
                    errtag = "problemCvtFromUOut";
                } else {
                    errtag = "problemCvtFromU";
                }
                sprintf(pos, "%u", ferroffset);
                UnicodeString str(pos, (int32_t)(uprv_strlen(pos) + 1));
                initMsg(pname);
                u_wmsg(stderr, errtag, str.getBuffer(),
                       u_wmsg_errorName(err));
                willexit = 1;
            }

            // At the last conversion, the converted characters should be equal to number
            // of consumed characters.
            if (flush && !willexit && unibufbp != (unibufu + (size_t) (unibufp - unibufu))) {
                char pos[32];
                sprintf(pos, "%u", infoffset);
                UnicodeString str(pos, (int32_t)(uprv_strlen(pos) + 1));
                initMsg(pname);
                u_wmsg(stderr, "premEnd", str.getBuffer());
                willexit = 1;
            }

            // Finally, write the converted buffer to the output file


            rd = (size_t) (bufp - buf);
            outfoffset += (int32_t)(wr = fwrite(buf, 1, rd, outfile));
            if (wr != rd) {
                UnicodeString str(strerror(errno), "");
                initMsg(pname);
                u_wmsg(stderr, "cantWrite", str.getBuffer());
                willexit = 1;
            }

            if (willexit) {
                goto error_exit;
            }
        } while ((ulen -= (int32_t)(bufsz)) > 0);
    } while (!flush);           // Stop when we have flushed the
                                // converters (this means that it's
                                // the end of output)

    goto normal_exit;

error_exit:
    ret = FALSE;

normal_exit:
    // Cleanup.

    if (convfrom) ucnv_close(convfrom);
    if (convto) ucnv_close(convto);

#if !UCONFIG_NO_TRANSLITERATION
    if (t) delete t;
#endif

    if (buf) delete[] buf;
    if (unibuf) delete[] unibuf;

    if (fromoffsets) delete[] fromoffsets;
    if (tooffsets) delete[] tooffsets;

    if (infile != stdin) {
        fclose(infile);
    }

    return ret;
}
Ejemplo n.º 4
0
static UBool testConvertToUnicode( const uint8_t *source, int sourcelen, const UChar *expect, int expectlen, 
               const char *codepage, UBool fallback, const int32_t *expectOffsets)
{
    UErrorCode status = U_ZERO_ERROR;
    UConverter *conv = 0;
    UChar   junkout[NEW_MAX_BUFFER]; /* FIX */
    int32_t junokout[NEW_MAX_BUFFER]; /* FIX */
    const char *src;
    const char *realSourceEnd;
    const char *srcLimit;
    UChar *targ;
    UChar *end;
    int32_t *offs;
    int i;
    UBool   checkOffsets = TRUE;
    char junk[9999];
    char offset_str[9999];
    UChar *p;
    UBool action;

    int32_t   realBufferSize;
    UChar *realBufferEnd;


    for(i=0;i<NEW_MAX_BUFFER;i++)
        junkout[i] = 0xFFFE;

    for(i=0;i<NEW_MAX_BUFFER;i++)
        junokout[i] = -1;

    setNuConvTestName(codepage, "TO");

    log_verbose("\n=========  %s\n", gNuConvTestName);

    conv = my_ucnv_open(codepage, &status);
    if(U_FAILURE(status))
    {
        log_data_err("Couldn't open converter %s\n",gNuConvTestName);
        return TRUE; /* because it has been logged */
    }

    log_verbose("Converter opened..\n");

    src = (const char *)source;
    targ = junkout;
    offs = junokout;

    realBufferSize = (sizeof(junkout)/sizeof(junkout[0]));
    realBufferEnd = junkout + realBufferSize;
    realSourceEnd = src + sourcelen;
    /*----setting the fallback routine----*/
    ucnv_setFallback (conv, fallback);
    action = ucnv_usesFallback(conv);
    if(action != fallback){
        log_err("FAIL: Error is setting fallback. Errocode=%s\n", myErrorName(status));
    }
    /*-------------------------------------*/
    if ( gOutBufferSize != realBufferSize )
      checkOffsets = FALSE;

    if( gInBufferSize != NEW_MAX_BUFFER )
      checkOffsets = FALSE;

    do
    {
        end = nct_min( targ + gOutBufferSize, realBufferEnd);
        srcLimit = nct_min(realSourceEnd, src + gInBufferSize);

        if(targ == realBufferEnd)
        {
            log_err("Error, the end would overflow the real output buffer while about to call toUnicode! tarjey=%08lx %s",targ,gNuConvTestName);
            return FALSE;
        }
        log_verbose("calling toUnicode @ %08lx to %08lx\n", targ,end);



        status = U_ZERO_ERROR;

        ucnv_toUnicode (conv,
                &targ,
                end,
                (const char **)&src,
                (const char *)srcLimit,
                checkOffsets ? offs : NULL,
                (UBool)(srcLimit == realSourceEnd), /* flush if we're at the end of hte source data */
                &status);
    } while ( (status == U_BUFFER_OVERFLOW_ERROR) || (srcLimit < realSourceEnd) ); /* while we just need another buffer */


    if(U_FAILURE(status))
    {
        log_err("Problem doing toUnicode, errcode %s %s\n", myErrorName(status), gNuConvTestName);
        return FALSE;
    }

    log_verbose("\nConversion done. %d bytes -> %d chars.\nResult :",
        sourcelen, targ-junkout);
    if(VERBOSITY)
    {

        junk[0] = 0;
        offset_str[0] = 0;

        for(p = junkout;p<targ;p++)
        {
            sprintf(junk + strlen(junk), "0x%04x, ", (0xFFFF) & (unsigned int)*p);
            sprintf(offset_str + strlen(offset_str), "0x%04x, ", (0xFFFF) & (unsigned int)junokout[p-junkout]);
        }

        log_verbose(junk);
        printUSeq(expect, expectlen);
        if ( checkOffsets )
        {
            log_verbose("\nOffsets:");
            log_verbose(offset_str);
        }
        log_verbose("\n");
    }
    ucnv_close(conv);

    log_verbose("comparing %d uchars (%d bytes)..\n",expectlen,expectlen*2);

    if (checkOffsets && (expectOffsets != 0))
    {
        if(memcmp(junokout,expectOffsets,(targ-junkout) * sizeof(int32_t)))
        {
            log_err("\n\ndid not get the expected offsets while %s \n", gNuConvTestName);
            log_err("\nGot  : ");
            for(p=junkout;p<targ;p++)
                log_err("%d, ", junokout[p-junkout]); 
            log_err("\nExpected: ");
            for(i=0; i<(targ-junkout); i++)
                log_err("%d,", expectOffsets[i]);
            log_err("");
            for(i=0; i<(targ-junkout); i++)
                log_err("0x%04X,", junkout[i]);
            log_err("");
            for(i=0; i<(src-(const char *)source); i++)
                log_err("0x%04X,", (unsigned char)source[i]);
        }
    }

    if(!memcmp(junkout, expect, expectlen*2))
    {
        log_verbose("Matches!\n");
        return TRUE;
    }
    else
    {
        log_err("String does not match. %s\n", gNuConvTestName);
        log_verbose("String does not match. %s\n", gNuConvTestName);
        printUSeqErr(junkout, expectlen);
        printf("\n");
        printUSeqErr(expect, expectlen);
        return FALSE;
    }
}
Ejemplo n.º 5
0
static UBool testConvertFromUnicode(const UChar *source, int sourceLen,  const uint8_t *expect, int expectLen, 
                const char *codepage, UBool fallback, const int32_t *expectOffsets)
{


    UErrorCode status = U_ZERO_ERROR;
    UConverter *conv = 0;
    char junkout[NEW_MAX_BUFFER]; /* FIX */
    int32_t junokout[NEW_MAX_BUFFER]; /* FIX */
    const UChar *src;
    char *end;
    char *targ;
    int32_t *offs;
    int i;
    int32_t   realBufferSize;
    char *realBufferEnd;
    const UChar *realSourceEnd;
    const UChar *sourceLimit;
    UBool checkOffsets = TRUE;
    UBool doFlush;
    UBool action=FALSE;
    char *p;


    for(i=0;i<NEW_MAX_BUFFER;i++)
        junkout[i] = (char)0xF0;
    for(i=0;i<NEW_MAX_BUFFER;i++)
        junokout[i] = 0xFF;
    setNuConvTestName(codepage, "FROM");

    log_verbose("\nTesting========= %s  FROM \n  inputbuffer= %d   outputbuffer= %d\n", codepage, gInBufferSize, 
            gOutBufferSize);

    conv = my_ucnv_open(codepage, &status);
    if(U_FAILURE(status))
    {
        log_data_err("Couldn't open converter %s\n",codepage);
        return TRUE;
    }

    log_verbose("Converter opened..\n");
    /*----setting the callback routine----*/
    ucnv_setFallback (conv, fallback);
    action = ucnv_usesFallback(conv);
    if(action != fallback){
        log_err("FAIL: Error is setting fallback. Errocode=%s\n", myErrorName(status));
    }
    /*------------------------*/
    src = source;
    targ = junkout;
    offs = junokout;

    realBufferSize = (sizeof(junkout)/sizeof(junkout[0]));
    realBufferEnd = junkout + realBufferSize;
    realSourceEnd = source + sourceLen;

    if ( gOutBufferSize != realBufferSize )
        checkOffsets = FALSE;

    if( gInBufferSize != NEW_MAX_BUFFER )
        checkOffsets = FALSE;

    do
    {
        end = nct_min(targ + gOutBufferSize, realBufferEnd);
        sourceLimit = nct_min(src + gInBufferSize, realSourceEnd);

        doFlush = (UBool)(sourceLimit == realSourceEnd);

        if(targ == realBufferEnd)
        {
            log_err("Error, overflowed the real buffer while about to call fromUnicode! targ=%08lx %s", targ, gNuConvTestName);
            return FALSE;
        }
        log_verbose("calling fromUnicode @ SOURCE:%08lx to %08lx  TARGET: %08lx to %08lx, flush=%s\n", src,sourceLimit, targ,end, doFlush?"TRUE":"FALSE");


        status = U_ZERO_ERROR;

        ucnv_fromUnicode (conv,
                  (char **)&targ,
                  (const char *)end,
                  &src,
                  sourceLimit,
                  checkOffsets ? offs : NULL,
                  doFlush, /* flush if we're at the end of the input data */
                  &status);

    } while ( (status == U_BUFFER_OVERFLOW_ERROR) || (sourceLimit < realSourceEnd) );

    if(U_FAILURE(status))
    {
        log_err("Problem doing toUnicode, errcode %d %s\n", myErrorName(status), gNuConvTestName);
        return FALSE;
    }

    log_verbose("\nConversion done [%d uchars in -> %d chars out]. \nResult :",
        sourceLen, targ-junkout);
    if(VERBOSITY)
    {
        char junk[9999];
        char offset_str[9999];

        junk[0] = 0;
        offset_str[0] = 0;
        for(p = junkout;p<targ;p++)
        {
            sprintf(junk + uprv_strlen(junk), "0x%02x, ", (0xFF) & (unsigned int)*p);
            sprintf(offset_str + strlen(offset_str), "0x%02x, ", (0xFF) & (unsigned int)junokout[p-junkout]);
        }

        log_verbose(junk);
        printSeq((const unsigned char*)expect, expectLen);
        if ( checkOffsets )
        {
            log_verbose("\nOffsets:");
            log_verbose(offset_str);
        }
        log_verbose("\n");
    }
    ucnv_close(conv);


    if(expectLen != targ-junkout)
    {
        log_err("Expected %d chars out, got %d %s\n", expectLen, targ-junkout, gNuConvTestName);
        log_verbose("Expected %d chars out, got %d %s\n", expectLen, targ-junkout, gNuConvTestName);
        printSeqErr((const unsigned char*)junkout, (int32_t)(targ-junkout));
        printSeqErr((const unsigned char*)expect, expectLen);
        return FALSE;
    }

    if (checkOffsets && (expectOffsets != 0) )
    {
        log_verbose("\ncomparing %d offsets..\n", targ-junkout);
        if(uprv_memcmp(junokout,expectOffsets,(targ-junkout) * sizeof(int32_t) )){
            log_err("\ndid not get the expected offsets while %s \n", gNuConvTestName);
            log_err("Got  : ");
            printSeqErr((const unsigned char*)junkout, (int32_t)(targ-junkout));
            for(p=junkout;p<targ;p++)
                log_err("%d, ", junokout[p-junkout]); 
            log_err("\nExpected: ");
            for(i=0; i<(targ-junkout); i++)
                log_err("%d,", expectOffsets[i]);
        }
    }

    log_verbose("\n\ncomparing..\n");
    if(!memcmp(junkout, expect, expectLen))
    {
        log_verbose("Matches!\n");
        return TRUE;
    }
    else
    {
        log_err("String does not match. %s\n", gNuConvTestName);
        log_verbose("String does not match. %s\n", gNuConvTestName);
        printSeqErr((const unsigned char*)junkout, expectLen);
        printSeqErr((const unsigned char*)expect, expectLen);
        return FALSE;
    }
}
Ejemplo n.º 6
0
void __hs_ucnv_setFallback(UConverter *cnv, UBool usesFallback)
{
    ucnv_setFallback(cnv, usesFallback);
}