void TextCodecICU::createICUConverter() const { ASSERT(!m_converterICU); #if defined(USING_SYSTEM_ICU) const char* name = m_encoding.name(); m_needsGBKFallbacks = name[0] == 'G' && name[1] == 'B' && name[2] == 'K' && !name[3]; #endif UErrorCode err; UConverter*& cachedConverter = cachedConverterICU(); if (cachedConverter) { err = U_ZERO_ERROR; const char* cachedName = ucnv_getName(cachedConverter, &err); if (U_SUCCESS(err) && m_encoding == cachedName) { m_converterICU = cachedConverter; cachedConverter = 0; return; } } err = U_ZERO_ERROR; m_converterICU = ucnv_open(m_encoding.name(), &err); #if !LOG_DISABLED if (err == U_AMBIGUOUS_ALIAS_WARNING) WTF_LOG_ERROR("ICU ambiguous alias warning for encoding: %s", m_encoding.name()); #endif if (m_converterICU) ucnv_setFallback(m_converterICU, TRUE); }
void TextCodecICU::createICUConverter() const { ASSERT(!m_converterICU); UErrorCode err; m_needsGBKFallbacks = !strcmp(m_encodingName, "GBK"); UConverter*& cachedConverter = cachedConverterICU(); if (cachedConverter) { err = U_ZERO_ERROR; const char* cachedConverterName = ucnv_getName(cachedConverter, &err); if (U_SUCCESS(err) && !strcmp(m_canonicalConverterName, cachedConverterName)) { m_converterICU = cachedConverter; cachedConverter = 0; return; } } err = U_ZERO_ERROR; m_converterICU = ucnv_open(m_canonicalConverterName, &err); ASSERT(U_SUCCESS(err)); if (m_converterICU) ucnv_setFallback(m_converterICU, TRUE); }
// Convert a file from one encoding to another static UBool convertFile(const char *pname, const char *fromcpage, UConverterToUCallback toucallback, const void *touctxt, const char *tocpage, UConverterFromUCallback fromucallback, const void *fromuctxt, int fallback, size_t bufsz, const char *translit, const char *infilestr, FILE * outfile, int verbose) { FILE *infile; UBool ret = TRUE; UConverter *convfrom = 0; UConverter *convto = 0; UErrorCode err = U_ZERO_ERROR; UBool flush; const char *cbufp; char *bufp; char *buf = 0; uint32_t infoffset = 0, outfoffset = 0; /* Where we are in the file, for error reporting. */ const UChar *unibufbp; UChar *unibufp; UChar *unibuf = 0; int32_t *fromoffsets = 0, *tooffsets = 0; size_t rd, wr, tobufsz; #if !UCONFIG_NO_TRANSLITERATION Transliterator *t = 0; // Transliterator acting on Unicode data. #endif UnicodeString u; // String to do the transliteration. // Open the correct input file or connect to stdin for reading input if (infilestr != 0 && strcmp(infilestr, "-")) { infile = fopen(infilestr, "rb"); if (infile == 0) { UnicodeString str1(infilestr, ""); str1.append((UChar32) 0); UnicodeString str2(strerror(errno), ""); str2.append((UChar32) 0); initMsg(pname); u_wmsg(stderr, "cantOpenInputF", str1.getBuffer(), str2.getBuffer()); return FALSE; } } else { infilestr = "-"; infile = stdin; #ifdef WIN32 if (setmode(fileno(stdin), O_BINARY) == -1) { initMsg(pname); u_wmsg(stderr, "cantSetInBinMode"); return FALSE; } #endif } if (verbose) { fprintf(stderr, "%s:\n", infilestr); } #if !UCONFIG_NO_TRANSLITERATION // Create transliterator as needed. if (translit != NULL && *translit) { UParseError parse; UnicodeString str(translit), pestr; /* Create from rules or by ID as needed. 
*/ parse.line = -1; if (uprv_strchr(translit, ':') || uprv_strchr(translit, '>') || uprv_strchr(translit, '<') || uprv_strchr(translit, '>')) { t = Transliterator::createFromRules("Uconv", str, UTRANS_FORWARD, parse, err); } else { t = Transliterator::createInstance(translit, UTRANS_FORWARD, err); } if (U_FAILURE(err)) { str.append((UChar32) 0); initMsg(pname); if (parse.line >= 0) { UChar linebuf[20], offsetbuf[20]; uprv_itou(linebuf, 20, parse.line, 10, 0); uprv_itou(offsetbuf, 20, parse.offset, 10, 0); u_wmsg(stderr, "cantCreateTranslitParseErr", str.getBuffer(), u_wmsg_errorName(err), linebuf, offsetbuf); } else { u_wmsg(stderr, "cantCreateTranslit", str.getBuffer(), u_wmsg_errorName(err)); } if (t) { delete t; t = 0; } goto error_exit; } } #endif // Create codepage converter. If the codepage or its aliases weren't // available, it returns NULL and a failure code. We also set the // callbacks, and return errors in the same way. convfrom = ucnv_open(fromcpage, &err); if (U_FAILURE(err)) { UnicodeString str(fromcpage, (int32_t)(uprv_strlen(fromcpage) + 1)); initMsg(pname); u_wmsg(stderr, "cantOpenFromCodeset", str.getBuffer(), u_wmsg_errorName(err)); goto error_exit; } ucnv_setToUCallBack(convfrom, toucallback, touctxt, 0, 0, &err); if (U_FAILURE(err)) { initMsg(pname); u_wmsg(stderr, "cantSetCallback", u_wmsg_errorName(err)); goto error_exit; } convto = ucnv_open(tocpage, &err); if (U_FAILURE(err)) { UnicodeString str(tocpage, (int32_t)(uprv_strlen(tocpage) + 1)); initMsg(pname); u_wmsg(stderr, "cantOpenToCodeset", str.getBuffer(), u_wmsg_errorName(err)); goto error_exit; } ucnv_setFromUCallBack(convto, fromucallback, fromuctxt, 0, 0, &err); if (U_FAILURE(err)) { initMsg(pname); u_wmsg(stderr, "cantSetCallback", u_wmsg_errorName(err)); goto error_exit; } ucnv_setFallback(convto, fallback); // To ensure that the buffer always is of enough size, we // must take the worst case scenario, that is the character in // the codepage that uses the most bytes and multiply 
it against // the buffer size. // use bufsz+1 to allow for additional BOM/signature character (U+FEFF) tobufsz = (bufsz+1) * ucnv_getMaxCharSize(convto); buf = new char[tobufsz]; unibuf = new UChar[bufsz]; fromoffsets = new int32_t[bufsz]; tooffsets = new int32_t[tobufsz]; // OK, we can convert now. do { char willexit = 0; rd = fread(buf, 1, bufsz, infile); if (ferror(infile) != 0) { UnicodeString str(strerror(errno)); str.append((UChar32) 0); initMsg(pname); u_wmsg(stderr, "cantRead", str.getBuffer()); goto error_exit; } // Convert the read buffer into the new coding // After the call 'unibufp' will be placed on the last // character that was converted in the 'unibuf'. // Also the 'cbufp' is positioned on the last converted // character. // At the last conversion in the file, flush should be set to // true so that we get all characters converted // // The converter must be flushed at the end of conversion so // that characters on hold also will be written. unibufp = unibuf; cbufp = buf; flush = rd != bufsz; ucnv_toUnicode(convfrom, &unibufp, unibufp + bufsz, &cbufp, cbufp + rd, fromoffsets, flush, &err); infoffset += (uint32_t)(cbufp - buf); if (U_FAILURE(err)) { char pos[32]; sprintf(pos, "%u", infoffset - 1); UnicodeString str(pos, (int32_t)(uprv_strlen(pos) + 1)); initMsg(pname); u_wmsg(stderr, "problemCvtToU", str.getBuffer(), u_wmsg_errorName(err)); willexit = 1; err = U_ZERO_ERROR; /* reset the error for the rest of the conversion. */ } // At the last conversion, the converted characters should be // equal to number of chars read. if (flush && !willexit && cbufp != (buf + rd)) { char pos[32]; sprintf(pos, "%u", infoffset); UnicodeString str(pos, (int32_t)(uprv_strlen(pos) + 1)); initMsg(pname); u_wmsg(stderr, "premEndInput", str.getBuffer()); willexit = 1; } // Prepare to transliterate and convert. Transliterate if needed. #if !UCONFIG_NO_TRANSLITERATION if (t) { u.setTo(unibuf, (int32_t)(unibufp - unibuf)); // Copy into string. 
t->transliterate(u); } else #endif { u.setTo(unibuf, (int32_t)(unibufp - unibuf), (int32_t)(bufsz)); // Share the buffer. } int32_t ulen = u.length(); // Convert the Unicode buffer into the destination codepage // Again 'bufp' will be placed on the last converted character // And 'unibufbp' will be placed on the last converted unicode character // At the last conversion flush should be set to true to ensure that // all characters left get converted const UChar *unibufu = unibufbp = u.getBuffer(); do { int32_t len = ulen > (int32_t)bufsz ? (int32_t)bufsz : ulen; bufp = buf; unibufp = (UChar *) (unibufbp + len); ucnv_fromUnicode(convto, &bufp, bufp + tobufsz, &unibufbp, unibufp, tooffsets, flush, &err); if (U_FAILURE(err)) { const char *errtag; char pos[32]; uint32_t erroffset = dataOffset((int32_t)(bufp - buf - 1), fromoffsets, (int32_t)(bufsz), tooffsets, (int32_t)(tobufsz)); int32_t ferroffset = (int32_t)(infoffset - (unibufp - unibufu) + erroffset); if ((int32_t) ferroffset < 0) { ferroffset = (int32_t)(outfoffset + (bufp - buf)); errtag = "problemCvtFromUOut"; } else { errtag = "problemCvtFromU"; } sprintf(pos, "%u", ferroffset); UnicodeString str(pos, (int32_t)(uprv_strlen(pos) + 1)); initMsg(pname); u_wmsg(stderr, errtag, str.getBuffer(), u_wmsg_errorName(err)); willexit = 1; } // At the last conversion, the converted characters should be equal to number // of consumed characters. 
if (flush && !willexit && unibufbp != (unibufu + (size_t) (unibufp - unibufu))) { char pos[32]; sprintf(pos, "%u", infoffset); UnicodeString str(pos, (int32_t)(uprv_strlen(pos) + 1)); initMsg(pname); u_wmsg(stderr, "premEnd", str.getBuffer()); willexit = 1; } // Finally, write the converted buffer to the output file rd = (size_t) (bufp - buf); outfoffset += (int32_t)(wr = fwrite(buf, 1, rd, outfile)); if (wr != rd) { UnicodeString str(strerror(errno), ""); initMsg(pname); u_wmsg(stderr, "cantWrite", str.getBuffer()); willexit = 1; } if (willexit) { goto error_exit; } } while ((ulen -= (int32_t)(bufsz)) > 0); } while (!flush); // Stop when we have flushed the // converters (this means that it's // the end of output) goto normal_exit; error_exit: ret = FALSE; normal_exit: // Cleanup. if (convfrom) ucnv_close(convfrom); if (convto) ucnv_close(convto); #if !UCONFIG_NO_TRANSLITERATION if (t) delete t; #endif if (buf) delete[] buf; if (unibuf) delete[] unibuf; if (fromoffsets) delete[] fromoffsets; if (tooffsets) delete[] tooffsets; if (infile != stdin) { fclose(infile); } return ret; }
/*
 * Converts `sourcelen` bytes at `source` to Unicode with the converter named
 * `codepage` and compares the result with `expect` (`expectlen` UChars).
 *
 * The conversion is driven in slices of gInBufferSize input bytes and
 * gOutBufferSize output units. Offsets are requested and checked against
 * `expectOffsets` (when non-NULL) only if both sizes cover the whole local
 * buffer, i.e. the conversion happens in a single call.
 *
 * `fallback` is applied with ucnv_setFallback() and read back with
 * ucnv_usesFallback(); a mismatch is logged as an error.
 *
 * Returns TRUE if the first `expectlen` output UChars equal `expect`, and also
 * when the converter cannot be opened (that case is logged as a data error).
 * Returns FALSE on conversion failure, output-buffer exhaustion, or mismatch.
 * The converter is closed on every path after a successful open.
 */
static UBool testConvertToUnicode(const uint8_t *source, int sourcelen,
                                  const UChar *expect, int expectlen,
                                  const char *codepage, UBool fallback,
                                  const int32_t *expectOffsets)
{
    UErrorCode status = U_ZERO_ERROR;
    UConverter *conv = 0;
    UChar junkout[NEW_MAX_BUFFER];     /* FIX */
    int32_t junokout[NEW_MAX_BUFFER];  /* FIX */
    const char *src;
    const char *realSourceEnd;
    const char *srcLimit;
    UChar *targ;
    UChar *end;
    int32_t *offs;
    int i;
    UBool checkOffsets = TRUE;
    char junk[9999];
    char offset_str[9999];
    UChar *p;
    UBool action;

    int32_t realBufferSize;
    UChar *realBufferEnd;

    /* Pre-fill the output and offset buffers with recognisable sentinels. */
    for(i=0;i<NEW_MAX_BUFFER;i++)
        junkout[i] = 0xFFFE;

    for(i=0;i<NEW_MAX_BUFFER;i++)
        junokout[i] = -1;

    setNuConvTestName(codepage, "TO");

    log_verbose("\n=========  %s\n", gNuConvTestName);

    conv = my_ucnv_open(codepage, &status);
    if(U_FAILURE(status))
    {
        log_data_err("Couldn't open converter %s\n",gNuConvTestName);
        return TRUE; /* because it has been logged */
    }

    log_verbose("Converter opened..\n");

    src = (const char *)source;
    targ = junkout;
    offs = junokout;

    realBufferSize = (sizeof(junkout)/sizeof(junkout[0]));
    realBufferEnd = junkout + realBufferSize;
    realSourceEnd = src + sourcelen;

    /*----setting the fallback routine----*/
    ucnv_setFallback (conv, fallback);
    action = ucnv_usesFallback(conv);
    if(action != fallback){
        log_err("FAIL: Error is setting fallback. Errocode=%s\n", myErrorName(status));
    }
    /*-------------------------------------*/

    /* Offsets are only meaningful when everything is converted in one call. */
    if ( gOutBufferSize != realBufferSize )
        checkOffsets = FALSE;

    if( gInBufferSize != NEW_MAX_BUFFER )
        checkOffsets = FALSE;

    do
    {
        end = nct_min( targ + gOutBufferSize, realBufferEnd);
        srcLimit = nct_min(realSourceEnd, src + gInBufferSize);

        if(targ == realBufferEnd)
        {
            log_err("Error, the end would overflow the real output buffer while about to call toUnicode! tarjey=%08lx %s",targ,gNuConvTestName);
            ucnv_close(conv); /* do not leak the converter on this early exit */
            return FALSE;
        }
        log_verbose("calling toUnicode @ %08lx to %08lx\n", targ,end);

        status = U_ZERO_ERROR;

        ucnv_toUnicode (conv,
                        &targ,
                        end,
                        (const char **)&src,
                        (const char *)srcLimit,
                        checkOffsets ? offs : NULL,
                        (UBool)(srcLimit == realSourceEnd), /* flush if we're at the end of the source data */
                        &status);
    } while ( (status == U_BUFFER_OVERFLOW_ERROR) || (srcLimit < realSourceEnd) ); /* while we just need another buffer */

    if(U_FAILURE(status))
    {
        log_err("Problem doing toUnicode, errcode %s %s\n", myErrorName(status), gNuConvTestName);
        ucnv_close(conv); /* do not leak the converter on this early exit */
        return FALSE;
    }

    log_verbose("\nConversion done. %d bytes -> %d chars.\nResult :",
                sourcelen, targ-junkout);
    if(VERBOSITY)
    {
        junk[0] = 0;
        offset_str[0] = 0;

        for(p = junkout;p<targ;p++)
        {
            sprintf(junk + strlen(junk), "0x%04x, ", (0xFFFF) & (unsigned int)*p);
            sprintf(offset_str + strlen(offset_str), "0x%04x, ", (0xFFFF) & (unsigned int)junokout[p-junkout]);
        }

        /* Pass the dumps as data, never as the format string itself. */
        log_verbose("%s", junk);
        printUSeq(expect, expectlen);
        if ( checkOffsets )
        {
            log_verbose("\nOffsets:");
            log_verbose("%s", offset_str);
        }
        log_verbose("\n");
    }
    ucnv_close(conv);

    log_verbose("comparing %d uchars (%d bytes)..\n",expectlen,expectlen*2);

    if (checkOffsets && (expectOffsets != 0))
    {
        /* An offset mismatch is logged but does not by itself fail the test. */
        if(memcmp(junokout,expectOffsets,(targ-junkout) * sizeof(int32_t)))
        {
            log_err("\n\ndid not get the expected offsets while %s \n", gNuConvTestName);
            log_err("\nGot  : ");
            for(p=junkout;p<targ;p++)
                log_err("%d, ", junokout[p-junkout]);
            log_err("\nExpected: ");
            for(i=0; i<(targ-junkout); i++)
                log_err("%d,", expectOffsets[i]);
            log_err("");
            for(i=0; i<(targ-junkout); i++)
                log_err("0x%04X,", junkout[i]);
            log_err("");
            for(i=0; i<(src-(const char *)source); i++)
                log_err("0x%04X,", (unsigned char)source[i]);
        }
    }

    if(!memcmp(junkout, expect, expectlen*2))
    {
        log_verbose("Matches!\n");
        return TRUE;
    }
    else
    {
        log_err("String does not match. %s\n", gNuConvTestName);
        log_verbose("String does not match. %s\n", gNuConvTestName);
        printUSeqErr(junkout, expectlen);
        printf("\n");
        printUSeqErr(expect, expectlen);
        return FALSE;
    }
}
/*
 * Converts `sourceLen` UChars at `source` to the codepage named `codepage` and
 * compares the produced bytes with `expect` (`expectLen` bytes).
 *
 * The conversion is driven in slices of gInBufferSize input units and
 * gOutBufferSize output bytes. Offsets are requested and checked against
 * `expectOffsets` (when non-NULL) only if both sizes cover the whole local
 * buffer, i.e. the conversion happens in a single call.
 *
 * `fallback` is applied with ucnv_setFallback() and read back with
 * ucnv_usesFallback(); a mismatch is logged as an error.
 *
 * Returns TRUE if the output length and bytes equal `expect`, and also when the
 * converter cannot be opened (that case is logged as a data error). Returns
 * FALSE on conversion failure, output-buffer exhaustion, or any mismatch in
 * length or content. The converter is closed on every path after a successful
 * open.
 */
static UBool testConvertFromUnicode(const UChar *source, int sourceLen,
                                    const uint8_t *expect, int expectLen,
                                    const char *codepage, UBool fallback,
                                    const int32_t *expectOffsets)
{
    UErrorCode status = U_ZERO_ERROR;
    UConverter *conv = 0;
    char junkout[NEW_MAX_BUFFER];      /* FIX */
    int32_t junokout[NEW_MAX_BUFFER];  /* FIX */
    const UChar *src;
    char *end;
    char *targ;
    int32_t *offs;
    int i;
    int32_t realBufferSize;
    char *realBufferEnd;
    const UChar *realSourceEnd;
    const UChar *sourceLimit;
    UBool checkOffsets = TRUE;
    UBool doFlush;
    UBool action=FALSE;
    char *p;

    /* Pre-fill the output and offset buffers with recognisable sentinels. */
    for(i=0;i<NEW_MAX_BUFFER;i++)
        junkout[i] = (char)0xF0;
    for(i=0;i<NEW_MAX_BUFFER;i++)
        junokout[i] = 0xFF;
    setNuConvTestName(codepage, "FROM");

    log_verbose("\nTesting========= %s  FROM \n  inputbuffer= %d   outputbuffer= %d\n", codepage, gInBufferSize,
                gOutBufferSize);

    conv = my_ucnv_open(codepage, &status);
    if(U_FAILURE(status))
    {
        log_data_err("Couldn't open converter %s\n",codepage);
        return TRUE;
    }

    log_verbose("Converter opened..\n");

    /*----setting the fallback routine----*/
    ucnv_setFallback (conv, fallback);
    action = ucnv_usesFallback(conv);
    if(action != fallback){
        log_err("FAIL: Error is setting fallback. Errocode=%s\n", myErrorName(status));
    }
    /*------------------------*/

    src = source;
    targ = junkout;
    offs = junokout;

    realBufferSize = (sizeof(junkout)/sizeof(junkout[0]));
    realBufferEnd = junkout + realBufferSize;
    realSourceEnd = source + sourceLen;

    /* Offsets are only meaningful when everything is converted in one call. */
    if ( gOutBufferSize != realBufferSize )
        checkOffsets = FALSE;

    if( gInBufferSize != NEW_MAX_BUFFER )
        checkOffsets = FALSE;

    do
    {
        end = nct_min(targ + gOutBufferSize, realBufferEnd);
        sourceLimit = nct_min(src + gInBufferSize, realSourceEnd);

        doFlush = (UBool)(sourceLimit == realSourceEnd);

        if(targ == realBufferEnd)
        {
            log_err("Error, overflowed the real buffer while about to call fromUnicode! targ=%08lx %s", targ, gNuConvTestName);
            ucnv_close(conv); /* do not leak the converter on this early exit */
            return FALSE;
        }
        log_verbose("calling fromUnicode @ SOURCE:%08lx to %08lx  TARGET: %08lx to %08lx, flush=%s\n", src,sourceLimit, targ,end, doFlush?"TRUE":"FALSE");

        status = U_ZERO_ERROR;

        ucnv_fromUnicode (conv,
                          (char **)&targ,
                          (const char *)end,
                          &src,
                          sourceLimit,
                          checkOffsets ? offs : NULL,
                          doFlush, /* flush if we're at the end of the input data */
                          &status);
    } while ( (status == U_BUFFER_OVERFLOW_ERROR) || (sourceLimit < realSourceEnd) );

    if(U_FAILURE(status))
    {
        /* myErrorName()'s result is printed with %s, as at the other call sites. */
        log_err("Problem doing fromUnicode, errcode %s %s\n", myErrorName(status), gNuConvTestName);
        ucnv_close(conv); /* do not leak the converter on this early exit */
        return FALSE;
    }

    log_verbose("\nConversion done [%d uchars in -> %d chars out]. \nResult :",
                sourceLen, targ-junkout);
    if(VERBOSITY)
    {
        char junk[9999];
        char offset_str[9999];

        junk[0] = 0;
        offset_str[0] = 0;
        for(p = junkout;p<targ;p++)
        {
            sprintf(junk + uprv_strlen(junk), "0x%02x, ", (0xFF) & (unsigned int)*p);
            sprintf(offset_str + strlen(offset_str), "0x%02x, ", (0xFF) & (unsigned int)junokout[p-junkout]);
        }

        /* Pass the dumps as data, never as the format string itself. */
        log_verbose("%s", junk);
        printSeq((const unsigned char*)expect, expectLen);
        if ( checkOffsets )
        {
            log_verbose("\nOffsets:");
            log_verbose("%s", offset_str);
        }
        log_verbose("\n");
    }
    ucnv_close(conv);

    if(expectLen != targ-junkout)
    {
        log_err("Expected %d chars out, got %d %s\n", expectLen, targ-junkout, gNuConvTestName);
        log_verbose("Expected %d chars out, got %d %s\n", expectLen, targ-junkout, gNuConvTestName);
        printSeqErr((const unsigned char*)junkout, (int32_t)(targ-junkout));
        printSeqErr((const unsigned char*)expect, expectLen);
        return FALSE;
    }

    if (checkOffsets && (expectOffsets != 0) )
    {
        log_verbose("\ncomparing %d offsets..\n", targ-junkout);
        /* An offset mismatch is logged but does not by itself fail the test. */
        if(uprv_memcmp(junokout,expectOffsets,(targ-junkout) * sizeof(int32_t) )){
            log_err("\ndid not get the expected offsets while %s \n", gNuConvTestName);
            log_err("Got  : ");
            printSeqErr((const unsigned char*)junkout, (int32_t)(targ-junkout));
            for(p=junkout;p<targ;p++)
                log_err("%d, ", junokout[p-junkout]);
            log_err("\nExpected: ");
            for(i=0; i<(targ-junkout); i++)
                log_err("%d,", expectOffsets[i]);
        }
    }

    log_verbose("\n\ncomparing..\n");
    if(!memcmp(junkout, expect, expectLen))
    {
        log_verbose("Matches!\n");
        return TRUE;
    }
    else
    {
        log_err("String does not match. %s\n", gNuConvTestName);
        log_verbose("String does not match. %s\n", gNuConvTestName);
        printSeqErr((const unsigned char*)junkout, expectLen);
        printSeqErr((const unsigned char*)expect, expectLen);
        return FALSE;
    }
}
/*
 * Forwards to ucnv_setFallback(), passing `cnv` and `usesFallback` through
 * unchanged; it adds no checks of its own.
 * NOTE(review): the `__hs_` prefix suggests this exists as a binding shim for
 * another language - confirm against the caller.
 */
void __hs_ucnv_setFallback(UConverter *cnv, UBool usesFallback) { ucnv_setFallback(cnv, usesFallback); }