/*
 * Returns an ICU converter for icuName configured according to flags.
 *
 * When flags carries a non-zero stream ID and the per-thread data already
 * holds a converter in the matching slot (stream ID - 1), that converter is
 * returned as-is.  Otherwise a new converter is opened with ucnv_open().
 *
 * Error handling of the new converter:
 *   - no lossy byte requested: conversion stops on the first bad sequence
 *     (to-Unicode or from-Unicode callback depending on toUnicode);
 *   - lossy byte requested (explicitly, or '?' when
 *     kCFStringEncodingAllowLossyConversion is set): that byte becomes the
 *     substitution character.
 *
 * Returns NULL when ucnv_open() yields no converter.  The status of the
 * configuration calls is not inspected.
 */
CF_INLINE UConverter *__CFStringEncodingConverterCreateICUConverter(const char *icuName, uint32_t flags, bool toUnicode) {
    UConverter *result;
    UErrorCode status = U_ZERO_ERROR;
    uint8_t streamID = CFStringEncodingStreamIDFromMask(flags);
    char substitute;

    if (streamID != 0) {
        /* Part of a streaming conversion: try the previously created converter. */
        __CFICUThreadData *threadData = __CFStringEncodingICUGetThreadData();
        uint8_t slot = (uint8_t)(streamID - 1); /* stream IDs are 1-based */

        if ((slot < threadData->_numSlots) && (threadData->_converters[slot] != NULL)) {
            return threadData->_converters[slot];
        }
    }

    result = ucnv_open(icuName, &status);
    if (result == NULL) {
        return NULL;
    }

    substitute = CFStringEncodingMaskToLossyByte(flags);
    if ((substitute == 0) && ((flags & kCFStringEncodingAllowLossyConversion) != 0)) {
        substitute = '?';
    }

    if (substitute != 0) {
        ucnv_setSubstChars(result, &substitute, 1, &status);
    } else if (toUnicode) {
        ucnv_setToUCallBack(result, &UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
    } else {
        ucnv_setFromUCallBack(result, &UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
    }

    return result;
}
static void NativeConverter_setCallbackDecode(JNIEnv* env, jclass, jlong address, jint onMalformedInput, jint onUnmappableInput, jstring javaReplacement) { UConverter* cnv = toUConverter(address); if (cnv == NULL) { maybeThrowIcuException(env, "toConverter", U_ILLEGAL_ARGUMENT_ERROR); return; } UConverterToUCallback oldCallback; const void* oldCallbackContext; ucnv_getToUCallBack(cnv, &oldCallback, &oldCallbackContext); DecoderCallbackContext* callbackContext = const_cast<DecoderCallbackContext*>( reinterpret_cast<const DecoderCallbackContext*>(oldCallbackContext)); if (callbackContext == NULL) { callbackContext = new DecoderCallbackContext; } callbackContext->onMalformedInput = getToUCallback(onMalformedInput); callbackContext->onUnmappableInput = getToUCallback(onUnmappableInput); ScopedStringChars replacement(env, javaReplacement); if (replacement.get() == NULL) { maybeThrowIcuException(env, "replacement", U_ILLEGAL_ARGUMENT_ERROR); return; } u_strncpy(callbackContext->replacementChars, replacement.get(), replacement.size()); callbackContext->replacementCharCount = replacement.size(); UErrorCode errorCode = U_ZERO_ERROR; ucnv_setToUCallBack(cnv, CHARSET_DECODER_CALLBACK, callbackContext, NULL, NULL, &errorCode); maybeThrowIcuException(env, "ucnv_setToUCallBack", errorCode); }
/*
 * Opens the ICU converter that corresponds to `encoding`.
 *
 * With a non-zero lossByte, that byte is installed as the converter's
 * substitution character.  With a zero lossByte, both conversion directions
 * are configured to stop at the first bad sequence.
 *
 * Returns NULL if the converter could not be opened.  A failure while
 * configuring the converter is not reported; the converter is still returned.
 */
static UConverter *
GSStringOpenConverter (CFStringEncoding encoding, char lossByte)
{
  UErrorCode status = U_ZERO_ERROR;
  const char *name = CFStringICUConverterName (encoding);
  UConverter *converter = ucnv_open (name, &status);

  /* ICU setters do nothing once the status is a failure, so bailing out
     here is equivalent to falling through with a NULL converter. */
  if (U_FAILURE (status))
    return NULL;

  if (!lossByte)
    {
      ucnv_setToUCallBack (converter, UCNV_TO_U_CALLBACK_STOP, NULL, NULL,
                           NULL, &status);
      ucnv_setFromUCallBack (converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL,
                             NULL, &status);
    }
  else
    {
      /* FIXME: for some reason this is returning U_ILLEGAL_ARGUMENTS_ERROR */
      ucnv_setSubstChars (converter, &lossByte, 1, &status);
    }

  return converter;
}
// Remembers the converter and, when stopOnError is set, switches its
// to-Unicode error callback to UCNV_TO_U_CALLBACK_STOP. The previous callback
// and context are stored in m_savedAction / m_savedContext.
ErrorCallbackSetter(UConverter* converter, bool stopOnError)
    : m_converter(converter),
      m_shouldStopOnEncodingErrors(stopOnError) {
  if (!m_shouldStopOnEncodingErrors)
    return;

  UErrorCode err = U_ZERO_ERROR;
  ucnv_setToUCallBack(m_converter, UCNV_TO_U_CALLBACK_STOP, 0,
                      &m_savedAction, &m_savedContext, &err);
  DCHECK_EQ(err, U_ZERO_ERROR);
}
/*
 * Suffixed entry point that forwards every argument unchanged to
 * ucnv_setToUCallBack().
 *
 * NOTE(review): the "_53" suffix presumably names an ICU version; this only
 * works if ucnv_setToUCallBack is not itself renamed to the same symbol by
 * ICU's renaming macros -- confirm against the build configuration.
 */
U_STABLE void U_EXPORT2
ucnv_setToUCallBack_53(UConverter *converter,
                       UConverterToUCallback newAction,
                       const void *newContext,
                       UConverterToUCallback *oldAction,
                       const void **oldContext,
                       UErrorCode *err)
{
    ucnv_setToUCallBack(converter, newAction, newContext,
                        oldAction, oldContext, err);
}
// Restores the to-Unicode callback saved in m_savedAction / m_savedContext, if
// the object was asked to stop on encoding errors. In debug builds it also
// verifies that the callback being replaced is still SUBSTITUTE with the
// UCNV_SUB_STOP_ON_ILLEGAL context.
~ErrorCallbackSetter()
{
    if (!m_shouldStopOnEncodingErrors)
        return;

    UErrorCode err = U_ZERO_ERROR;
    UConverterToUCallback oldAction;
    const void* oldContext;
    ucnv_setToUCallBack(m_converter, m_savedAction, m_savedContext,
                        &oldAction, &oldContext, &err);

    ASSERT(oldAction == UCNV_TO_U_CALLBACK_SUBSTITUTE);
    ASSERT(!strcmp(static_cast<const char*>(oldContext), UCNV_SUB_STOP_ON_ILLEGAL));
    ASSERT(err == U_ZERO_ERROR);
}
// Restores the to-Unicode callback saved in m_savedAction / m_savedContext, if
// the object was asked to stop on encoding errors. In debug builds it also
// verifies that the callback being replaced is still STOP with a null context.
~ErrorCallbackSetter() {
  if (!m_shouldStopOnEncodingErrors)
    return;

  UErrorCode err = U_ZERO_ERROR;
  UConverterToUCallback oldAction;
  const void* oldContext;
  ucnv_setToUCallBack(m_converter, m_savedAction, m_savedContext,
                      &oldAction, &oldContext, &err);

  DCHECK_EQ(oldAction, UCNV_TO_U_CALLBACK_STOP);
  DCHECK(!oldContext);
  DCHECK_EQ(err, U_ZERO_ERROR);
}
// Remembers the converter and, when stopOnError is set, installs the
// SUBSTITUTE to-Unicode callback with the UCNV_SUB_STOP_ON_ILLEGAL context.
// The previous callback and context are stored in m_savedAction /
// m_savedContext.
ErrorCallbackSetter(UConverter* converter, bool stopOnError)
    : m_converter(converter)
    , m_shouldStopOnEncodingErrors(stopOnError)
{
    if (!m_shouldStopOnEncodingErrors)
        return;

    UErrorCode err = U_ZERO_ERROR;
    ucnv_setToUCallBack(m_converter, UCNV_TO_U_CALLBACK_SUBSTITUTE,
                        UCNV_SUB_STOP_ON_ILLEGAL,
                        &m_savedAction, &m_savedContext, &err);
    ASSERT(err == U_ZERO_ERROR);
}
int initTxt(struct doc_descriptor *desc) { UErrorCode err; char *encoding = NULL; int len, BOMlength = 0; char buf[BUFSIZE]; UChar outbuf[4*BUFSIZE]; lseek(desc->fd, 0, SEEK_SET); len = read(desc->fd, buf, BUFSIZE); /* detect BOM */ err = U_ZERO_ERROR; encoding = ucnv_detectUnicodeSignature(buf, BUFSIZE, &BOMlength, &err); if(encoding != NULL) { lseek(desc->fd, BOMlength, SEEK_SET); /* initialize converter to encoding */ err = U_ZERO_ERROR; desc->conv = ucnv_open(encoding, &err); if (U_FAILURE(err)) { fprintf(stderr, "unable to open ICU converter\n"); return ERR_ICU; } } else { /* initialize converter to UTF-8 */ err = U_ZERO_ERROR; desc->conv = ucnv_open("utf8", &err); if (U_FAILURE(err)) { fprintf(stderr, "unable to open ICU converter\n"); return ERR_ICU; } /* check the first 2048 bytes */ err = U_ZERO_ERROR; ucnv_setToUCallBack(desc->conv, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, &err); if (U_FAILURE(err)) { fprintf(stderr, "error setToUCallback\n"); return ERR_ICU; } err = U_ZERO_ERROR; ucnv_toUChars(desc->conv, outbuf, 4 * BUFSIZE, buf, len, &err); if (U_FAILURE(err)) { fprintf(stderr, "Unknown encoding\n"); return ERR_ICU; } lseek(desc->fd, 0, SEEK_SET); } return OK; }
/*!	Hands out the calling thread's ICU converter for fGivenCharset.

	A converter is kept per thread (in thread-local storage) to avoid
	concurrent accesses. A cached converter is reused as long as its charset
	name still equals fGivenCharset; otherwise it is closed and replaced by a
	new one that stops on any conversion error, in both directions.

	\param converterOut Receives the converter on success.
	\return B_OK on success, the error from fetching the thread-local value,
		B_NAME_NOT_FOUND if no converter could be opened for the charset,
		or B_ERROR if configuring the new converter failed.
*/
status_t
ICUCategoryData::_GetConverter(UConverter*& converterOut)
{
	ICUThreadLocalStorageValue* threadValue = NULL;
	status_t status = ICUThreadLocalStorageValue::GetInstanceForKey(
		fThreadLocalStorageKey, threadValue);
	if (status != B_OK)
		return status;

	if (threadValue->converter != NULL) {
		if (strcmp(threadValue->charset, fGivenCharset) == 0) {
			converterOut = threadValue->converter;
			return B_OK;
		}

		// the cached converter belongs to another charset, drop it
		ucnv_close(threadValue->converter);
		threadValue->converter = NULL;
	}

	UErrorCode icuStatus = U_ZERO_ERROR;
	UConverter* freshConverter = ucnv_open(fGivenCharset, &icuStatus);
	if (freshConverter == NULL)
		return B_NAME_NOT_FOUND;

	// make the converter stop upon any error, decoding first, then encoding
	icuStatus = U_ZERO_ERROR;
	ucnv_setToUCallBack(freshConverter, UCNV_TO_U_CALLBACK_STOP, NULL, NULL,
		NULL, &icuStatus);
	if (U_SUCCESS(icuStatus)) {
		icuStatus = U_ZERO_ERROR;
		ucnv_setFromUCallBack(freshConverter, UCNV_FROM_U_CALLBACK_STOP, NULL,
			NULL, NULL, &icuStatus);
	}
	if (!U_SUCCESS(icuStatus)) {
		ucnv_close(freshConverter);
		return B_ERROR;
	}

	threadValue->converter = freshConverter;
	strlcpy(threadValue->charset, fGivenCharset, sizeof(threadValue->charset));

	converterOut = freshConverter;
	return B_OK;
}
/**
 * Creates a filtering output stream that converts from charset `source` to
 * charset `dest` before writing to `os`.
 *
 * Opens one ICU converter per charset (m_from for the source, m_to for the
 * destination) and configures how invalid sequences are handled:
 *   - opts.silentlyReplaceInvalidSequences: opts.invalidSequence becomes the
 *     substitution string of the destination converter;
 *   - otherwise: decoding (m_from) and encoding (m_to) both stop with an
 *     error on illegal sequences.
 *
 * @throw exceptions::charset_conv_error if a converter cannot be opened or
 *        configured. Any converter opened so far is closed first, because the
 *        destructor does not run when a constructor throws.
 */
charsetFilteredOutputStream_icu::charsetFilteredOutputStream_icu
	(const charset& source, const charset& dest, outputStream* os,
	 const charsetConverterOptions& opts)
	: m_from(NULL), m_to(NULL), m_sourceCharset(source),
	  m_destCharset(dest), m_stream(*os), m_options(opts)
{
	UErrorCode err = U_ZERO_ERROR;
	m_from = ucnv_open(source.getName().c_str(), &err);

	if (!U_SUCCESS(err))
	{
		throw exceptions::charset_conv_error
			("Cannot initialize ICU converter for source charset '" + source.getName()
			 + "' (error code: " + u_errorName(err) + ").");
	}

	m_to = ucnv_open(dest.getName().c_str(), &err);

	if (!U_SUCCESS(err))
	{
		ucnv_close(m_from);
		m_from = NULL;

		throw exceptions::charset_conv_error
			("Cannot initialize ICU converter for destination charset '" + dest.getName()
			 + "' (error code: " + u_errorName(err) + ").");
	}

	// Tell ICU what to do when encountering an illegal byte sequence
	const char* configError = NULL;

	if (m_options.silentlyReplaceInvalidSequences)
	{
		// Set replacement chars for when converting from Unicode to codepage
		icu::UnicodeString substString(m_options.invalidSequence.c_str());
		ucnv_setSubstString(m_to, substString.getTerminatedBuffer(), -1, &err);

		if (U_FAILURE(err))
			configError = "[ICU] Error when setting substitution string.";
	}
	else
	{
		// Tell ICU to stop (and return an error) on illegal byte sequences.
		// The to-Unicode callback belongs on the converter that decodes the
		// source charset (m_from); it has no effect on m_to.
		ucnv_setToUCallBack
			(m_from, UCNV_TO_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err);

		if (U_FAILURE(err))
		{
			configError = "[ICU] Error when setting ToU callback.";
		}
		else
		{
			ucnv_setFromUCallBack
				(m_to, UCNV_FROM_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err);

			if (U_FAILURE(err))
				configError = "[ICU] Error when setting FromU callback.";
		}
	}

	if (configError != NULL)
	{
		ucnv_close(m_to);
		ucnv_close(m_from);
		m_to = NULL;
		m_from = NULL;

		throw exceptions::charset_conv_error(configError);
	}
}
Bool CodeSet_Validate(const char *buf, // IN: the string size_t size, // IN: length of string const char *code) // IN: encoding { #if defined(NO_ICU) return CodeSetOld_Validate(buf, size, code); #else UConverter *cv; UErrorCode uerr; // ucnv_toUChars takes 32-bit int size ASSERT_NOT_IMPLEMENTED(size <= (size_t) MAX_INT32); if (size == 0) { return TRUE; } /* * Fallback if necessary. */ if (dontUseIcu) { return CodeSetOld_Validate(buf, size, code); } /* * Calling ucnv_toUChars() this way is the idiom to precompute * the length of the output. (See preflighting in the ICU User Guide.) * So if the error is not U_BUFFER_OVERFLOW_ERROR, then the input * is bad. */ uerr = U_ZERO_ERROR; cv = ucnv_open(code, &uerr); ASSERT_NOT_IMPLEMENTED(uerr == U_ZERO_ERROR); ucnv_setToUCallBack(cv, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, &uerr); ASSERT_NOT_IMPLEMENTED(uerr == U_ZERO_ERROR); ucnv_toUChars(cv, NULL, 0, buf, size, &uerr); ucnv_close(cv); return uerr == U_BUFFER_OVERFLOW_ERROR; #endif }
// Requires free() of returned UTF16Chars. void convertUTF8ToUTF16(const NPUTF8 *UTF8Chars, int UTF8Length, NPUTF16 **UTF16Chars, unsigned int *UTF16Length) { #if USE(ICU_UNICODE) assert(UTF8Chars || UTF8Length == 0); assert(UTF16Chars); if (UTF8Length == -1) UTF8Length = static_cast<int>(strlen(UTF8Chars)); // UTF16Length maximum length is the length of the UTF8 string, plus one to include terminator // Without the plus one, it will convert ok, but a warning is generated from the converter as // there is not enough room for a terminating character. *UTF16Length = UTF8Length + 1; *UTF16Chars = 0; UErrorCode status = U_ZERO_ERROR; UConverter* conv = ucnv_open("utf8", &status); if (U_SUCCESS(status)) { *UTF16Chars = (NPUTF16 *)malloc(sizeof(NPUTF16) * (*UTF16Length)); ucnv_setToUCallBack(conv, UCNV_TO_U_CALLBACK_STOP, 0, 0, 0, &status); *UTF16Length = ucnv_toUChars(conv, (::UChar*)*UTF16Chars, *UTF16Length, UTF8Chars, UTF8Length, &status); ucnv_close(conv); } // Check to see if the conversion was successful // Some plugins return invalid UTF-8 in NPVariantType_String, see <http://bugs.webkit.org/show_bug.cgi?id=5163> // There is no "bad data" for latin1. It is unlikely that the plugin was really sending text in this encoding, // but it should have used UTF-8, and now we are simply avoiding a crash. if (!U_SUCCESS(status)) { *UTF16Length = UTF8Length; if (!*UTF16Chars) // If the memory wasn't allocated, allocate it. *UTF16Chars = (NPUTF16 *)malloc(sizeof(NPUTF16) * (*UTF16Length)); for (unsigned i = 0; i < *UTF16Length; i++) (*UTF16Chars)[i] = UTF8Chars[i] & 0xFF; } #else assert(!"Implement me!"); #endif }
/*
 * Opens an ICU converter for `encoding` that stops at the first bad sequence.
 *
 * toUnicode selects which direction gets the STOP callback: the to-Unicode
 * direction when true, the from-Unicode direction otherwise.
 *
 * Returns the converter, or NULL if it could not be opened or configured
 * (in the latter case the converter is closed before returning).
 */
static UConverter *
xh_encoder_uconv_create(xh_char_t *encoding, xh_bool_t toUnicode)
{
    UConverter *uconv;
    UErrorCode  status = U_ZERO_ERROR;

    uconv = ucnv_open((char *) encoding, &status);
    if ( U_FAILURE(status) ) {
        return NULL;
    }

    if (toUnicode) {
        ucnv_setToUCallBack(uconv, UCNV_TO_U_CALLBACK_STOP,
                            NULL, NULL, NULL, &status);
    }
    else {
        ucnv_setFromUCallBack(uconv, UCNV_FROM_U_CALLBACK_STOP,
                              NULL, NULL, NULL, &status);
    }

    /* Do not hand out a converter whose error behaviour was not set up. */
    if ( U_FAILURE(status) ) {
        ucnv_close(uconv);
        return NULL;
    }

    return uconv;
}
UConverter* get_chset_desc(const mstr* chset) { int chset_indx; UErrorCode status; if (0 >= (chset_indx = verify_chset(chset))) return NULL; if (NULL == chset_desc[chset_indx]) { status = U_ZERO_ERROR; chset_desc[chset_indx] = ucnv_open(chset_names[chset_indx].addr, &status); if (U_FAILURE(status)) GTMASSERT; /* Initialize the callback for illegal/invalid characters, so that conversion * stops at the first illegal character rather than continuing with replacement */ status = U_ZERO_ERROR; ucnv_setToUCallBack(chset_desc[chset_indx], &callback_stop, NULL, NULL, NULL, &status); if (U_FAILURE(status)) GTMASSERT; } return chset_desc[chset_indx]; }
/*
 * Decodes the contents of a Python bytes object into `string` using the ICU
 * converter named by `encoding`.
 *
 * When `mode` is "strict", the _stopDecode callback is installed so that the
 * first undecodable sequence aborts the conversion and its details are
 * recorded in a _STOPReason.
 *
 * Returns `string`.
 *
 * Throws ICUException(status) if the converter cannot be opened or
 * configured.  Throws a default ICUException() after setting a Python error
 * when `object` cannot be read as bytes, when memory is exhausted, or when
 * the conversion fails (ValueError describing the offending byte).
 * The converter is closed on every path handled here.
 */
EXPORT UnicodeString &PyBytes_AsUnicodeString(PyObject *object,
                                              const char *encoding,
                                              const char *mode,
                                              UnicodeString &string)
{
    UErrorCode status = U_ZERO_ERROR;
    UConverter *conv = ucnv_open(encoding, &status);

    if (U_FAILURE(status))
        throw ICUException(status);

    _STOPReason stop;
    char *src;
    Py_ssize_t len;
    UChar *buffer, *target;

    memset(&stop, 0, sizeof(stop));

    if (!strcmp(mode, "strict"))
    {
        ucnv_setToUCallBack(conv, _stopDecode, &stop, NULL, NULL, &status);
        if (U_FAILURE(status))
        {
            ucnv_close(conv);
            throw ICUException(status);
        }
    }

    // On failure src and len are left unset (and a Python error is pending),
    // so they must not be used.
    if (PyBytes_AsStringAndSize(object, &src, &len) < 0)
    {
        ucnv_close(conv);
        throw ICUException();
    }

    stop.src = src;
    stop.src_length = len;

    // NOTE(review): a plain new-expression reports failure by throwing rather
    // than returning NULL, so the check below only helps if operator new is
    // non-throwing in this build -- confirm.
    buffer = target = new UChar[len];
    if (buffer == NULL)
    {
        ucnv_close(conv);

        PyErr_NoMemory();
        throw ICUException();
    }

    ucnv_toUnicode(conv, &target, target + len,
                   (const char **) &src, src + len, NULL, true, &status);

    if (U_FAILURE(status))
    {
        const char *reasonName;

        switch (stop.reason) {
          case UCNV_UNASSIGNED:
            reasonName = "the code point is unassigned";
            break;
          case UCNV_ILLEGAL:
            reasonName = "the code point is illegal";
            break;
          case UCNV_IRREGULAR:
            reasonName = "the code point is not a regular sequence in the encoding";
            break;
          default:
            reasonName = "unexpected reason code";
            break;
        }

        // reuse status only to query the converter name for the message
        status = U_ZERO_ERROR;

        PyErr_Format(PyExc_ValueError, "'%s' codec can't decode byte 0x%x in position %d: reason code %d (%s)", ucnv_getName(conv, &status), (int) (unsigned char) stop.chars[0], stop.error_position, stop.reason, reasonName);

        delete[] buffer;
        ucnv_close(conv);

        throw ICUException();
    }

    string.setTo(buffer, target - buffer);

    delete[] buffer;
    ucnv_close(conv);

    return string;
}
// Convert a file from one encoding to another static UBool convertFile(const char *pname, const char *fromcpage, UConverterToUCallback toucallback, const void *touctxt, const char *tocpage, UConverterFromUCallback fromucallback, const void *fromuctxt, int fallback, size_t bufsz, const char *translit, const char *infilestr, FILE * outfile, int verbose) { FILE *infile; UBool ret = TRUE; UConverter *convfrom = 0; UConverter *convto = 0; UErrorCode err = U_ZERO_ERROR; UBool flush; const char *cbufp; char *bufp; char *buf = 0; uint32_t infoffset = 0, outfoffset = 0; /* Where we are in the file, for error reporting. */ const UChar *unibufbp; UChar *unibufp; UChar *unibuf = 0; int32_t *fromoffsets = 0, *tooffsets = 0; size_t rd, wr, tobufsz; #if !UCONFIG_NO_TRANSLITERATION Transliterator *t = 0; // Transliterator acting on Unicode data. #endif UnicodeString u; // String to do the transliteration. // Open the correct input file or connect to stdin for reading input if (infilestr != 0 && strcmp(infilestr, "-")) { infile = fopen(infilestr, "rb"); if (infile == 0) { UnicodeString str1(infilestr, ""); str1.append((UChar32) 0); UnicodeString str2(strerror(errno), ""); str2.append((UChar32) 0); initMsg(pname); u_wmsg(stderr, "cantOpenInputF", str1.getBuffer(), str2.getBuffer()); return FALSE; } } else { infilestr = "-"; infile = stdin; #ifdef WIN32 if (setmode(fileno(stdin), O_BINARY) == -1) { initMsg(pname); u_wmsg(stderr, "cantSetInBinMode"); return FALSE; } #endif } if (verbose) { fprintf(stderr, "%s:\n", infilestr); } #if !UCONFIG_NO_TRANSLITERATION // Create transliterator as needed. if (translit != NULL && *translit) { UParseError parse; UnicodeString str(translit), pestr; /* Create from rules or by ID as needed. 
*/ parse.line = -1; if (uprv_strchr(translit, ':') || uprv_strchr(translit, '>') || uprv_strchr(translit, '<') || uprv_strchr(translit, '>')) { t = Transliterator::createFromRules("Uconv", str, UTRANS_FORWARD, parse, err); } else { t = Transliterator::createInstance(translit, UTRANS_FORWARD, err); } if (U_FAILURE(err)) { str.append((UChar32) 0); initMsg(pname); if (parse.line >= 0) { UChar linebuf[20], offsetbuf[20]; uprv_itou(linebuf, 20, parse.line, 10, 0); uprv_itou(offsetbuf, 20, parse.offset, 10, 0); u_wmsg(stderr, "cantCreateTranslitParseErr", str.getBuffer(), u_wmsg_errorName(err), linebuf, offsetbuf); } else { u_wmsg(stderr, "cantCreateTranslit", str.getBuffer(), u_wmsg_errorName(err)); } if (t) { delete t; t = 0; } goto error_exit; } } #endif // Create codepage converter. If the codepage or its aliases weren't // available, it returns NULL and a failure code. We also set the // callbacks, and return errors in the same way. convfrom = ucnv_open(fromcpage, &err); if (U_FAILURE(err)) { UnicodeString str(fromcpage, (int32_t)(uprv_strlen(fromcpage) + 1)); initMsg(pname); u_wmsg(stderr, "cantOpenFromCodeset", str.getBuffer(), u_wmsg_errorName(err)); goto error_exit; } ucnv_setToUCallBack(convfrom, toucallback, touctxt, 0, 0, &err); if (U_FAILURE(err)) { initMsg(pname); u_wmsg(stderr, "cantSetCallback", u_wmsg_errorName(err)); goto error_exit; } convto = ucnv_open(tocpage, &err); if (U_FAILURE(err)) { UnicodeString str(tocpage, (int32_t)(uprv_strlen(tocpage) + 1)); initMsg(pname); u_wmsg(stderr, "cantOpenToCodeset", str.getBuffer(), u_wmsg_errorName(err)); goto error_exit; } ucnv_setFromUCallBack(convto, fromucallback, fromuctxt, 0, 0, &err); if (U_FAILURE(err)) { initMsg(pname); u_wmsg(stderr, "cantSetCallback", u_wmsg_errorName(err)); goto error_exit; } ucnv_setFallback(convto, fallback); // To ensure that the buffer always is of enough size, we // must take the worst case scenario, that is the character in // the codepage that uses the most bytes and multiply 
it against // the buffer size. // use bufsz+1 to allow for additional BOM/signature character (U+FEFF) tobufsz = (bufsz+1) * ucnv_getMaxCharSize(convto); buf = new char[tobufsz]; unibuf = new UChar[bufsz]; fromoffsets = new int32_t[bufsz]; tooffsets = new int32_t[tobufsz]; // OK, we can convert now. do { char willexit = 0; rd = fread(buf, 1, bufsz, infile); if (ferror(infile) != 0) { UnicodeString str(strerror(errno)); str.append((UChar32) 0); initMsg(pname); u_wmsg(stderr, "cantRead", str.getBuffer()); goto error_exit; } // Convert the read buffer into the new coding // After the call 'unibufp' will be placed on the last // character that was converted in the 'unibuf'. // Also the 'cbufp' is positioned on the last converted // character. // At the last conversion in the file, flush should be set to // true so that we get all characters converted // // The converter must be flushed at the end of conversion so // that characters on hold also will be written. unibufp = unibuf; cbufp = buf; flush = rd != bufsz; ucnv_toUnicode(convfrom, &unibufp, unibufp + bufsz, &cbufp, cbufp + rd, fromoffsets, flush, &err); infoffset += (uint32_t)(cbufp - buf); if (U_FAILURE(err)) { char pos[32]; sprintf(pos, "%u", infoffset - 1); UnicodeString str(pos, (int32_t)(uprv_strlen(pos) + 1)); initMsg(pname); u_wmsg(stderr, "problemCvtToU", str.getBuffer(), u_wmsg_errorName(err)); willexit = 1; err = U_ZERO_ERROR; /* reset the error for the rest of the conversion. */ } // At the last conversion, the converted characters should be // equal to number of chars read. if (flush && !willexit && cbufp != (buf + rd)) { char pos[32]; sprintf(pos, "%u", infoffset); UnicodeString str(pos, (int32_t)(uprv_strlen(pos) + 1)); initMsg(pname); u_wmsg(stderr, "premEndInput", str.getBuffer()); willexit = 1; } // Prepare to transliterate and convert. Transliterate if needed. #if !UCONFIG_NO_TRANSLITERATION if (t) { u.setTo(unibuf, (int32_t)(unibufp - unibuf)); // Copy into string. 
t->transliterate(u); } else #endif { u.setTo(unibuf, (int32_t)(unibufp - unibuf), (int32_t)(bufsz)); // Share the buffer. } int32_t ulen = u.length(); // Convert the Unicode buffer into the destination codepage // Again 'bufp' will be placed on the last converted character // And 'unibufbp' will be placed on the last converted unicode character // At the last conversion flush should be set to true to ensure that // all characters left get converted const UChar *unibufu = unibufbp = u.getBuffer(); do { int32_t len = ulen > (int32_t)bufsz ? (int32_t)bufsz : ulen; bufp = buf; unibufp = (UChar *) (unibufbp + len); ucnv_fromUnicode(convto, &bufp, bufp + tobufsz, &unibufbp, unibufp, tooffsets, flush, &err); if (U_FAILURE(err)) { const char *errtag; char pos[32]; uint32_t erroffset = dataOffset((int32_t)(bufp - buf - 1), fromoffsets, (int32_t)(bufsz), tooffsets, (int32_t)(tobufsz)); int32_t ferroffset = (int32_t)(infoffset - (unibufp - unibufu) + erroffset); if ((int32_t) ferroffset < 0) { ferroffset = (int32_t)(outfoffset + (bufp - buf)); errtag = "problemCvtFromUOut"; } else { errtag = "problemCvtFromU"; } sprintf(pos, "%u", ferroffset); UnicodeString str(pos, (int32_t)(uprv_strlen(pos) + 1)); initMsg(pname); u_wmsg(stderr, errtag, str.getBuffer(), u_wmsg_errorName(err)); willexit = 1; } // At the last conversion, the converted characters should be equal to number // of consumed characters. 
if (flush && !willexit && unibufbp != (unibufu + (size_t) (unibufp - unibufu))) { char pos[32]; sprintf(pos, "%u", infoffset); UnicodeString str(pos, (int32_t)(uprv_strlen(pos) + 1)); initMsg(pname); u_wmsg(stderr, "premEnd", str.getBuffer()); willexit = 1; } // Finally, write the converted buffer to the output file rd = (size_t) (bufp - buf); outfoffset += (int32_t)(wr = fwrite(buf, 1, rd, outfile)); if (wr != rd) { UnicodeString str(strerror(errno), ""); initMsg(pname); u_wmsg(stderr, "cantWrite", str.getBuffer()); willexit = 1; } if (willexit) { goto error_exit; } } while ((ulen -= (int32_t)(bufsz)) > 0); } while (!flush); // Stop when we have flushed the // converters (this means that it's // the end of output) goto normal_exit; error_exit: ret = FALSE; normal_exit: // Cleanup. if (convfrom) ucnv_close(convfrom); if (convto) ucnv_close(convto); #if !UCONFIG_NO_TRANSLITERATION if (t) delete t; #endif if (buf) delete[] buf; if (unibuf) delete[] unibuf; if (fromoffsets) delete[] fromoffsets; if (tooffsets) delete[] tooffsets; if (infile != stdin) { fclose(infile); } return ret; }
void charsetConverter_icu::convert (utility::inputStream& in, utility::outputStream& out, status* st) { UErrorCode err = U_ZERO_ERROR; ucnv_reset(m_from); ucnv_reset(m_to); if (st) new (st) status(); // From buffers byte_t cpInBuffer[16]; // stream data put here const size_t outSize = ucnv_getMinCharSize(m_from) * sizeof(cpInBuffer) * sizeof(UChar); std::vector <UChar> uOutBuffer(outSize); // Unicode chars end up here // To buffers // converted (char) data end up here const size_t cpOutBufferSz = ucnv_getMaxCharSize(m_to) * outSize; std::vector <char> cpOutBuffer(cpOutBufferSz); // Tell ICU what to do when encountering an illegal byte sequence if (m_options.silentlyReplaceInvalidSequences) { // Set replacement chars for when converting from Unicode to codepage icu::UnicodeString substString(m_options.invalidSequence.c_str()); ucnv_setSubstString(m_to, substString.getTerminatedBuffer(), -1, &err); if (U_FAILURE(err)) throw exceptions::charset_conv_error("[ICU] Error when setting substitution string."); } else { // Tell ICU top stop (and return an error) on illegal byte sequences ucnv_setToUCallBack (m_from, UCNV_TO_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err); if (U_FAILURE(err)) throw exceptions::charset_conv_error("[ICU] Error when setting ToU callback."); ucnv_setFromUCallBack (m_to, UCNV_FROM_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err); if (U_FAILURE(err)) throw exceptions::charset_conv_error("[ICU] Error when setting FromU callback."); } // Input data available while (!in.eof()) { // Read input data into buffer size_t inLength = in.read(cpInBuffer, sizeof(cpInBuffer)); // Beginning of read data const char* source = reinterpret_cast <const char*>(&cpInBuffer[0]); const char* sourceLimit = source + inLength; // end + 1 UBool flush = in.eof(); // is this last run? 
UErrorCode toErr; // Loop until all source has been processed do { // Set up target pointers UChar* target = &uOutBuffer[0]; UChar* targetLimit = &target[0] + outSize; toErr = U_ZERO_ERROR; ucnv_toUnicode(m_from, &target, targetLimit, &source, sourceLimit, NULL, flush, &toErr); if (st) st->inputBytesRead += (source - reinterpret_cast <const char*>(&cpInBuffer[0])); if (toErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(toErr)) { if (toErr == U_INVALID_CHAR_FOUND || toErr == U_TRUNCATED_CHAR_FOUND || toErr == U_ILLEGAL_CHAR_FOUND) { // Error will be thrown later (*) } else { throw exceptions::charset_conv_error("[ICU] Error converting to Unicode from " + m_source.getName()); } } // The Unicode source is the buffer just written and the limit // is where the previous conversion stopped (target is moved in the conversion) const UChar* uSource = &uOutBuffer[0]; UChar* uSourceLimit = &target[0]; UErrorCode fromErr; // Loop until converted chars are fully written do { char* cpTarget = &cpOutBuffer[0]; const char* cpTargetLimit = &cpOutBuffer[0] + cpOutBufferSz; fromErr = U_ZERO_ERROR; // Write converted bytes (Unicode) to destination codepage ucnv_fromUnicode(m_to, &cpTarget, cpTargetLimit, &uSource, uSourceLimit, NULL, flush, &fromErr); if (st) { // Decrement input bytes count by the number of input bytes in error char errBytes[16]; int8_t errBytesLen = sizeof(errBytes); UErrorCode errBytesErr = U_ZERO_ERROR; ucnv_getInvalidChars(m_from, errBytes, &errBytesLen, &errBytesErr); st->inputBytesRead -= errBytesLen; st->outputBytesWritten += cpTarget - &cpOutBuffer[0]; } // (*) If an error occurred while converting from input charset, throw it now if (toErr == U_INVALID_CHAR_FOUND || toErr == U_TRUNCATED_CHAR_FOUND || toErr == U_ILLEGAL_CHAR_FOUND) { throw exceptions::illegal_byte_sequence_for_charset(); } if (fromErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(fromErr)) { if (fromErr == U_INVALID_CHAR_FOUND || fromErr == U_TRUNCATED_CHAR_FOUND || fromErr == U_ILLEGAL_CHAR_FOUND) { 
throw exceptions::illegal_byte_sequence_for_charset(); } else { throw exceptions::charset_conv_error("[ICU] Error converting from Unicode to " + m_dest.getName()); } } // Write to destination stream out.write(&cpOutBuffer[0], (cpTarget - &cpOutBuffer[0])); } while (fromErr == U_BUFFER_OVERFLOW_ERROR); } while (toErr == U_BUFFER_OVERFLOW_ERROR); } }
/*
 * Fill the UChar buffer of buf with the next portion of the input.
 *
 * Steps, in order:
 *  1. UChars not yet consumed (between buf->currentPos and buf->bufLimit) are
 *     moved to the start of buf->buffer; their count is `offset`.
 *  2. Bytes are read from buf->in: up to MAX_IN_BUF-offset bytes into a stack
 *     array when buf->isBuffered is set, otherwise T_FileStream_size() bytes
 *     into a heap block. buf->remaining is reduced by the bytes read.
 *  3. With a converter (buf->conv), the bytes are converted after the kept
 *     UChars using the STOP callback. If that fails, a warning with the
 *     surrounding bytes is printed (only when buf->showWarning is TRUE), the
 *     converter is reset, the SUBSTITUTE callback is installed and the bytes
 *     are converted again. The retry reports into a local error code, so
 *     *error keeps the code of the first failure.
 *     Without a converter, the bytes are copied with u_charsToUChars().
 *  4. buf->currentPos / buf->bufLimit are updated and the data is
 *     NUL-terminated.
 *
 * Returns buf, or NULL (with *error set to U_MEMORY_ALLOCATION_ERROR) when the
 * heap block for the unbuffered case cannot be allocated.
 */
static UCHARBUF* ucbuf_fillucbuf( UCHARBUF* buf,UErrorCode* error){
    UChar* pTarget=NULL;
    UChar* target=NULL;
    const char* source=NULL;
    char carr[MAX_IN_BUF] = {'\0'};
    char* cbuf = carr;
    int32_t inputRead=0;
    int32_t outputWritten=0;
    int32_t offset=0;
    const char* sourceLimit =NULL;
    int32_t cbufSize=0;
    pTarget = buf->buffer;
    /* check if we arrived here without exhausting the buffer */
    if(buf->currentPos<buf->bufLimit){
        /* keep the unread UChars: move them to the front of the buffer */
        offset = (int32_t)(buf->bufLimit-buf->currentPos);
        memmove(buf->buffer,buf->currentPos,offset* sizeof(UChar));
    }

#if DEBUG
    memset(pTarget+offset,0xff,sizeof(UChar)*(MAX_IN_BUF-offset));
#endif
    if(buf->isBuffered){
        cbufSize = MAX_IN_BUF;
        /* read the file, leaving room for the `offset` UChars kept above */
        inputRead=T_FileStream_read(buf->in,cbuf,cbufSize-offset);
        buf->remaining-=inputRead;

    }else{
        /* not buffered: read the amount reported by T_FileStream_size() into a heap block */
        cbufSize = T_FileStream_size(buf->in);
        cbuf = (char*)uprv_malloc(cbufSize);
        if (cbuf == NULL) {
            *error = U_MEMORY_ALLOCATION_ERROR;
            return NULL;
        }
        inputRead= T_FileStream_read(buf->in,cbuf,cbufSize);
        buf->remaining-=inputRead;
    }

    /* just to be sure... nothing read means nothing remains */
    if ( 0 == inputRead )
        buf->remaining = 0;

    target=pTarget;
    /* convert the bytes */
    if(buf->conv){
        /* set the callback to stop, so that bad input is reported instead of silently substituted */
        UConverterToUCallback toUOldAction ;
        void* toUOldContext;
        void* toUNewContext=NULL;
        ucnv_setToUCallBack(buf->conv,
            UCNV_TO_U_CALLBACK_STOP,
            toUNewContext,
            &toUOldAction,
            (const void**)&toUOldContext,
            error);
        /* since state is saved in the converter we add offset to source */
        target = pTarget+offset;
        source = cbuf;
        sourceLimit = source + inputRead;
        /* flush the converter only when no input remains */
        ucnv_toUnicode(buf->conv,&target,target+(buf->bufCapacity-offset),
                        &source,sourceLimit,NULL,
                        (UBool)(buf->remaining==0),error);

        if(U_FAILURE(*error)){
            char context[CONTEXT_LEN+1];
            char preContext[CONTEXT_LEN+1];
            char postContext[CONTEXT_LEN+1];
            int8_t len = CONTEXT_LEN;
            int32_t start=0;
            int32_t stop =0;
            int32_t pos =0;
            /* use error1 to preserve the error code */
            UErrorCode error1 =U_ZERO_ERROR;

            if( buf->showWarning==TRUE){
                fprintf(stderr,"\n###WARNING: Encountered abnormal bytes while"
                               " converting input stream to target encoding: %s\n",
                               u_errorName(*error));
            }

            /* now get the context chars (the bytes the converter rejected) */
            ucnv_getInvalidChars(buf->conv,context,&len,&error1);
            context[len]= 0 ; /* null terminate the buffer */

            pos = (int32_t)(source - cbuf - len);

            /* for pre-context */
            /* NOTE(review): when pos < len, stop-start below is negative and is
             * passed to memcpy as a size -- looks unsafe for errors near the
             * start of the input; confirm. */
            start = (pos <=CONTEXT_LEN)? 0 : (pos - (CONTEXT_LEN-1));
            stop = pos-len;

            memcpy(preContext,cbuf+start,stop-start);
            /* null terminate the buffer */
            preContext[stop-start] = 0;

            /* for post-context */
            start = pos+len;
            stop = (int32_t)(((pos+CONTEXT_LEN)<= (sourceLimit-cbuf) )? (pos+(CONTEXT_LEN-1)) : (sourceLimit-cbuf));

            memcpy(postContext,source,stop-start);
            /* null terminate the buffer */
            postContext[stop-start] = 0;

            if(buf->showWarning ==TRUE){
                /* print out the context */
                fprintf(stderr,"\tPre-context: %s\n",preContext);
                fprintf(stderr,"\tContext: %s\n",context);
                fprintf(stderr,"\tPost-context: %s\n", postContext);
            }

            /* reset the converter */
            ucnv_reset(buf->conv);

            /* set the call back to substitute
             * and restart conversion
             */
            ucnv_setToUCallBack(buf->conv,
                UCNV_TO_U_CALLBACK_SUBSTITUTE,
                toUNewContext,
                &toUOldAction,
                (const void**)&toUOldContext,
                &error1);

            /* reset source and target start positions */
            target = pTarget+offset;
            source = cbuf;

            /* re convert; status goes to error1 so that *error keeps the first failure */
            ucnv_toUnicode(buf->conv,&target,target+(buf->bufCapacity-offset),
                            &source,sourceLimit,NULL,
                            (UBool)(buf->remaining==0),&error1);

        }
        outputWritten = (int32_t)(target - pTarget);

#if DEBUG
        /* NOTE(review): numRead is not declared anywhere in this function;
         * a DEBUG build would need it from elsewhere -- confirm. */
        {
            int i;
            target = pTarget;
            for(i=0;i<numRead;i++){
                /* printf("%c", (char)(*target++));*/
            }
        }
#endif

    }else{
        /* no converter: copy the bytes to UChars directly */
        u_charsToUChars(cbuf,target+offset,inputRead);
        outputWritten=((buf->remaining>cbufSize)? cbufSize:inputRead+offset);
    }
    buf->currentPos = pTarget;
    buf->bufLimit=pTarget+outputWritten;
    *buf->bufLimit=0; /*NUL terminate*/
    /* release the heap block used in the unbuffered case */
    if(cbuf!=carr){
        uprv_free(cbuf);
    }
    return buf;
}
Bool CodeSet_GenericToGenericDb(const char *codeIn, // IN const char *bufIn, // IN size_t sizeIn, // IN const char *codeOut, // IN unsigned int flags, // IN DynBuf *db) // IN/OUT { Bool result = FALSE; UErrorCode uerr; const char *bufInCur; const char *bufInEnd; UChar bufPiv[1024]; UChar *bufPivSource; UChar *bufPivTarget; UChar *bufPivEnd; char *bufOut; char *bufOutCur; char *bufOutEnd; size_t bufOutSize; size_t bufOutOffset; UConverter *cvin = NULL; UConverter *cvout = NULL; UConverterToUCallback toUCb; UConverterFromUCallback fromUCb; ASSERT(codeIn); ASSERT(sizeIn == 0 || bufIn); ASSERT(codeOut); ASSERT(db); ASSERT((CSGTG_NORMAL == flags) || (CSGTG_TRANSLIT == flags) || (CSGTG_IGNORE == flags)); if (dontUseIcu) { // fall back return CodeSetOld_GenericToGenericDb(codeIn, bufIn, sizeIn, codeOut, flags, db); } /* * Trivial case. */ if ((0 == sizeIn) || (NULL == bufIn)) { result = TRUE; goto exit; } /* * Open converters. */ uerr = U_ZERO_ERROR; cvin = ucnv_open(codeIn, &uerr); if (!cvin) { goto exit; } uerr = U_ZERO_ERROR; cvout = ucnv_open(codeOut, &uerr); if (!cvout) { goto exit; } /* * Set callbacks according to flags. */ switch (flags) { case CSGTG_NORMAL: toUCb = UCNV_TO_U_CALLBACK_STOP; fromUCb = UCNV_FROM_U_CALLBACK_STOP; break; case CSGTG_TRANSLIT: toUCb = UCNV_TO_U_CALLBACK_SUBSTITUTE; fromUCb = UCNV_FROM_U_CALLBACK_SUBSTITUTE; break; case CSGTG_IGNORE: toUCb = UCNV_TO_U_CALLBACK_SKIP; fromUCb = UCNV_FROM_U_CALLBACK_SKIP; break; default: NOT_IMPLEMENTED(); break; } uerr = U_ZERO_ERROR; ucnv_setToUCallBack(cvin, toUCb, NULL, NULL, NULL, &uerr); if (U_ZERO_ERROR != uerr) { goto exit; } uerr = U_ZERO_ERROR; ucnv_setFromUCallBack(cvout, fromUCb, NULL, NULL, NULL, &uerr); if (U_ZERO_ERROR != uerr) { goto exit; } /* * Convert using ucnv_convertEx(). * As a starting guess, make the output buffer the same size as * the input string (with a fudge constant added in to avoid degen * cases). 
*/ bufInCur = bufIn; bufInEnd = bufIn + sizeIn; bufOutSize = sizeIn + 4; bufOutOffset = 0; bufPivSource = bufPiv; bufPivTarget = bufPiv; bufPivEnd = bufPiv + ARRAYSIZE(bufPiv); for (;;) { if (!DynBuf_Enlarge(db, bufOutSize)) { goto exit; } bufOut = DynBuf_Get(db); bufOutCur = bufOut + bufOutOffset; bufOutSize = DynBuf_GetAllocatedSize(db); bufOutEnd = bufOut + bufOutSize; uerr = U_ZERO_ERROR; ucnv_convertEx(cvout, cvin, &bufOutCur, bufOutEnd, &bufInCur, bufInEnd, bufPiv, &bufPivSource, &bufPivTarget, bufPivEnd, FALSE, TRUE, &uerr); if (!U_FAILURE(uerr)) { /* * "This was a triumph. * I'm making a note here: * HUGE SUCCESS. * It's hard to overstate * my satisfaction." */ break; } if (U_BUFFER_OVERFLOW_ERROR != uerr) { // failure goto exit; } /* * Our guess at 'bufOutSize' was obviously wrong, just double it. * We'll be reallocating bufOut, so will need to recompute bufOutCur * based on bufOutOffset. */ bufOutSize *= 2; bufOutOffset = bufOutCur - bufOut; } /* * Set final size and return. */ DynBuf_SetSize(db, bufOutCur - bufOut); result = TRUE; exit: if (cvin) { ucnv_close(cvin); } if (cvout) { ucnv_close(cvout); } return result; }
/*
 * convert_to_unicode
 *
 * Convert the contents of `buffer`, interpreted in the encoding named by
 * `encoding`, into a newly palloc0'd UTF-16 (UChar) string.
 *
 * Parameters:
 *   buffer        - input text to convert.
 *   encoding      - name of the source encoding, passed to ucnv_open().
 *   uBuf          - out: receives the allocated UChar buffer. The caller
 *                   must pfree it. It may be set even when the conversion
 *                   itself fails.
 *   uBuf_len      - out: on success, converted length in UChars plus one
 *                   for the NUL terminator. Not written on failure.
 *   force         - when true, undecodable input is skipped rather than
 *                   reported as an error, and a flagging callback records
 *                   whether anything was skipped.
 *   dropped_bytes - out: on success, set from the flagging callback's flag
 *                   when `force` is true, otherwise false. Not written on
 *                   failure.
 *
 * Returns the ICU status of the last operation performed; callers should
 * test it with U_SUCCESS()/U_FAILURE(). Failures are also reported through
 * ereport(WARNING, ...).
 */
UErrorCode
convert_to_unicode(const text* buffer, const text* encoding, UChar** uBuf, int32_t *uBuf_len, bool force, bool* dropped_bytes)
{
    UErrorCode status = U_ZERO_ERROR;
    UConverter *conv = NULL;
    int32_t uConvertedLen = 0;

    // used to set dropped_bytes flag if force is true
    ToUFLAGContext * context = NULL;

    size_t uBufSize = 0;

    const char* encoding_cstr = text_to_cstring(encoding);
    char* input_cstr = NULL;

    // open converter for detected encoding
    conv = ucnv_open(encoding_cstr, &status);
    if (U_FAILURE(status))
    {
        ereport(WARNING,
            (errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION),
             errmsg("Cannot open %s converter - error: %s.\n", (const char *) encoding_cstr, u_errorName(status))));
        goto cleanup;
    }

    if (force)
    {
        // Install the stock SKIP callback first; the flagging callback set
        // below captures it as its sub-callback and chains to it.
        ucnv_setToUCallBack(conv, UCNV_TO_U_CALLBACK_SKIP, NULL, NULL, NULL, &status);

        //TODO: refactor warning and error message reporting
        if (U_FAILURE(status))
        {
            ereport(WARNING,
                (errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION),
                 errmsg("Cannot set callback on converter - error: %s.\n", u_errorName(status))));
            goto cleanup;
        }

        // initialize flagging callback
        context = flagCB_toU_openContext();
        if (NULL == context)
        {
            // Without a context we would dereference NULL when handing its
            // members to ucnv_setToUCallBack() below.
            status = U_MEMORY_ALLOCATION_ERROR;
            ereport(WARNING,
                (errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION),
                 errmsg("Cannot set callback on converter - error: %s.\n", u_errorName(status))));
            goto cleanup;
        }

        /* Set our special callback */
        ucnv_setToUCallBack(conv,
                            flagCB_toU,
                            context,
                            &(context->subCallback),
                            &(context->subContext),
                            &status
                           );
        if (U_FAILURE(status))
        {
            // NOTE(review): `context` is not released on this path because
            // the callback that would own it was never installed - confirm
            // how flagCB contexts are meant to be freed.
            ereport(WARNING,
                (errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION),
                 errmsg("Cannot set callback on converter - error: %s.\n", u_errorName(status))));
            goto cleanup;
        }
    }

    // allocate unicode buffer
    // must pfree before exiting calling function
    //
    // uBufSize is the capacity handed to ICU. One extra zeroed element is
    // allocated beyond it so the result is NUL-terminated even when the
    // output fills the capacity exactly (ICU then reports
    // U_STRING_NOT_TERMINATED_WARNING, which still counts as success), and
    // so that "*uBuf_len = uConvertedLen + 1" never exceeds the allocation.
    uBufSize = (VARSIZE_ANY_EXHDR(buffer)/ucnv_getMinCharSize(conv) + 1);
    *uBuf = (UChar*) palloc0((uBufSize + 1) * sizeof(UChar));

    if (*uBuf == NULL)
    {
        status = U_MEMORY_ALLOCATION_ERROR;

        ereport(WARNING,
            (errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION),
             errmsg("Cannot allocate %d bytes for Unicode pivot buffer - error: %s.\n", (int) uBufSize, u_errorName(status))));
        goto cleanup;
    }

    // Make a single NUL-terminated copy of the input; it feeds both the
    // debug message and the conversion, and is released in cleanup.
    input_cstr = text_to_cstring(buffer);

    ereport(DEBUG1,
        (errcode(ERRCODE_SUCCESSFUL_COMPLETION),
         errmsg("Original string: %s\n", (const char*) input_cstr)));

    // convert to Unicode
    // returns length of converted string, not counting NUL-terminator
    uConvertedLen = ucnv_toUChars(conv,
                                  *uBuf,
                                  (int32_t) uBufSize,
                                  (const char*) input_cstr,
                                  STRING_IS_NULL_TERMINATED,
                                  &status
                                 );

    if (U_SUCCESS(status))
    {
        // add 1 for NUL terminator
        *uBuf_len = uConvertedLen + 1;

        // NOTE(review): *uBuf holds UChar (UTF-16) data, so printing it
        // through %s will not show the full converted text - confirm
        // whether this debug message is still wanted in this form.
        ereport(DEBUG1,
            (errcode(ERRCODE_SUCCESSFUL_COMPLETION),
             errmsg("Converted string: %s\n", (const char*) *uBuf)));

        // see if any bytes were dropped
        // (per the original author's note, the context struct goes away
        // when the converter is closed, so read the flag before closing)
        if (NULL != context)
            *dropped_bytes = context->flag;
        else
            *dropped_bytes = false;
    }
    else
    {
        ereport(WARNING,
            (errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION),
             errmsg("ICU conversion from %s to Unicode failed - error: %s.\n", encoding_cstr, u_errorName(status))));
    }

cleanup:
    // Single exit path: release the temporary C strings and the converter.
    if (NULL != input_cstr)
        pfree(input_cstr);

    if (NULL != encoding_cstr)
        pfree((void *) encoding_cstr);

    ucnv_close(conv);

    return status;
}
/**
 * Convert `input_len` bytes of `input`, encoded in `input_char_set`, to
 * UTF-16.
 *
 * @param instance        unused by this function.
 * @param input           bytes to convert.
 * @param input_len       number of bytes in `input`.
 * @param input_char_set  charset name; resolved to a canonical name via
 *                        encoding_alias_get_canonical_name().
 * @param on_error        what to do with undecodable input: fail, skip, or
 *                        substitute '?'. Unknown values behave like "fail".
 * @param output_length   out: number of UTF-16 code units available in the
 *                        returned buffer; set to 0 on failure.
 *
 * @return buffer obtained from ppb_memory_mem_alloc() holding the converted
 *         text, or NULL on allocation failure, unknown charset, or
 *         conversion error. If the result does not fit into the buffer, the
 *         part that fits is returned and `*output_length` is limited to the
 *         buffer capacity.
 */
uint16_t *
ppb_char_set_char_set_to_utf16(PP_Instance instance, const char *input, uint32_t input_len,
                               const char *input_char_set,
                               enum PP_CharSet_ConversionError on_error, uint32_t *output_length)
{
    // Everything the `err` path looks at is initialised before the first
    // `goto err`; previously `u` was still indeterminate when the
    // allocation failed and was then tested in the cleanup code.
    UConverter *u = NULL;
    UErrorCode st = U_ZERO_ERROR;
    const UChar subst = '?';

    // each character could be converted into a surrogate pair
    const uint32_t output_buffer_length = (input_len + 2) * 2 * sizeof(uint16_t);
    const uint32_t output_capacity = output_buffer_length / sizeof(uint16_t);

    uint16_t *output = ppb_memory_mem_alloc(output_buffer_length);
    if (!output) {
        trace_error("%s, can't allocate memory, %u bytes\n", __func__, output_buffer_length);
        goto err;
    }

    const char *charset = encoding_alias_get_canonical_name(input_char_set);

    u = ucnv_open(charset, &st);
    if (!U_SUCCESS(st)) {
        trace_error("%s, wrong charset %s\n", __func__, input_char_set);
        goto err;
    }

    // Callback-setup status is deliberately not checked, matching the
    // original best-effort behaviour; `st` is reset before each call.
    switch (on_error) {
    default:
    case PP_CHARSET_CONVERSIONERROR_FAIL:
        st = U_ZERO_ERROR;
        ucnv_setToUCallBack(u, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, &st);
        break;

    case PP_CHARSET_CONVERSIONERROR_SKIP:
        st = U_ZERO_ERROR;
        ucnv_setToUCallBack(u, UCNV_TO_U_CALLBACK_SKIP, NULL, NULL, NULL, &st);
        break;

    case PP_CHARSET_CONVERSIONERROR_SUBSTITUTE:
        st = U_ZERO_ERROR;
        ucnv_setToUCallBack(u, UCNV_TO_U_CALLBACK_SUBSTITUTE, NULL, NULL, NULL, &st);
        st = U_ZERO_ERROR;
        ucnv_setSubstString(u, &subst, 1, &st);
        break;
    }

    st = U_ZERO_ERROR;
    int32_t converted = ucnv_toUChars(u, output, output_capacity, input, input_len, &st);

    // A too-small buffer is tolerated (truncated output); anything else
    // that is not a success is a hard failure.
    if (st != U_BUFFER_OVERFLOW_ERROR && !U_SUCCESS(st))
        goto err;

    // On overflow ucnv_toUChars() returns the length that would have been
    // needed, which is larger than the buffer. Never report more code
    // units than the buffer can actually hold.
    if (converted < 0)
        converted = 0;
    *output_length = ((uint32_t)converted > output_capacity) ? output_capacity
                                                             : (uint32_t)converted;

    ucnv_close(u);
    return output;

err:
    *output_length = 0;
    ppb_memory_mem_free(output);
    if (u)
        ucnv_close(u);
    return NULL;
}