/**
 * Converts a UChar string to the default codepage, writing at most n bytes
 * into s1.
 *
 * The source range handed to the converter ends at
 * ucs2 + u_ustrnlen(ucs2, n). The output is NUL-terminated only when fewer
 * than n bytes were produced. If the default converter cannot be obtained,
 * or the conversion fails for a reason other than U_BUFFER_OVERFLOW_ERROR,
 * the first byte of s1 is set to 0.
 *
 * @param s1   destination buffer
 * @param ucs2 source UChar string
 * @param n    capacity of s1 in bytes; also the bound passed to u_ustrnlen
 * @return s1
 */
U_CAPI char* U_EXPORT2
u_austrncpy(char *s1, const UChar *ucs2, int32_t n)
{
    char *out = s1;
    UErrorCode status = U_ZERO_ERROR;
    UConverter *defaultCnv = u_getDefaultConverter(&status);

    if (U_FAILURE(status) || defaultCnv == NULL) {
        /* no converter available: hand back an empty string */
        *s1 = 0;
        return s1;
    }

    ucnv_reset(defaultCnv);
    ucnv_fromUnicode(defaultCnv,
                     &out, s1 + n,
                     &ucs2, ucs2 + u_ustrnlen(ucs2, n),
                     NULL,
                     TRUE,
                     &status);
    ucnv_reset(defaultCnv); /* leave the shared converter in a clean state */
    u_releaseDefaultConverter(defaultCnv);

    if (status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(status)) {
        *s1 = 0; /* real failure */
    }
    /* An overflow is not treated as an error here; it only means that
       there is no room left for the terminator. */
    if (out < s1 + n) {
        *out = 0;
    }
    return s1;
}
/**
 * Returns a newly allocated (ctst_malloc) NUL-terminated char copy of a
 * UChar string, converted with a fixed invariant-charset converter
 * ("US-ASCII", or an EBCDIC codepage on EBCDIC platforms). The
 * from-Unicode callback is set to UCNV_FROM_U_CALLBACK_ESCAPE with
 * UCNV_ESCAPE_C.
 *
 * @param unichars source UChars
 * @param length   number of UChars, or -1 to use u_strlen(unichars)
 * @return the converted string; 8 bytes are reserved per input UChar
 *         (plus 8 more)
 */
char *aescstrdup(const UChar* unichars, int32_t length)
{
#if U_CHARSET_FAMILY==U_EBCDIC_FAMILY
#   if U_PLATFORM == U_PF_OS390
    static const char convName[] = "ibm-1047";
#   else
    static const char convName[] = "ibm-37";
#   endif
#else
    static const char convName[] = "US-ASCII";
#endif
    UErrorCode status = U_ZERO_ERROR;
    UConverterFromUCallback previousCallback;
    const void *previousContext;
    char *result, *out, *outLimit;
    UConverter *escaper = ucnv_open(convName, &status);

    if (length == -1) {
        length = u_strlen(unichars);
    }

    result = (char*)ctst_malloc(sizeof(char) * 8 * (length + 1));
    out = result;
    outLimit = result + sizeof(char) * 8 * (length + 1);

    /* Escape anything the target charset cannot represent. */
    ucnv_setFromUCallBack(escaper, UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_C,
                          &previousCallback, &previousContext, &status);
    ucnv_fromUnicode(escaper, &out, outLimit,
                     &unichars, (UChar*)(unichars + length),
                     NULL, TRUE, &status);
    ucnv_close(escaper);

    *out = '\0';
    return result;
}
// Converts `length` UTF-16 code units starting at `unicode` into this
// codec's byte encoding using the ICU converter returned by
// getConverter(state).
//
// The output buffer starts at UCNV_GET_MAX_BYTES_FOR_STRING() bytes and is
// doubled only when ICU reports U_BUFFER_OVERFLOW_ERROR. Any non-success
// status is logged through qDebug(). When no ConverterState is supplied,
// the converter is closed before returning.
//
// Returns the converted bytes, trimmed to the number of bytes produced.
QByteArray QIcuCodec::convertFromUnicode(const QChar *unicode, int length, QTextCodec::ConverterState *state) const
{
    UConverter *conv = getConverter(state);

    int requiredLength = UCNV_GET_MAX_BYTES_FOR_STRING(length, ucnv_getMaxCharSize(conv));
    QByteArray string(requiredLength, Qt::Uninitialized);

    const UChar *uc = (const UChar *)unicode;
    const UChar *end = uc + length;
    int convertedChars = 0;
    while (1) {
        char *ch = (char *)string.data();
        char *chEnd = ch + string.length();
        ch += convertedChars;

        // Remember where this pass started so a stalled conversion can be detected.
        const UChar *ucBefore = uc;
        const char *chBefore = ch;

        UErrorCode error = U_ZERO_ERROR;
        ucnv_fromUnicode(conv,
                         &ch, chEnd,
                         &uc, end,
                         0, false, &error);
        if (!U_SUCCESS(error))
            qDebug() << "convertFromUnicode failed:" << u_errorName(error);
        convertedChars = ch - string.data();
        if (uc >= end)
            break;

        if (error == U_BUFFER_OVERFLOW_ERROR) {
            // The only status that more room can cure.
            string.resize(string.length()*2);
        } else if (uc == ucBefore && ch == chBefore) {
            // Nothing was consumed or produced and the buffer is not the
            // problem: retrying would spin (and previously kept doubling
            // the buffer) forever, so give up with what we have.
            break;
        }
    }
    string.resize(convertedChars);

    if (!state)
        ucnv_close(conv);
    return string;
}
/**
 * Writes a UnicodeString to a stream after converting it to the default
 * codepage.
 *
 * The text is converted in chunks through a 200-byte stack buffer; each
 * chunk is NUL-terminated and inserted into the stream as a C string.
 * Conversion repeats for as long as the converter reports
 * U_BUFFER_OVERFLOW_ERROR. Nothing is written for an empty string or when
 * the default converter cannot be obtained.
 *
 * @param stream destination stream
 * @param str    text to write
 * @return stream
 */
U_IO_API STD_OSTREAM & U_EXPORT2
operator<<(STD_OSTREAM& stream, const UnicodeString& str)
{
    if (str.length() <= 0) {
        return stream;
    }

    UErrorCode status = U_ZERO_ERROR;
    UConverter *defaultCnv = u_getDefaultConverter(&status);
    if (U_FAILURE(status)) {
        return stream;
    }

    char chunk[200];
    char *const chunkLimit = chunk + (sizeof(chunk) - 1);  // keep one byte for the terminator
    const UChar *src = str.getBuffer();
    const UChar *const srcLimit = src + str.length();

    for (;;) {
        char *out = chunk;
        status = U_ZERO_ERROR;
        ucnv_fromUnicode(defaultCnv, &out, chunkLimit, &src, srcLimit, 0, FALSE, &status);
        *out = 0;

        // emit whatever this pass produced
        if (out > chunk) {
            stream << chunk;
        }
        if (status != U_BUFFER_OVERFLOW_ERROR) {
            break;
        }
    }
    u_releaseDefaultConverter(defaultCnv);

    /* stream.flush();*/
    return stream;
}
/**
 * Converts a byte buffer from one charset to another through a UTF-16
 * intermediate buffer.
 *
 * When `to` is NULL the input is duplicated unchanged. Otherwise `src` is
 * decoded with `from` and re-encoded with `to`.
 *
 * @param to      target converter, or NULL to copy the input verbatim
 * @param from    source converter (required when `to` is non-NULL)
 * @param dst     receives a malloc'ed, NUL-terminated result that the caller
 *                must free; set to NULL when allocation or encoding fails
 *                after validation
 * @param dstsiz  receives the result length in bytes, excluding the terminator
 * @param src     input bytes
 * @param srcsiz  number of input bytes; 0 is rejected
 * @return 0 on success, -1 on invalid arguments, allocation failure or
 *         conversion failure
 */
inline int
mod_websocket_conv(UConverter *to, UConverter *from,
                   char **dst, size_t *dstsiz,
                   const char *src, size_t srcsiz)
{
    UErrorCode err = U_ZERO_ERROR;
    size_t unisiz;
    UChar *unibuf, *punibuf, *ppunibuf;
    char *pdst;

    if (srcsiz == 0) {
        return -1;
    }
    /* Validate the pointers before ANY path dereferences them; the
       pass-through branch below writes through dst and dstsiz too. */
    if (!dst || !src || !dstsiz) {
        return -1;
    }
    if (!to) {
        *dst = (char *)malloc(srcsiz + 1);
        if (*dst == NULL) {
            return -1;
        }
        memcpy(*dst, src, srcsiz);
        (*dst)[srcsiz] = '\0';
        *dstsiz = srcsiz;
        return 0;
    }
    if (!from) {
        return -1;
    }

    unisiz = srcsiz / ucnv_getMinCharSize(from);
    /* Room for unisiz UChars plus a full UChar terminator (the terminator
       is sizeof(UChar) bytes wide, not one byte). */
    unibuf = (UChar *)malloc(sizeof(UChar) * (unisiz + 1));
    if (!unibuf) {
        return -1;
    }
    punibuf = unibuf;
    ucnv_toUnicode(from, &punibuf, punibuf + unisiz,
                   &src, src + srcsiz, 0, 0, &err);
    if (U_FAILURE(err)) {
        free(unibuf);
        return -1;
    }
    *punibuf = 0;

    /* Worst case: every UChar expands to the target's maximum byte count. */
    *dstsiz = (punibuf - unibuf) * ucnv_getMaxCharSize(to);
    *dst = (char *)malloc(*dstsiz + 1);
    if (!*dst) {
        free(unibuf);
        return -1;
    }
    pdst = *dst;
    ppunibuf = unibuf;
    ucnv_fromUnicode(to, &pdst, pdst + *dstsiz,
                     (const UChar **)&ppunibuf, punibuf, 0, 0, &err);
    free(unibuf);
    if (U_FAILURE(err)) {
        free(*dst);
        *dst = NULL;    /* do not hand a dangling pointer back to the caller */
        return -1;
    }
    *pdst = '\0';
    *dstsiz = pdst - *dst;
    return 0;
}
// JNI entry point: encodes UTF-16 input from `source` into bytes in `target`
// with the ICU converter identified by `address`.
//
// `data` is an int array used for in/out bookkeeping:
//   data[0]  in: start offset into source;  out: number of chars consumed
//   data[1]  in: start offset into target;  out: end position in target
//   data[2]  out: number of invalid UChars, set only for
//            illegal/invalid/truncated character errors
//
// Returns the ICU error code of the conversion. An exception is raised via
// maybeThrowIcuException() when an argument cannot be accessed or when
// shouldCodecThrow() says the error must not be left to managed code.
static jint NativeConverter_encode(JNIEnv* env, jclass, jlong address,
        jcharArray source, jint sourceEnd, jbyteArray target, jint targetEnd,
        jintArray data, jboolean flush) {

    UConverter* cnv = toUConverter(address);
    if (cnv == NULL) {
        maybeThrowIcuException(env, "toUConverter", U_ILLEGAL_ARGUMENT_ERROR);
        return U_ILLEGAL_ARGUMENT_ERROR;
    }
    ScopedCharArrayRO uSource(env, source);
    if (uSource.get() == NULL) {
        maybeThrowIcuException(env, "uSource", U_ILLEGAL_ARGUMENT_ERROR);
        return U_ILLEGAL_ARGUMENT_ERROR;
    }
    ScopedByteArrayRW uTarget(env, target);
    if (uTarget.get() == NULL) {
        maybeThrowIcuException(env, "uTarget", U_ILLEGAL_ARGUMENT_ERROR);
        return U_ILLEGAL_ARGUMENT_ERROR;
    }
    ScopedIntArrayRW myData(env, data);
    if (myData.get() == NULL) {
        maybeThrowIcuException(env, "myData", U_ILLEGAL_ARGUMENT_ERROR);
        return U_ILLEGAL_ARGUMENT_ERROR;
    }

    // Do the conversion.
    jint* consumedSlot = &myData[0];
    jint* producedSlot = &myData[1];
    const jchar* in = uSource.get() + *consumedSlot;
    const UChar* inLimit = uSource.get() + sourceEnd;
    char* out = reinterpret_cast<char*>(uTarget.get() + *producedSlot);
    const char* outLimit = reinterpret_cast<const char*>(uTarget.get() + targetEnd);

    UErrorCode status = U_ZERO_ERROR;
    ucnv_fromUnicode(cnv, &out, outLimit, &in, inLimit, NULL, (UBool) flush, &status);

    // Source slot becomes a delta from its starting offset; target slot is
    // the position measured from the start of the target array.
    *consumedSlot = (in - uSource.get()) - *consumedSlot;
    *producedSlot = (reinterpret_cast<jbyte*>(out) - uTarget.get());

    // If there was an error, count the problematic characters.
    switch (status) {
    case U_ILLEGAL_CHAR_FOUND:
    case U_INVALID_CHAR_FOUND:
    case U_TRUNCATED_CHAR_FOUND: {
        int8_t invalidCount = 32;
        UChar invalidUnits[32];
        UErrorCode lookupStatus = U_ZERO_ERROR;
        ucnv_getInvalidUChars(cnv, invalidUnits, &invalidCount, &lookupStatus);
        if (U_SUCCESS(lookupStatus)) {
            myData[2] = invalidCount;
        }
        break;
    }
    default:
        break;
    }

    // Managed code handles some cases; throw all other errors.
    if (shouldCodecThrow(flush, status)) {
        maybeThrowIcuException(env, "ucnv_fromUnicode", status);
    }
    return status;
}
// Encodes `length` UTF-16 code units into this codec's byte encoding.
//
// Returns "" for empty input and a null CString when the ICU converter
// cannot be created or the unencodable-character callback cannot be
// installed. `handling` selects what is emitted for characters the target
// encoding cannot represent.
CString TextCodecICU::encode(const UChar* characters, size_t length, UnencodableHandling handling)
{
    if (!length)
        return "";

    if (!m_converterICU)
        createICUConverter();
    if (!m_converterICU)
        return CString();

    // FIXME: We should see if there is "force ASCII range" mode in ICU;
    // until then, we change the backslash into a yen sign.
    // Encoding will change the yen sign back into a backslash.
    String copy(characters, length);
    copy.replace('\\', m_encoding.backslashAsCurrencySymbol());

    const UChar* source = copy.characters();
    const UChar* sourceLimit = source + copy.length();

    UErrorCode err = U_ZERO_ERROR;

    // Choose the callback (and its context) for unencodable characters,
    // then install it with a single call.
    UConverterFromUCallback callback = 0;
    const void* callbackContext = 0;
    switch (handling) {
    case QuestionMarksForUnencodables:
        ucnv_setSubstChars(m_converterICU, "?", 1, &err);
        callback = m_needsGBKFallbacks ? gbkCallbackSubstitute : UCNV_FROM_U_CALLBACK_SUBSTITUTE;
        break;
    case EntitiesForUnencodables:
        callback = m_needsGBKFallbacks ? gbkCallbackEscape : UCNV_FROM_U_CALLBACK_ESCAPE;
        callbackContext = UCNV_ESCAPE_XML_DEC;
        break;
    case URLEncodedEntitiesForUnencodables:
        callback = m_needsGBKFallbacks ? gbkUrlEscapedEntityCallack : urlEscapedEntityCallback;
        break;
    }
    if (callback)
        ucnv_setFromUCallBack(m_converterICU, callback, callbackContext, 0, 0, &err);

    ASSERT(U_SUCCESS(err));
    if (U_FAILURE(err))
        return CString();

    // Convert through a fixed-size chunk, appending each chunk to the
    // result for as long as ICU reports that the chunk overflowed.
    Vector<char> encoded;
    size_t encodedLength = 0;
    char chunk[ConversionBufferSize];
    for (;;) {
        char* chunkEnd = chunk;
        err = U_ZERO_ERROR;
        ucnv_fromUnicode(m_converterICU, &chunkEnd, chunk + ConversionBufferSize, &source, sourceLimit, 0, true, &err);
        size_t produced = chunkEnd - chunk;
        encoded.grow(encodedLength + produced);
        memcpy(encoded.data() + encodedLength, chunk, produced);
        encodedLength += produced;
        if (err != U_BUFFER_OVERFLOW_ERROR)
            break;
    }

    return CString(encoded.data(), encodedLength);
}
/*
 * Writes sourceLen UChars of s to f, converted with a converter opened via
 * ucnv_open(0, status). Does nothing when s is NULL. On return *status
 * holds the result of opening the converter or of the last conversion pass.
 */
static void
uprint(const UChar *s, int32_t sourceLen, FILE *f, UErrorCode *status)
{
    char chunk[BUF_SIZE];
    const UChar *src;
    const UChar *srcLimit;
    char *out;
    UConverter *cnv;

    if (s == 0) {
        return;
    }

    src = s;
    srcLimit = s + sourceLen;

    /* open a default converter; skip the conversion entirely if that fails */
    cnv = ucnv_open(0, status);
    if (U_SUCCESS(*status)) {
        do {
            /* each pass starts with a clean status and an empty chunk */
            *status = U_ZERO_ERROR;
            out = chunk;

            ucnv_fromUnicode(cnv, &out, chunk + BUF_SIZE,
                             &src, srcLimit, NULL, TRUE, status);

            /* write the bytes produced by this pass */
            fwrite(chunk, sizeof(char), out - chunk, f);
        } while (*status == U_BUFFER_OVERFLOW_ERROR);
    }

    /* close the converter */
    ucnv_close(cnv);
}
CString TextCodecICU::encodeInternal(const TextCodecInput& input, UnencodableHandling handling) { const UChar* source = input.begin(); const UChar* end = input.end(); UErrorCode err = U_ZERO_ERROR; switch (handling) { case QuestionMarksForUnencodables: ucnv_setSubstChars(m_converterICU, "?", 1, &err); #if !defined(USING_SYSTEM_ICU) ucnv_setFromUCallBack(m_converterICU, UCNV_FROM_U_CALLBACK_SUBSTITUTE, 0, 0, 0, &err); #else ucnv_setFromUCallBack(m_converterICU, m_needsGBKFallbacks ? gbkCallbackSubstitute : UCNV_FROM_U_CALLBACK_SUBSTITUTE, 0, 0, 0, &err); #endif break; case EntitiesForUnencodables: #if !defined(USING_SYSTEM_ICU) ucnv_setFromUCallBack(m_converterICU, UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, 0, 0, &err); #else ucnv_setFromUCallBack(m_converterICU, m_needsGBKFallbacks ? gbkCallbackEscape : UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, 0, 0, &err); #endif break; case URLEncodedEntitiesForUnencodables: #if !defined(USING_SYSTEM_ICU) ucnv_setFromUCallBack(m_converterICU, urlEscapedEntityCallback, 0, 0, 0, &err); #else ucnv_setFromUCallBack(m_converterICU, m_needsGBKFallbacks ? gbkUrlEscapedEntityCallack : urlEscapedEntityCallback, 0, 0, 0, &err); #endif break; } ASSERT(U_SUCCESS(err)); if (U_FAILURE(err)) return CString(); Vector<char> result; size_t size = 0; do { char buffer[ConversionBufferSize]; char* target = buffer; char* targetLimit = target + ConversionBufferSize; err = U_ZERO_ERROR; ucnv_fromUnicode(m_converterICU, &target, targetLimit, &source, end, 0, true, &err); size_t count = target - buffer; result.grow(size + count); memcpy(result.data() + size, buffer, count); size += count; } while (err == U_BUFFER_OVERFLOW_ERROR); return CString(result.data(), size); }
/*
 * Converts a UChar string with the given converter and prints it to `out`.
 *
 * The text is converted through a 256-byte stack buffer; each chunk is
 * NUL-terminated and printed with fprintf("%s").
 *
 * out        destination stream
 * converter  converter used for the from-Unicode conversion
 * str        text to print
 * len        number of UChars in str, or negative to use u_strlen(str)
 */
static void printString(FILE *out, UConverter *converter, const UChar *str, int32_t len)
{
    char buf[256];
    const UChar *strEnd;
    UErrorCode err;
    int progressed;

    if (len < 0) {
        len = u_strlen(str);
    }
    strEnd = str + len;

    do {
        const UChar *before = str;
        char *bufp = buf, *bufend = buf + sizeof(buf) - 1;   /* keep room for the terminator */

        err = U_ZERO_ERROR;
        ucnv_fromUnicode(converter, &bufp, bufend, &str, strEnd, 0, 0, &err);
        *bufp = 0;

        fprintf(out, "%s", buf);

        progressed = (str != before) || (bufp != buf);

        /*
         * Keep going while the chunk buffer overflowed (the converter may
         * still hold output even if the source is already consumed) or
         * while source text remains. Stop as soon as a pass neither
         * consumed nor produced anything: retrying would loop forever.
         */
    } while (progressed && (err == U_BUFFER_OVERFLOW_ERROR || str < strEnd));
}
bool filter(const char_type*& src_begin, const char_type* src_end, char_type*& dest_begin, char_type* dest_end, bool flush) { if (! ucnv_from && ! ucnv_to) { // no converter... simply copy! const size_t copy_size = std::min(src_end - src_begin, dest_end - dest_begin); std::copy(src_begin, src_begin + copy_size, dest_begin); src_begin += copy_size; dest_begin += copy_size; return false; } UChar* pivot_end = pivot_start + boost::iostreams::default_device_buffer_size; UErrorCode status = U_ZERO_ERROR; if (pivot_target != pivot_end) { const char_type* src_begin_prev = src_begin; UChar* pivot_target_prev = pivot_target; status = U_ZERO_ERROR; ucnv_toUnicode(ucnv_from, &pivot_target, pivot_end, &src_begin, src_end, 0, flush, &status); offset_from += src_begin - src_begin_prev; offset_pivot_target += pivot_target - pivot_target_prev; if (status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(status)) { UErrorCode status_getname = U_ZERO_ERROR; const char* encoding = ucnv_getName(ucnv_from, &status_getname); std::ostringstream offset_stream; offset_stream << offset_from; std::ostringstream offset_stream_unicode; offset_stream_unicode << offset_pivot_target; message_from = (std::string("ucnv_toUnicode(): ") + u_errorName(status) + " from " + encoding + " offset: " + offset_stream.str() + " unicode offset: " + offset_stream_unicode.str()); throw BOOST_IOSTREAMS_FAILURE(message_from); } } char_type* dest_begin_prev = dest_begin; const UChar* pivot_source_prev = pivot_source; status = U_ZERO_ERROR; ucnv_fromUnicode(ucnv_to, &dest_begin, dest_end, &pivot_source, pivot_target, 0, flush, &status); offset_to += dest_begin - dest_begin_prev; offset_pivot_source += pivot_source - pivot_source_prev; if (status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(status)) { UErrorCode status_getname = U_ZERO_ERROR; const char* encoding = ucnv_getName(ucnv_to, &status_getname); std::ostringstream offset_stream; offset_stream << offset_to; std::ostringstream offset_stream_unicode; offset_stream_unicode << 
offset_pivot_source; message_to = (std::string("ucnv_fromUnicode(): ") + u_errorName(status) + " to " + encoding + " offset: " + offset_stream.str() + " unicode offset: " + offset_stream_unicode.str()); throw BOOST_IOSTREAMS_FAILURE(message_to); } if (pivot_source == pivot_target) { pivot_source = pivot_start; pivot_target = pivot_start; } return status == U_BUFFER_OVERFLOW_ERROR; }
unsigned int ICUTranscoder::transcodeTo( const XMLCh* const srcData , const unsigned int srcCount , XMLByte* const toFill , const unsigned int maxBytes , unsigned int& charsEaten , const UnRepOpts options) { // // Get a pointer to the buffer to transcode. If UChar and XMLCh are // the same size here, then use the original. Else, create a temp // one and put a janitor on it. // const UChar* srcPtr; UChar* tmpBufPtr = 0; if (sizeof(XMLCh) == sizeof(UChar)) { srcPtr = (const UChar*)srcData; } else { tmpBufPtr = convertToUChar(srcData, srcCount, getMemoryManager()); srcPtr = tmpBufPtr; } ArrayJanitor<UChar> janTmpBuf(tmpBufPtr, getMemoryManager()); // // Set the appropriate callback so that it will either fail or use // the rep char. Remember the old one so we can put it back. // UErrorCode err = U_ZERO_ERROR; UConverterFromUCallback oldCB = NULL; #if (U_ICU_VERSION_MAJOR_NUM < 2) void* orgContent; #else const void* orgContent; #endif ucnv_setFromUCallBack ( fConverter , (options == UnRep_Throw) ? UCNV_FROM_U_CALLBACK_STOP : UCNV_FROM_U_CALLBACK_SUBSTITUTE , NULL , &oldCB , &orgContent , &err ); // // Ok, lets transcode as many chars as we we can in one shot. The // ICU API gives enough info not to have to do this one char by char. 
// XMLByte* startTarget = toFill; const UChar* startSrc = srcPtr; err = U_ZERO_ERROR; ucnv_fromUnicode ( fConverter , (char**)&startTarget , (char*)(startTarget + maxBytes) , &startSrc , srcPtr + srcCount , 0 , false , &err ); // Rememember the status before we possibly overite the error code const bool res = (err == U_ZERO_ERROR); // Put the old handler back err = U_ZERO_ERROR; UConverterFromUCallback orgAction = NULL; ucnv_setFromUCallBack(fConverter, oldCB, NULL, &orgAction, &orgContent, &err); if (!res) { XMLCh tmpBuf[17]; XMLString::binToText((unsigned int)*startSrc, tmpBuf, 16, 16, getMemoryManager()); ThrowXMLwithMemMgr2 ( TranscodingException , XMLExcepts::Trans_Unrepresentable , tmpBuf , getEncodingName() , getMemoryManager() ); } // Fill in the chars we ate from the input charsEaten = startSrc - srcPtr; // Return the chars we stored return startTarget - toFill; }
// Convert a file from one encoding to another static UBool convertFile(const char *pname, const char *fromcpage, UConverterToUCallback toucallback, const void *touctxt, const char *tocpage, UConverterFromUCallback fromucallback, const void *fromuctxt, int fallback, size_t bufsz, const char *translit, const char *infilestr, FILE * outfile, int verbose) { FILE *infile; UBool ret = TRUE; UConverter *convfrom = 0; UConverter *convto = 0; UErrorCode err = U_ZERO_ERROR; UBool flush; const char *cbufp; char *bufp; char *buf = 0; uint32_t infoffset = 0, outfoffset = 0; /* Where we are in the file, for error reporting. */ const UChar *unibufbp; UChar *unibufp; UChar *unibuf = 0; int32_t *fromoffsets = 0, *tooffsets = 0; size_t rd, wr, tobufsz; #if !UCONFIG_NO_TRANSLITERATION Transliterator *t = 0; // Transliterator acting on Unicode data. #endif UnicodeString u; // String to do the transliteration. // Open the correct input file or connect to stdin for reading input if (infilestr != 0 && strcmp(infilestr, "-")) { infile = fopen(infilestr, "rb"); if (infile == 0) { UnicodeString str1(infilestr, ""); str1.append((UChar32) 0); UnicodeString str2(strerror(errno), ""); str2.append((UChar32) 0); initMsg(pname); u_wmsg(stderr, "cantOpenInputF", str1.getBuffer(), str2.getBuffer()); return FALSE; } } else { infilestr = "-"; infile = stdin; #ifdef WIN32 if (setmode(fileno(stdin), O_BINARY) == -1) { initMsg(pname); u_wmsg(stderr, "cantSetInBinMode"); return FALSE; } #endif } if (verbose) { fprintf(stderr, "%s:\n", infilestr); } #if !UCONFIG_NO_TRANSLITERATION // Create transliterator as needed. if (translit != NULL && *translit) { UParseError parse; UnicodeString str(translit), pestr; /* Create from rules or by ID as needed. 
*/ parse.line = -1; if (uprv_strchr(translit, ':') || uprv_strchr(translit, '>') || uprv_strchr(translit, '<') || uprv_strchr(translit, '>')) { t = Transliterator::createFromRules("Uconv", str, UTRANS_FORWARD, parse, err); } else { t = Transliterator::createInstance(translit, UTRANS_FORWARD, err); } if (U_FAILURE(err)) { str.append((UChar32) 0); initMsg(pname); if (parse.line >= 0) { UChar linebuf[20], offsetbuf[20]; uprv_itou(linebuf, 20, parse.line, 10, 0); uprv_itou(offsetbuf, 20, parse.offset, 10, 0); u_wmsg(stderr, "cantCreateTranslitParseErr", str.getBuffer(), u_wmsg_errorName(err), linebuf, offsetbuf); } else { u_wmsg(stderr, "cantCreateTranslit", str.getBuffer(), u_wmsg_errorName(err)); } if (t) { delete t; t = 0; } goto error_exit; } } #endif // Create codepage converter. If the codepage or its aliases weren't // available, it returns NULL and a failure code. We also set the // callbacks, and return errors in the same way. convfrom = ucnv_open(fromcpage, &err); if (U_FAILURE(err)) { UnicodeString str(fromcpage, (int32_t)(uprv_strlen(fromcpage) + 1)); initMsg(pname); u_wmsg(stderr, "cantOpenFromCodeset", str.getBuffer(), u_wmsg_errorName(err)); goto error_exit; } ucnv_setToUCallBack(convfrom, toucallback, touctxt, 0, 0, &err); if (U_FAILURE(err)) { initMsg(pname); u_wmsg(stderr, "cantSetCallback", u_wmsg_errorName(err)); goto error_exit; } convto = ucnv_open(tocpage, &err); if (U_FAILURE(err)) { UnicodeString str(tocpage, (int32_t)(uprv_strlen(tocpage) + 1)); initMsg(pname); u_wmsg(stderr, "cantOpenToCodeset", str.getBuffer(), u_wmsg_errorName(err)); goto error_exit; } ucnv_setFromUCallBack(convto, fromucallback, fromuctxt, 0, 0, &err); if (U_FAILURE(err)) { initMsg(pname); u_wmsg(stderr, "cantSetCallback", u_wmsg_errorName(err)); goto error_exit; } ucnv_setFallback(convto, fallback); // To ensure that the buffer always is of enough size, we // must take the worst case scenario, that is the character in // the codepage that uses the most bytes and multiply 
it against // the buffer size. // use bufsz+1 to allow for additional BOM/signature character (U+FEFF) tobufsz = (bufsz+1) * ucnv_getMaxCharSize(convto); buf = new char[tobufsz]; unibuf = new UChar[bufsz]; fromoffsets = new int32_t[bufsz]; tooffsets = new int32_t[tobufsz]; // OK, we can convert now. do { char willexit = 0; rd = fread(buf, 1, bufsz, infile); if (ferror(infile) != 0) { UnicodeString str(strerror(errno)); str.append((UChar32) 0); initMsg(pname); u_wmsg(stderr, "cantRead", str.getBuffer()); goto error_exit; } // Convert the read buffer into the new coding // After the call 'unibufp' will be placed on the last // character that was converted in the 'unibuf'. // Also the 'cbufp' is positioned on the last converted // character. // At the last conversion in the file, flush should be set to // true so that we get all characters converted // // The converter must be flushed at the end of conversion so // that characters on hold also will be written. unibufp = unibuf; cbufp = buf; flush = rd != bufsz; ucnv_toUnicode(convfrom, &unibufp, unibufp + bufsz, &cbufp, cbufp + rd, fromoffsets, flush, &err); infoffset += (uint32_t)(cbufp - buf); if (U_FAILURE(err)) { char pos[32]; sprintf(pos, "%u", infoffset - 1); UnicodeString str(pos, (int32_t)(uprv_strlen(pos) + 1)); initMsg(pname); u_wmsg(stderr, "problemCvtToU", str.getBuffer(), u_wmsg_errorName(err)); willexit = 1; err = U_ZERO_ERROR; /* reset the error for the rest of the conversion. */ } // At the last conversion, the converted characters should be // equal to number of chars read. if (flush && !willexit && cbufp != (buf + rd)) { char pos[32]; sprintf(pos, "%u", infoffset); UnicodeString str(pos, (int32_t)(uprv_strlen(pos) + 1)); initMsg(pname); u_wmsg(stderr, "premEndInput", str.getBuffer()); willexit = 1; } // Prepare to transliterate and convert. Transliterate if needed. #if !UCONFIG_NO_TRANSLITERATION if (t) { u.setTo(unibuf, (int32_t)(unibufp - unibuf)); // Copy into string. 
t->transliterate(u); } else #endif { u.setTo(unibuf, (int32_t)(unibufp - unibuf), (int32_t)(bufsz)); // Share the buffer. } int32_t ulen = u.length(); // Convert the Unicode buffer into the destination codepage // Again 'bufp' will be placed on the last converted character // And 'unibufbp' will be placed on the last converted unicode character // At the last conversion flush should be set to true to ensure that // all characters left get converted const UChar *unibufu = unibufbp = u.getBuffer(); do { int32_t len = ulen > (int32_t)bufsz ? (int32_t)bufsz : ulen; bufp = buf; unibufp = (UChar *) (unibufbp + len); ucnv_fromUnicode(convto, &bufp, bufp + tobufsz, &unibufbp, unibufp, tooffsets, flush, &err); if (U_FAILURE(err)) { const char *errtag; char pos[32]; uint32_t erroffset = dataOffset((int32_t)(bufp - buf - 1), fromoffsets, (int32_t)(bufsz), tooffsets, (int32_t)(tobufsz)); int32_t ferroffset = (int32_t)(infoffset - (unibufp - unibufu) + erroffset); if ((int32_t) ferroffset < 0) { ferroffset = (int32_t)(outfoffset + (bufp - buf)); errtag = "problemCvtFromUOut"; } else { errtag = "problemCvtFromU"; } sprintf(pos, "%u", ferroffset); UnicodeString str(pos, (int32_t)(uprv_strlen(pos) + 1)); initMsg(pname); u_wmsg(stderr, errtag, str.getBuffer(), u_wmsg_errorName(err)); willexit = 1; } // At the last conversion, the converted characters should be equal to number // of consumed characters. 
if (flush && !willexit && unibufbp != (unibufu + (size_t) (unibufp - unibufu))) { char pos[32]; sprintf(pos, "%u", infoffset); UnicodeString str(pos, (int32_t)(uprv_strlen(pos) + 1)); initMsg(pname); u_wmsg(stderr, "premEnd", str.getBuffer()); willexit = 1; } // Finally, write the converted buffer to the output file rd = (size_t) (bufp - buf); outfoffset += (int32_t)(wr = fwrite(buf, 1, rd, outfile)); if (wr != rd) { UnicodeString str(strerror(errno), ""); initMsg(pname); u_wmsg(stderr, "cantWrite", str.getBuffer()); willexit = 1; } if (willexit) { goto error_exit; } } while ((ulen -= (int32_t)(bufsz)) > 0); } while (!flush); // Stop when we have flushed the // converters (this means that it's // the end of output) goto normal_exit; error_exit: ret = FALSE; normal_exit: // Cleanup. if (convfrom) ucnv_close(convfrom); if (convto) ucnv_close(convto); #if !UCONFIG_NO_TRANSLITERATION if (t) delete t; #endif if (buf) delete[] buf; if (unibuf) delete[] unibuf; if (fromoffsets) delete[] fromoffsets; if (tooffsets) delete[] tooffsets; if (infile != stdin) { fclose(infile); } return ret; }
/************************** * Will convert a sequence of bytes from one codepage to another. * @param toConverterName: The name of the converter that will be used to encode the output buffer * @param fromConverterName: The name of the converter that will be used to decode the input buffer * @param target: Pointer to the output buffer* written * @param targetLength: on input contains the capacity of target, on output the number of bytes copied to target * @param source: Pointer to the input buffer * @param sourceLength: on input contains the capacity of source, on output the number of bytes processed in "source" * @param internal: used internally to store store state data across calls * @param err: fills in an error status */ void T_UConverter_fromCodepageToCodepage (UConverter * outConverter, UConverter * inConverter, char **target, const char *targetLimit, const char **source, const char *sourceLimit, int32_t* offsets, int flush, UErrorCode * err) { UChar out_chunk[CHUNK_SIZE]; const UChar *out_chunk_limit = out_chunk + CHUNK_SIZE; UChar *out_chunk_alias; UChar const *out_chunk_alias2; if (U_FAILURE (*err)) return; /*loops until the input buffer is completely consumed *or if an error has be encountered *first we convert from inConverter codepage to Unicode *then from Unicode to outConverter codepage */ while ((*source != sourceLimit) && U_SUCCESS (*err)) { out_chunk_alias = out_chunk; ucnv_toUnicode (inConverter, &out_chunk_alias, out_chunk_limit, source, sourceLimit, NULL, flush, err); /*BUFFER_OVERFLOW_ERROR means that the output "CHUNK" is full *we will require at least another loop (it's a recoverable error) */ if (U_SUCCESS (*err) || (*err == U_BUFFER_OVERFLOW_ERROR)) { *err = U_ZERO_ERROR; out_chunk_alias2 = out_chunk; while ((out_chunk_alias2 != out_chunk_alias) && U_SUCCESS (*err)) { ucnv_fromUnicode (outConverter, target, targetLimit, &out_chunk_alias2, out_chunk_alias, NULL, TRUE, err); } } else break; } return; }
int32_t ucnv_fromUChars (const UConverter * converter, char *target, int32_t targetSize, const UChar * source, UErrorCode * err) { const UChar *mySource = source; const UChar *mySource_limit; int32_t mySourceLength = 0; UConverter myConverter; char *myTarget = target; int32_t targetCapacity = 0; if (U_FAILURE (*err)) return 0; if ((converter == NULL) || (targetSize < 0)) { *err = U_ILLEGAL_ARGUMENT_ERROR; return 0; } /*makes a local copy of the UConverter */ myConverter = *converter; /*if the source is empty we return immediately */ mySourceLength = u_strlen (source); if (mySourceLength == 0) { /*for consistency we still need to *store 0 in the targetCapacity *if the user requires it */ return 0; } mySource_limit = mySource + mySourceLength; if (targetSize > 0) { ucnv_fromUnicode (&myConverter, &myTarget, target + targetSize, &mySource, mySource_limit, NULL, TRUE, err); targetCapacity = myTarget - target; } /*Updates targetCapacity to contain the number of bytes written to target */ if (targetSize == 0) { *err = U_BUFFER_OVERFLOW_ERROR; } /* If the output buffer is exhausted, we need to stop writing * to it but continue the conversion in order to store in targetSize * the number of bytes that was required*/ if (*err == U_BUFFER_OVERFLOW_ERROR) { char target2[CHUNK_SIZE]; char *target2_alias = target2; const char *target2_limit = target2 + CHUNK_SIZE; /*We use a stack allocated buffer around which we loop *(in case the output is greater than CHUNK_SIZE) */ while (*err == U_BUFFER_OVERFLOW_ERROR) { *err = U_ZERO_ERROR; target2_alias = target2; ucnv_fromUnicode (&myConverter, &target2_alias, target2_limit, &mySource, mySource_limit, NULL, TRUE, err); /*updates the output parameter to contain the number of char required */ targetCapacity += (target2_alias - target2) + 1; } /*We will set the erro code to BUFFER_OVERFLOW_ERROR only if *nothing graver happened in the previous loop*/ (targetCapacity)--; if (U_SUCCESS (*err)) *err = U_BUFFER_OVERFLOW_ERROR; } return 
targetCapacity; }
/*
 * Callback helper: converts the UChars in [*source, sourceLimit) with the
 * converter in args and writes the result to args->target.
 *
 * When args->offsets is set, one entry equal to offsetIndex is recorded for
 * every byte written to the target. If the target overflows, the rest of
 * the output is converted into the converter's charErrorBuffer; running
 * out of room there sets *err to U_INTERNAL_PROGRAM_ERROR.
 *
 * This re-enters ucnv_fromUnicode() on the same converter, so the caller
 * must have changed the callback or the data to avoid endless recursion.
 * Nothing is done if *err already holds a failure on entry.
 */
U_CAPI void U_EXPORT2
ucnv_cbFromUWriteUChars(UConverterFromUnicodeArgs *args,
                        const UChar** source,
                        const UChar* sourceLimit,
                        int32_t offsetIndex,
                        UErrorCode * err)
{
    char *mark;
    char *spill;
    const char *spillLimit;
    UErrorCode spillStatus;
    int8_t pendingLen;

    if (U_FAILURE(*err)) {
        return;
    }

    mark = args->target;

    ucnv_fromUnicode(args->converter,
                     &args->target,
                     args->targetLimit,
                     source,
                     sourceLimit,
                     NULL,   /* no offsets */
                     FALSE,  /* no flush */
                     err);

    if (args->offsets) {
        /* one offset entry per byte that was just written */
        for (; mark != args->target; ++mark) {
            *(args->offsets)++ = offsetIndex;
        }
    }

    /*
       Note, if you did something like used a Stop subcallback, things would
       get interesting. In fact, here's where we want to return the
       partially consumed in-source!
    */
    if (*err != U_BUFFER_OVERFLOW_ERROR) {
        return;
    }

    /* Overflowed the target. The remainder goes into the fixed-size
       charErrorBuffer, starting at its first free slot. */
    spillStatus = U_ZERO_ERROR;
    pendingLen = args->converter->charErrorBufferLength;
    spill = (char *)(args->converter->charErrorBuffer + pendingLen);
    spillLimit = (char *)(args->converter->charErrorBuffer +
                          sizeof(args->converter->charErrorBuffer));

    if (spill >= spillLimit) {
        *err = U_INTERNAL_PROGRAM_ERROR;
        return;
    }

    /* Report the error buffer as empty for the duration of the call so
       that its existing contents are not 'flushed' out onto itself. If the
       converter needs the error buffer this time, there is no space left. */
    args->converter->charErrorBufferLength = 0;

    ucnv_fromUnicode(args->converter,
                     &spill,
                     spillLimit,
                     source,
                     sourceLimit,
                     NULL,
                     FALSE,
                     &spillStatus);

    /* The length can simply be recomputed from where the write ended. */
    args->converter->charErrorBufferLength =
        (int8_t)(spill - (char *)args->converter->charErrorBuffer);

    if ((spill >= spillLimit) || (spillStatus == U_BUFFER_OVERFLOW_ERROR)) {
        /* The callback wrote more than the error buffer can hold. */
        *err = U_INTERNAL_PROGRAM_ERROR;
        return;
    }

    /* Other failures in spillStatus (invalid/truncated/illegal chars) are
       not passed up: U_BUFFER_OVERFLOW_ERROR already has to be reported
       and overrides them. */
}
/*
 * scanf handler that reads UChars from 'input' and stores them, converted
 * with the default converter, into the char buffer supplied in args[0].
 *
 * When info->fIsString is set, leading whitespace is skipped first, reading
 * stops at the pad character or any whitespace, and the output is
 * NUL-terminated. Reading also stops at info->fWidth characters (unless the
 * width is -1) or at end of input. With info->fSkipArg set, characters are
 * consumed but nothing is stored.
 *
 * Returns the number of characters consumed (including skipped whitespace),
 * or -1 if the default converter is unavailable or a conversion fails.
 * *argConverted is set to 1 if an argument was stored, 0 otherwise.
 * fmt and fmtConsumed are not used by this handler.
 */
static int32_t
u_scanf_string_handler(UFILE *input,
                       u_scanf_spec_info *info,
                       ufmt_args *args,
                       const UChar *fmt,
                       int32_t *fmtConsumed,
                       int32_t *argConverted)
{
    char        *out       = (char*)(args[0].ptrValue);
    char        *outLimit;
    const UChar *unit;
    UConverter  *cnv;
    UErrorCode   status    = U_ZERO_ERROR;
    int32_t      numRead   = 0;
    int32_t      wsSkipped = 0;
    UChar        ch;
    UBool        gotChar   = FALSE;

    /* skip all leading whitespace for string conversions */
    if (info->fIsString) {
        wsSkipped = u_scanf_skip_leading_ws(input, info->fPadChar);
    }

    cnv = u_getDefaultConverter(&status);
    if (U_FAILURE(status)) {
        return -1;
    }

    /* pull one character at a time, honouring the field width */
    for (;;) {
        /* width exhausted? (checked before touching the input) */
        if (info->fWidth != -1 && numRead >= info->fWidth) {
            break;
        }

        gotChar = ufile_getch(input, &ch);
        if (!gotChar) {
            break;
        }

        /* strings end at the pad character or at any whitespace */
        if (info->fIsString && (ch == info->fPadChar || u_isWhitespace(ch))) {
            break;
        }

        if (!info->fSkipArg) {
            unit = &ch;

            /* One character per call: bound the output either by the room
               left in the field or by the largest possible encoded char. */
            outLimit = (info->fWidth > 0)
                ? out + info->fWidth - numRead
                : out + ucnv_getMaxCharSize(cnv);

            ucnv_fromUnicode(cnv, &out, outLimit, &unit, unit + 1,
                             NULL, TRUE, &status);

            if (U_FAILURE(status)) {
                u_releaseDefaultConverter(cnv);
                return -1;
            }
        }

        ++numRead;
    }

    if (!info->fSkipArg) {
        /* the last character read was a delimiter: give it back */
        if ((info->fWidth == -1 || numRead < info->fWidth) && gotChar) {
            u_fungetc(ch, input);
        }

        if (info->fIsString) {
            *out = 0x00;
        }
    }

    u_releaseDefaultConverter(cnv);

    *argConverted = !info->fSkipArg;
    return numRead + wsSkipped;
}
/*
 * Writes 'count' UChars (or up to the terminating NUL when count < 0) to 'f'.
 *
 * If a transliterator is attached, the text is transliterated first. For a
 * string-backed UFILE (no FILE*), the UChars are copied straight into the
 * string buffer, truncated to the space left. Otherwise the text is converted
 * through f->fConverter - or by invariant conversion if there is none - in
 * UFILE_CHARBUFFER_SIZE sized chunks and written with fwrite().
 *
 * flushIO is passed to the converter as its flush flag; for string output it
 * additionally copies one extra UChar when there is room for it.
 * flushTranslit is forwarded to u_file_translit().
 *
 * Returns the number of UChars written.
 */
U_CFUNC int32_t U_EXPORT2
u_file_write_flush(const UChar *chars,
                   int32_t     count,
                   UFILE       *f,
                   UBool       flushIO,
                   UBool       flushTranslit)
{
    UErrorCode   status = U_ZERO_ERROR;
    const UChar *src    = chars;
    const UChar *srcEnd;
    char         chunk[UFILE_CHARBUFFER_SIZE];
    char        *out    = chunk;
    int32_t      total  = 0;

    if (count < 0) {
        count = u_strlen(chars);
    }

#if !UCONFIG_NO_TRANSLITERATION
    if ((f->fTranslit) && (f->fTranslit->translit)) {
        /* transliterate first; count is updated to the new length */
        src = u_file_translit(f, chars, &count, flushTranslit);
    }
#endif

    /* String target: plain copy, no conversion involved. */
    if (!f->fFile) {
        int32_t room = (int32_t)(f->str.fLimit - f->str.fPos);

        if (flushIO && room > count) {
            count++;
        }
        total = ufmt_min(count, room);
        u_strncpy(f->str.fPos, src, total);
        f->str.fPos += total;
        return total;
    }

    srcEnd = src + count;

    /* File target: convert and write one buffer-full at a time. */
    for (;;) {
        const UChar *chunkStart = src;
        int32_t      produced;

        status = U_ZERO_ERROR;

        if (f->fConverter == NULL) {
            /* invariant conversion, capped at the chunk size */
            int32_t units = (int32_t)(srcEnd - src);

            if (units > UFILE_CHARBUFFER_SIZE) {
                units  = UFILE_CHARBUFFER_SIZE;
                status = U_BUFFER_OVERFLOW_ERROR;
            }
            u_UCharsToChars(src, out, units);
            src += units;
            out += units;
        }
        else {
            ucnv_fromUnicode(f->fConverter,
                             &out,
                             chunk + UFILE_CHARBUFFER_SIZE,
                             &src,
                             srcEnd,
                             NULL,
                             flushIO,
                             &status);
        }

        produced = (int32_t)(out - chunk);
        if (produced > 0) {
            fwrite(chunk, sizeof(char), produced, f->fFile);
            total += (int32_t)(src - chunkStart);
        }
        out = chunk;

        /* only an overflow means there is more to convert */
        if (status != U_BUFFER_OVERFLOW_ERROR) {
            break;
        }
    }

    return total;
}
/*
 * Helper: converts the UTF-16 string 'src' to wchar_t in two steps -
 * first to a char string with the default converter, then from that
 * multibyte string to wide characters with uprv_mbstowcs().
 *
 * dest / destCapacity - output buffer and its size in wchar_t units
 * pDestLength         - if non-NULL, receives the number of wchar_t produced
 * src / srcLength     - input; srcLength == -1 means NUL-terminated
 * pErrorCode          - in/out ICU error code
 *
 * Returns dest, or NULL if the default converter could not be obtained.
 */
static wchar_t*
_strToWCS(wchar_t *dest,
          int32_t destCapacity,
          int32_t *pDestLength,
          const UChar *src,
          int32_t srcLength,
          UErrorCode *pErrorCode){

    char stackBuffer [_STACK_BUFFER_CAPACITY];
    char* tempBuf = stackBuffer;
    int32_t tempBufCapacity = _STACK_BUFFER_CAPACITY;
    char* tempBufLimit = stackBuffer + tempBufCapacity;
    UConverter* conv = NULL;
    /* saveBuf always points at the start of the current char buffer,
       tempBuf is the moving write position inside it */
    char* saveBuf = tempBuf;
    wchar_t* intTarget=NULL;
    int32_t intTargetCapacity=0;
    int count=0,retVal=0;

    const UChar *pSrcLimit =NULL;
    const UChar *pSrc = src;

    conv = u_getDefaultConverter(pErrorCode);

    if(U_FAILURE(*pErrorCode)){
        return NULL;
    }

    if(srcLength == -1){
        srcLength = u_strlen(pSrc);
    }

    pSrcLimit = pSrc + srcLength;

    /* step 1: UChars -> chars, growing the buffer whenever it overflows */
    for(;;) {
        /* reset the error state */
        *pErrorCode = U_ZERO_ERROR;

        /* convert to chars using default converter; the flush flag is set
           once all of the source has been consumed */
        ucnv_fromUnicode(conv,&tempBuf,tempBufLimit,&pSrc,pSrcLimit,NULL,(UBool)(pSrc==pSrcLimit),pErrorCode);
        count =(tempBuf - saveBuf);

        /* This should rarely occur */
        if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR){
            tempBuf = saveBuf;

            /* we don't have enough room on the stack: grow the buffer */
            if(!u_growAnyBufferFromStatic(stackBuffer,(void**) &tempBuf, &tempBufCapacity,
                (_BUFFER_CAPACITY_MULTIPLIER * (srcLength)), count,sizeof(char))){
                goto cleanup;
            }

            /* re-derive start, limit and write position in the new buffer */
            saveBuf = tempBuf;
            tempBufLimit = tempBuf + tempBufCapacity;
            tempBuf = tempBuf + count;

        } else {
            break;
        }
    }

    if(U_FAILURE(*pErrorCode)){
        goto cleanup;
    }

    /* done with conversion: NUL-terminate the char buffer,
       growing it first if there is no room for the terminator */
    if(count>=tempBufCapacity){
        tempBuf = saveBuf;
        /* we don't have enough room on the stack: grow the buffer */
        /* NOTE(review): the requested capacity (tempBufCapacity-count+1) is
           <= 1 here; presumably u_growAnyBufferFromStatic enforces its own
           minimum growth - confirm against its implementation */
        if(!u_growAnyBufferFromStatic(stackBuffer,(void**) &tempBuf, &tempBufCapacity,
            tempBufCapacity-count+1, count,sizeof(char))){
            goto cleanup;
        }
        saveBuf = tempBuf;
    }

    saveBuf[count]=0;


    /* allocate more space than required
     * here we assume that every char requires
     * no more than 2 wchar_ts
     */
    intTargetCapacity = (count * _BUFFER_CAPACITY_MULTIPLIER + 1) /*for null termination */;
    intTarget = (wchar_t*)uprv_malloc( intTargetCapacity * sizeof(wchar_t) );

    if(intTarget){

        /* nulLen: how many bytes of the char buffer have been handled so
           far, including the NUL that ended each converted segment */
        int32_t nulLen = 0;
        int32_t remaining = intTargetCapacity;
        wchar_t* pIntTarget=intTarget;
        tempBuf = saveBuf;

        /* step 2: convert the multibyte string to wide characters,
           one NUL-terminated segment per iteration */
        for(;;){

            /* we can call the system API since we are sure that
             * there is at least 1 null in the input
             */
            retVal = uprv_mbstowcs(pIntTarget,(tempBuf+nulLen),remaining);

            if(retVal==-1){
                *pErrorCode = U_INVALID_CHAR_FOUND;
                break;
            }else if(retVal== remaining){/* should never occur */
                /* the output was filled completely: enlarge it and retry */
                int numWritten = (pIntTarget-intTarget);
                /* NOTE(review): the result of this grow call is not checked */
                u_growAnyBufferFromStatic(NULL,(void**) &intTarget,
                                          &intTargetCapacity,
                                          intTargetCapacity * _BUFFER_CAPACITY_MULTIPLIER,
                                          numWritten,
                                          sizeof(wchar_t));
                pIntTarget = intTarget;
                remaining=intTargetCapacity;

                if(nulLen!=count){ /*there are embedded nulls*/
                    pIntTarget+=numWritten;
                    remaining-=numWritten;
                }

            }else{
                int32_t nulVal;
                /*scan for nulls */
                /* we do not check for limit since tempBuf is null terminated */
                while(tempBuf[nulLen++] != 0){
                }
                /* account for one extra output unit when the NUL just passed
                   lies before srcLength, i.e. is treated as embedded */
                nulVal = (nulLen < srcLength) ? 1 : 0;
                pIntTarget = pIntTarget + retVal+nulVal;
                remaining -=(retVal+nulVal);

                /* check if we have reached the source limit*/
                if(nulLen>=(count)){
                    break;
                }
            }
        }
        /* from here on count is the number of wchar_t produced */
        count = (int32_t)(pIntTarget-intTarget);

        /* copy out only if the whole result fits into dest */
        if(0 < count && count <= destCapacity){
            uprv_memcpy(dest,intTarget,count*sizeof(wchar_t));
        }

        if(pDestLength){
            *pDestLength = count;
        }

        /* free the allocated memory */
        uprv_free(intTarget);

    }else{
        *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
    }
cleanup:
    /* free the char buffer unless it is still the stack buffer */
    if(stackBuffer != saveBuf){
        uprv_free(saveBuf);
    }
    u_terminateWChars(dest,destCapacity,count,pErrorCode);

    u_releaseDefaultConverter(conv);

    return dest;
}
void charsetFilteredOutputStream_icu::writeImpl (const byte_t* const data, const size_t count) { if (m_from == NULL || m_to == NULL) throw exceptions::charset_conv_error("Cannot initialize converters."); // Allocate buffer for Unicode chars const size_t uniSize = ucnv_getMinCharSize(m_from) * count * sizeof(UChar); std::vector <UChar> uniBuffer(uniSize); // Conversion loop UErrorCode toErr = U_ZERO_ERROR; const char* uniSource = reinterpret_cast <const char*>(data); const char* uniSourceLimit = uniSource + count; do { // Convert from source charset to Unicode UChar* uniTarget = &uniBuffer[0]; UChar* uniTargetLimit = &uniBuffer[0] + uniSize; toErr = U_ZERO_ERROR; ucnv_toUnicode(m_from, &uniTarget, uniTargetLimit, &uniSource, uniSourceLimit, NULL, /* flush */ FALSE, &toErr); if (U_FAILURE(toErr) && toErr != U_BUFFER_OVERFLOW_ERROR) { if (toErr == U_INVALID_CHAR_FOUND || toErr == U_TRUNCATED_CHAR_FOUND || toErr == U_ILLEGAL_CHAR_FOUND) { throw exceptions::illegal_byte_sequence_for_charset(); } else { throw exceptions::charset_conv_error ("[ICU] Error converting to Unicode from '" + m_sourceCharset.getName() + "'."); } } const size_t uniLength = uniTarget - &uniBuffer[0]; // Allocate buffer for destination charset const size_t cpSize = ucnv_getMinCharSize(m_to) * uniLength; std::vector <char> cpBuffer(cpSize); // Convert from Unicode to destination charset UErrorCode fromErr = U_ZERO_ERROR; const UChar* cpSource = &uniBuffer[0]; const UChar* cpSourceLimit = &uniBuffer[0] + uniLength; do { char* cpTarget = &cpBuffer[0]; char* cpTargetLimit = &cpBuffer[0] + cpSize; fromErr = U_ZERO_ERROR; ucnv_fromUnicode(m_to, &cpTarget, cpTargetLimit, &cpSource, cpSourceLimit, NULL, /* flush */ FALSE, &fromErr); if (fromErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(fromErr)) { if (fromErr == U_INVALID_CHAR_FOUND || fromErr == U_TRUNCATED_CHAR_FOUND || fromErr == U_ILLEGAL_CHAR_FOUND) { throw exceptions::illegal_byte_sequence_for_charset(); } else { throw exceptions::charset_conv_error 
("[ICU] Error converting from Unicode to '" + m_destCharset.getName() + "'."); } } const size_t cpLength = cpTarget - &cpBuffer[0]; // Write successfully converted bytes m_stream.write(&cpBuffer[0], cpLength); } while (fromErr == U_BUFFER_OVERFLOW_ERROR); } while (toErr == U_BUFFER_OVERFLOW_ERROR); }
void charsetFilteredOutputStream_icu::flush() { if (m_from == NULL || m_to == NULL) throw exceptions::charset_conv_error("Cannot initialize converters."); // Allocate buffer for Unicode chars const size_t uniSize = ucnv_getMinCharSize(m_from) * 1024 * sizeof(UChar); std::vector <UChar> uniBuffer(uniSize); // Conversion loop (with flushing) UErrorCode toErr = U_ZERO_ERROR; const char* uniSource = 0; const char* uniSourceLimit = 0; do { // Convert from source charset to Unicode UChar* uniTarget = &uniBuffer[0]; UChar* uniTargetLimit = &uniBuffer[0] + uniSize; toErr = U_ZERO_ERROR; ucnv_toUnicode(m_from, &uniTarget, uniTargetLimit, &uniSource, uniSourceLimit, NULL, /* flush */ TRUE, &toErr); if (U_FAILURE(toErr) && toErr != U_BUFFER_OVERFLOW_ERROR) { throw exceptions::charset_conv_error ("[ICU] Error converting to Unicode from '" + m_sourceCharset.getName() + "'."); } const size_t uniLength = uniTarget - &uniBuffer[0]; // Allocate buffer for destination charset const size_t cpSize = ucnv_getMinCharSize(m_to) * uniLength; std::vector <char> cpBuffer(cpSize); // Convert from Unicode to destination charset UErrorCode fromErr = U_ZERO_ERROR; const UChar* cpSource = &uniBuffer[0]; const UChar* cpSourceLimit = &uniBuffer[0] + uniLength; do { char* cpTarget = &cpBuffer[0]; char* cpTargetLimit = &cpBuffer[0] + cpSize; fromErr = U_ZERO_ERROR; ucnv_fromUnicode(m_to, &cpTarget, cpTargetLimit, &cpSource, cpSourceLimit, NULL, /* flush */ TRUE, &fromErr); if (fromErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(fromErr)) { throw exceptions::charset_conv_error ("[ICU] Error converting from Unicode to '" + m_destCharset.getName() + "'."); } const size_t cpLength = cpTarget - &cpBuffer[0]; // Write successfully converted bytes m_stream.write(&cpBuffer[0], cpLength); } while (fromErr == U_BUFFER_OVERFLOW_ERROR); } while (toErr == U_BUFFER_OVERFLOW_ERROR); m_stream.flush(); }
/*
 * Converts 'source' (sourceLen UChars) to 'codepage' and compares the bytes,
 * and optionally the offsets, with the expected values.
 *
 * The conversion is driven in pieces of gInBufferSize input units and
 * gOutBufferSize output bytes. Offsets are only checked when both sizes
 * cover the whole buffers (gOutBufferSize equals the size of the output
 * array and gInBufferSize equals NEW_MAX_BUFFER) and expectOffsets is
 * non-NULL.
 *
 * fallback is applied with ucnv_setFallback() before converting.
 *
 * Returns TRUE if the output matches - or if the converter could not be
 * opened, which is only logged as a data error - and FALSE otherwise.
 */
static UBool testConvertFromUnicode(const UChar *source, int sourceLen, const uint8_t *expect, int expectLen,
                const char *codepage, UBool fallback, const int32_t *expectOffsets)
{
    UErrorCode status = U_ZERO_ERROR;
    UConverter *conv = 0;
    char junkout[NEW_MAX_BUFFER]; /* FIX */
    int32_t junokout[NEW_MAX_BUFFER]; /* FIX */
    const UChar *src;
    char *end;
    char *targ;
    int32_t *offs;
    int i;
    int32_t realBufferSize;
    char *realBufferEnd;
    const UChar *realSourceEnd;
    const UChar *sourceLimit;
    UBool checkOffsets = TRUE;
    UBool doFlush;
    UBool action=FALSE;
    char *p;

    /* pre-fill both outputs with recognizable junk */
    for(i=0;i<NEW_MAX_BUFFER;i++)
        junkout[i] = (char)0xF0;
    for(i=0;i<NEW_MAX_BUFFER;i++)
        junokout[i] = 0xFF;
    setNuConvTestName(codepage, "FROM");

    log_verbose("\nTesting========= %s FROM \n inputbuffer= %d outputbuffer= %d\n", codepage, gInBufferSize,
            gOutBufferSize);

    conv = my_ucnv_open(codepage, &status);
    if(U_FAILURE(status))
    {
        log_data_err("Couldn't open converter %s\n",codepage);
        return TRUE;
    }

    log_verbose("Converter opened..\n");

    /*----setting the callback routine----*/
    ucnv_setFallback (conv, fallback);
    action = ucnv_usesFallback(conv);
    if(action != fallback){
        log_err("FAIL: Error is setting fallback. Errocode=%s\n", myErrorName(status));
    }
    /*------------------------*/

    src = source;
    targ = junkout;
    offs = junokout;

    realBufferSize = (sizeof(junkout)/sizeof(junkout[0]));
    realBufferEnd = junkout + realBufferSize;
    realSourceEnd = source + sourceLen;

    /* offsets are only comparable when everything goes through in one piece */
    if ( gOutBufferSize != realBufferSize )
        checkOffsets = FALSE;

    if( gInBufferSize != NEW_MAX_BUFFER )
        checkOffsets = FALSE;

    do
    {
        end = nct_min(targ + gOutBufferSize, realBufferEnd);
        sourceLimit = nct_min(src + gInBufferSize, realSourceEnd);

        doFlush = (UBool)(sourceLimit == realSourceEnd);

        if(targ == realBufferEnd)
        {
            log_err("Error, overflowed the real buffer while about to call fromUnicode! targ=%08lx %s", targ, gNuConvTestName);
            /* do not leak the converter on this early exit */
            ucnv_close(conv);
            return FALSE;
        }
        log_verbose("calling fromUnicode @ SOURCE:%08lx to %08lx TARGET: %08lx to %08lx, flush=%s\n", src,sourceLimit, targ,end, doFlush?"TRUE":"FALSE");

        status = U_ZERO_ERROR;

        ucnv_fromUnicode (conv,
                  (char **)&targ,
                  (const char *)end,
                  &src,
                  sourceLimit,
                  checkOffsets ? offs : NULL,
                  doFlush, /* flush if we're at the end of the input data */
                  &status);
    } while ( (status == U_BUFFER_OVERFLOW_ERROR) || (sourceLimit < realSourceEnd) );

    if(U_FAILURE(status))
    {
        /* myErrorName() yields a string (see its %s use above), so it must
           not be printed with %d; this is also a fromUnicode test */
        log_err("Problem doing fromUnicode, errcode %s %s\n", myErrorName(status), gNuConvTestName);
        /* do not leak the converter on this early exit */
        ucnv_close(conv);
        return FALSE;
    }

    log_verbose("\nConversion done [%d uchars in -> %d chars out]. \nResult :",
        sourceLen, targ-junkout);
    if(VERBOSITY)
    {
        char junk[9999];
        char offset_str[9999];

        junk[0] = 0;
        offset_str[0] = 0;
        for(p = junkout;p<targ;p++)
        {
            sprintf(junk + uprv_strlen(junk), "0x%02x, ", (0xFF) & (unsigned int)*p);
            sprintf(offset_str + strlen(offset_str), "0x%02x, ", (0xFF) & (unsigned int)junokout[p-junkout]);
        }

        log_verbose(junk);
        printSeq((const unsigned char*)expect, expectLen);
        if ( checkOffsets )
        {
            log_verbose("\nOffsets:");
            log_verbose(offset_str);
        }
        log_verbose("\n");
    }
    ucnv_close(conv);

    if(expectLen != targ-junkout)
    {
        log_err("Expected %d chars out, got %d %s\n", expectLen, targ-junkout, gNuConvTestName);
        log_verbose("Expected %d chars out, got %d %s\n", expectLen, targ-junkout, gNuConvTestName);
        printSeqErr((const unsigned char*)junkout, (int32_t)(targ-junkout));
        printSeqErr((const unsigned char*)expect, expectLen);
        return FALSE;
    }

    if (checkOffsets && (expectOffsets != 0) )
    {
        log_verbose("\ncomparing %d offsets..\n", targ-junkout);
        if(uprv_memcmp(junokout,expectOffsets,(targ-junkout) * sizeof(int32_t) )){
            log_err("\ndid not get the expected offsets while %s \n", gNuConvTestName);
            log_err("Got : ");
            printSeqErr((const unsigned char*)junkout, (int32_t)(targ-junkout));
            for(p=junkout;p<targ;p++)
                log_err("%d, ", junokout[p-junkout]);
            log_err("\nExpected: ");
            for(i=0; i<(targ-junkout); i++)
                log_err("%d,", expectOffsets[i]);
        }
    }

    log_verbose("\n\ncomparing..\n");
    if(!memcmp(junkout, expect, expectLen))
    {
        log_verbose("Matches!\n");
        return TRUE;
    }
    else
    {
        log_err("String does not match. %s\n", gNuConvTestName);
        log_verbose("String does not match. %s\n", gNuConvTestName);
        printSeqErr((const unsigned char*)junkout, expectLen);
        printSeqErr((const unsigned char*)expect, expectLen);
        return FALSE;
    }
}
void charsetConverter_icu::convert (utility::inputStream& in, utility::outputStream& out, status* st) { UErrorCode err = U_ZERO_ERROR; ucnv_reset(m_from); ucnv_reset(m_to); if (st) new (st) status(); // From buffers byte_t cpInBuffer[16]; // stream data put here const size_t outSize = ucnv_getMinCharSize(m_from) * sizeof(cpInBuffer) * sizeof(UChar); std::vector <UChar> uOutBuffer(outSize); // Unicode chars end up here // To buffers // converted (char) data end up here const size_t cpOutBufferSz = ucnv_getMaxCharSize(m_to) * outSize; std::vector <char> cpOutBuffer(cpOutBufferSz); // Tell ICU what to do when encountering an illegal byte sequence if (m_options.silentlyReplaceInvalidSequences) { // Set replacement chars for when converting from Unicode to codepage icu::UnicodeString substString(m_options.invalidSequence.c_str()); ucnv_setSubstString(m_to, substString.getTerminatedBuffer(), -1, &err); if (U_FAILURE(err)) throw exceptions::charset_conv_error("[ICU] Error when setting substitution string."); } else { // Tell ICU top stop (and return an error) on illegal byte sequences ucnv_setToUCallBack (m_from, UCNV_TO_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err); if (U_FAILURE(err)) throw exceptions::charset_conv_error("[ICU] Error when setting ToU callback."); ucnv_setFromUCallBack (m_to, UCNV_FROM_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err); if (U_FAILURE(err)) throw exceptions::charset_conv_error("[ICU] Error when setting FromU callback."); } // Input data available while (!in.eof()) { // Read input data into buffer size_t inLength = in.read(cpInBuffer, sizeof(cpInBuffer)); // Beginning of read data const char* source = reinterpret_cast <const char*>(&cpInBuffer[0]); const char* sourceLimit = source + inLength; // end + 1 UBool flush = in.eof(); // is this last run? 
UErrorCode toErr; // Loop until all source has been processed do { // Set up target pointers UChar* target = &uOutBuffer[0]; UChar* targetLimit = &target[0] + outSize; toErr = U_ZERO_ERROR; ucnv_toUnicode(m_from, &target, targetLimit, &source, sourceLimit, NULL, flush, &toErr); if (st) st->inputBytesRead += (source - reinterpret_cast <const char*>(&cpInBuffer[0])); if (toErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(toErr)) { if (toErr == U_INVALID_CHAR_FOUND || toErr == U_TRUNCATED_CHAR_FOUND || toErr == U_ILLEGAL_CHAR_FOUND) { // Error will be thrown later (*) } else { throw exceptions::charset_conv_error("[ICU] Error converting to Unicode from " + m_source.getName()); } } // The Unicode source is the buffer just written and the limit // is where the previous conversion stopped (target is moved in the conversion) const UChar* uSource = &uOutBuffer[0]; UChar* uSourceLimit = &target[0]; UErrorCode fromErr; // Loop until converted chars are fully written do { char* cpTarget = &cpOutBuffer[0]; const char* cpTargetLimit = &cpOutBuffer[0] + cpOutBufferSz; fromErr = U_ZERO_ERROR; // Write converted bytes (Unicode) to destination codepage ucnv_fromUnicode(m_to, &cpTarget, cpTargetLimit, &uSource, uSourceLimit, NULL, flush, &fromErr); if (st) { // Decrement input bytes count by the number of input bytes in error char errBytes[16]; int8_t errBytesLen = sizeof(errBytes); UErrorCode errBytesErr = U_ZERO_ERROR; ucnv_getInvalidChars(m_from, errBytes, &errBytesLen, &errBytesErr); st->inputBytesRead -= errBytesLen; st->outputBytesWritten += cpTarget - &cpOutBuffer[0]; } // (*) If an error occurred while converting from input charset, throw it now if (toErr == U_INVALID_CHAR_FOUND || toErr == U_TRUNCATED_CHAR_FOUND || toErr == U_ILLEGAL_CHAR_FOUND) { throw exceptions::illegal_byte_sequence_for_charset(); } if (fromErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(fromErr)) { if (fromErr == U_INVALID_CHAR_FOUND || fromErr == U_TRUNCATED_CHAR_FOUND || fromErr == U_ILLEGAL_CHAR_FOUND) { 
throw exceptions::illegal_byte_sequence_for_charset(); } else { throw exceptions::charset_conv_error("[ICU] Error converting from Unicode to " + m_dest.getName()); } } // Write to destination stream out.write(&cpOutBuffer[0], (cpTarget - &cpOutBuffer[0])); } while (fromErr == U_BUFFER_OVERFLOW_ERROR); } while (toErr == U_BUFFER_OVERFLOW_ERROR); } }
/*
 * Converts UTF-16 'characters' to the encoding named 'icuName' using ICU.
 *
 * icuName     - ICU converter name
 * flags       - kCFStringEncoding* option bits; unless
 *               kCFStringEncodingPartialInput is set the converter is flushed
 * characters  - input text, numChars UTF-16 units long
 * usedCharLen - if non-NULL, receives the number of input units consumed
 * bytes       - output buffer of maxByteLen bytes. When maxByteLen is 0
 *               nothing is stored and only the required length is computed.
 * usedByteLen - if non-NULL, receives the number of bytes produced (or that
 *               would be produced when maxByteLen is 0)
 *
 * Returns kCFStringEncodingConverterUnavailable if no converter could be
 * created; otherwise a kCFStringEncoding* conversion status OR-ed with the
 * result of releasing the converter.
 */
CF_PRIVATE CFIndex __CFStringEncodingICUToBytes(const char *icuName, uint32_t flags, const UniChar *characters, CFIndex numChars, CFIndex *usedCharLen, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
    UConverter *converter;
    UErrorCode errorCode = U_ZERO_ERROR;
    const UTF16Char *source = characters;
    const UTF16Char *sourceLimit = source + numChars;
    char *destination = (char *)bytes;
    const char *destinationLimit = destination + maxByteLen;
    bool flush = ((0 == (flags & kCFStringEncodingPartialInput)) ? true : false);
    CFIndex status;

    if (NULL == (converter = __CFStringEncodingConverterCreateICUConverter(icuName, flags, false))) return kCFStringEncodingConverterUnavailable;

    if (0 == maxByteLen) {
        /* Length-probe mode: convert into a scratch buffer and add up the sizes. */
        char buffer[MAX_BUFFER_SIZE];
        CFIndex totalLength = 0;
        /* True while the previous call overflowed. In that case the converter
           still holds output bytes internally which must be drained (and
           counted) even if all of the source has already been consumed;
           otherwise the reported length comes out too short. */
        bool outputPending = false;

        while (((source < sourceLimit) || outputPending) && (U_ZERO_ERROR == errorCode)) {
            destination = buffer;
            destinationLimit = destination + MAX_BUFFER_SIZE;

            ucnv_fromUnicode(converter, &destination, destinationLimit, (const UChar **)&source, (const UChar *)sourceLimit, NULL, flush, &errorCode);

            totalLength += (destination - buffer);

            /* Filling the scratch buffer is expected here, not an error. */
            outputPending = (U_BUFFER_OVERFLOW_ERROR == errorCode);
            if (outputPending) errorCode = U_ZERO_ERROR;
        }

        if (NULL != usedByteLen) *usedByteLen = totalLength;
    } else {
        ucnv_fromUnicode(converter, &destination, destinationLimit, (const UChar **)&source, (const UChar *)sourceLimit, NULL, flush, &errorCode);

#if HAS_ICU_BUG_6024743
        /* Another critical ICU design issue. Similar to conversion error, the source pointer returned from U_BUFFER_OVERFLOW_ERROR is already beyond the last valid character position. It renders the returned value from source entirely unusable. We have to manually back up until succeeding <rdar://problem/7183045>
           Interestingly, this issue doesn't apply to ucnv_toUnicode. The asymmetric nature makes this more dangerous */
        if (U_BUFFER_OVERFLOW_ERROR == errorCode) {
            const uint8_t *bitmap = CFUniCharGetBitmapPtrForPlane(kCFUniCharNonBaseCharacterSet, 0);
            const uint8_t *nonBase;
            UTF32Char character;

            /* NOTE(review): 'source' is not modified inside the inner loop, so
               every pass recomputes the same sourceLimit; and when
               sourceLimit <= characters the outer loop changes nothing either.
               Both look like they could fail to terminate - confirm the
               intended back-up logic before relying on this path. */
            do {
                /* Since the output buffer is filled, we can assume no invalid chars (including stray surrogates) */
                do {
                    sourceLimit = (source - 1);
                    character = *sourceLimit;
                    nonBase = bitmap;

                    if (CFUniCharIsSurrogateLowCharacter(character)) {
                        --sourceLimit;
                        character = CFUniCharGetLongCharacterForSurrogatePair(*sourceLimit, character);
                        nonBase = CFUniCharGetBitmapPtrForPlane(kCFUniCharNonBaseCharacterSet, (character >> 16) & 0x000F);
                        character &= 0xFFFF;
                    }
                } while ((sourceLimit > characters) && CFUniCharIsMemberOfBitmap(character, nonBase));

                if (sourceLimit > characters) {
                    /* Retry from the start with the shortened input. */
                    source = characters;
                    destination = (char *)bytes;
                    errorCode = U_ZERO_ERROR;

                    ucnv_resetFromUnicode(converter);

                    ucnv_fromUnicode(converter, &destination, destinationLimit, (const UChar **)&source, (const UChar *)sourceLimit, NULL, flush, &errorCode);
                }
            } while (U_BUFFER_OVERFLOW_ERROR == errorCode);

            /* The overall outcome is still "output buffer too small". */
            errorCode = U_BUFFER_OVERFLOW_ERROR;
        }
#endif
        if (NULL != usedByteLen) *usedByteLen = destination - (const char *)bytes;
    }

    status = ((U_ZERO_ERROR == errorCode) ? kCFStringEncodingConversionSuccess : ((U_BUFFER_OVERFLOW_ERROR == errorCode) ? kCFStringEncodingInsufficientOutputBufferLength : kCFStringEncodingInvalidInputStream));

    if (NULL != usedCharLen) {
#if HAS_ICU_BUG_6024743
        /* ICU has a serious behavioral inconsistency issue that the source pointer returned from ucnv_fromUnicode() is after illegal input. We have to keep track of any changes in this area in order to prevent future binary compatibility issues */
        if (kCFStringEncodingInvalidInputStream == status) {
#define MAX_ERROR_BUFFER_LEN (32)
            UTF16Char errorBuffer[MAX_ERROR_BUFFER_LEN];
            int8_t errorLength = MAX_ERROR_BUFFER_LEN;
#undef MAX_ERROR_BUFFER_LEN

            errorCode = U_ZERO_ERROR;

            ucnv_getInvalidUChars(converter, (UChar *)errorBuffer, &errorLength, &errorCode);

            if (U_ZERO_ERROR == errorCode) {
                /* step back over the offending units */
                source -= errorLength;
            } else {
                /* Gah, something is terribly wrong. Reset everything */
                source = characters; /* 0 length */
                if (NULL != usedByteLen) *usedByteLen = 0;
            }
        }
#endif
        *usedCharLen = source - characters;
    }

    status |= __CFStringEncodingConverterReleaseICUConverter(converter, flags, status);

    return status;
}
// Encodes qcs in this decoder's encoding and returns the bytes.
//
// All-Latin-1 / all-ASCII input takes a shortcut for the matching encodings.
// Otherwise backslashes are replaced with the encoding's currency symbol,
// the text is brought into NFC if it is not already, and it is then
// converted with ICU in chunks of ConversionBufferSize.
//
// allowEntities selects what happens to unencodable characters: an XML
// decimal escape when true, a '?' substitution otherwise.
//
// Returns an empty DeprecatedCString if the ICU converter cannot be created
// or its callbacks cannot be installed.
DeprecatedCString StreamingTextDecoderICU::fromUnicode(const DeprecatedString &qcs, bool allowEntities)
{
    TextEncodingID encoding = m_encoding.effectiveEncoding().encodingID();

    if (encoding == WinLatin1Encoding && qcs.isAllLatin1())
        return qcs.latin1();

    if ((encoding == WinLatin1Encoding || encoding == UTF8Encoding || encoding == ASCIIEncoding)
        && qcs.isAllASCII())
        return qcs.ascii();

    // FIXME: We should see if there is "force ASCII range" mode in ICU;
    // until then, we change the backslash into a yen sign.
    // Encoding will change the yen sign back into a backslash.
    DeprecatedString copy = qcs;
    copy.replace('\\', m_encoding.backslashAsCurrencySymbol());

    if (!m_converterICU)
        createICUConverter();
    if (!m_converterICU)
        return DeprecatedCString();

    // FIXME: when DeprecatedString buffer is latin1, it would be nice to
    // convert from that w/o having to allocate a unicode buffer

    char buffer[ConversionBufferSize];
    const UChar* source = reinterpret_cast<const UChar*>(copy.unicode());
    const UChar* sourceLimit = source + copy.length();

    UErrorCode err = U_ZERO_ERROR;
    DeprecatedString normalizedString;
    if (UNORM_YES != unorm_quickCheck(source, copy.length(), UNORM_NFC, &err)) {
        normalizedString.truncate(copy.length()); // normalization to NFC rarely increases the length, so this first attempt will usually succeed

        int32_t normalizedLength = unorm_normalize(source, copy.length(), UNORM_NFC, 0, reinterpret_cast<UChar*>(const_cast<DeprecatedChar*>(normalizedString.unicode())), copy.length(), &err);
        if (err == U_BUFFER_OVERFLOW_ERROR) {
            // Retry with the length ICU reported as necessary.
            err = U_ZERO_ERROR;
            normalizedString.truncate(normalizedLength);
            normalizedLength = unorm_normalize(source, copy.length(), UNORM_NFC, 0, reinterpret_cast<UChar*>(const_cast<DeprecatedChar*>(normalizedString.unicode())), normalizedLength, &err);
        }

        source = reinterpret_cast<const UChar*>(normalizedString.unicode());
        sourceLimit = source + normalizedLength;
    }

    DeprecatedCString result(1); // for trailing zero

    if (allowEntities)
        ucnv_setFromUCallBack(m_converterICU, UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, 0, 0, &err);
    else {
        ucnv_setSubstChars(m_converterICU, "?", 1, &err);
        ucnv_setFromUCallBack(m_converterICU, UCNV_FROM_U_CALLBACK_SUBSTITUTE, 0, 0, 0, &err);
    }

    ASSERT(U_SUCCESS(err));
    if (U_FAILURE(err))
        return DeprecatedCString();

    do {
        char* target = buffer;
        // Keep the last byte of the buffer for the terminator written below.
        // With the limit at the very end of the array, a full chunk made
        // buffer[count] = 0 store one byte past the end of 'buffer'.
        char* targetLimit = target + ConversionBufferSize - 1;
        err = U_ZERO_ERROR;
        ucnv_fromUnicode(m_converterICU, &target, targetLimit, &source, sourceLimit, 0, true, &err);
        int count = target - buffer;
        buffer[count] = 0;
        result.append(buffer);
    } while (err == U_BUFFER_OVERFLOW_ERROR);

    return result;
}
/*
 * Writes 'count' UChars to the file behind 'f'.
 *
 * If a transliterator is attached, the text is transliterated first. It is
 * then converted through f->fConverter - or by invariant conversion if there
 * is none - into f->fCharBuffer, one buffer-full at a time, and each piece is
 * written with fwrite(). 'flush' is forwarded to the transliterator and to
 * the converter.
 *
 * Returns the total number of converted bytes handed to fwrite().
 */
U_CAPI int32_t U_EXPORT2
u_file_write_flush( const UChar     *chars,
                    int32_t         count,
                    UFILE           *f,
                    UBool           flush)
{
    /* Set up conversion parameters */
    UErrorCode  status       = U_ZERO_ERROR;
    const UChar *mySource    = chars;
    const UChar *mySourceEnd = chars + count;
    char        *myTarget    = f->fCharBuffer;
    int32_t     bufferSize   = UFILE_CHARBUFFER_SIZE;
    int32_t     written      = 0;
    int32_t     numConverted = 0;

#if !UCONFIG_NO_TRANSLITERATION
    if((f->fTranslit) && (f->fTranslit->translit))
    {
        /* Do the transliteration; count is updated to the new length */
        mySource = u_file_translit(f, chars, &count, flush);
        mySourceEnd = mySource + count;
    }
#endif

    /* Perform the conversion in a loop */
    do {
        status = U_ZERO_ERROR;
        if(f->fConverter != NULL) { /* We have a valid converter */
            ucnv_fromUnicode(f->fConverter,
                &myTarget,
                f->fCharBuffer + bufferSize,
                &mySource,
                mySourceEnd,
                NULL,
                flush,
                &status);
        } else { /*weiv: do the invariant conversion */
            /* Never copy more than the buffer holds: copying all 'count'
               units at once overran f->fCharBuffer for long input. Longer
               input is handled in several passes, signalled to the loop the
               same way the converter signals a full buffer. */
            int32_t convertChars = (int32_t)(mySourceEnd - mySource);
            if (convertChars > bufferSize) {
                convertChars = bufferSize;
                status = U_BUFFER_OVERFLOW_ERROR;
            }
            u_UCharsToChars(mySource, myTarget, convertChars);
            mySource += convertChars;
            myTarget += convertChars;
        }
        numConverted = (int32_t)(myTarget - f->fCharBuffer);

        if (numConverted > 0) {
            /* write the converted bytes */
            fwrite(f->fCharBuffer,
                sizeof(char),
                numConverted,
                f->fFile);

            written += numConverted;
        }
        /* start the next pass at the beginning of the buffer */
        myTarget = f->fCharBuffer;
    }
    while(status == U_BUFFER_OVERFLOW_ERROR);

    /* return # of bytes written */
    return written;
}
// Sample 46: reads UTF-16 text from "data40.utf16" (produced by
// convsample_40), converts it to iso-8859-2 with ucnv_fromUnicode() and
// writes the result to "data46.out".
//
// Returns U_FILE_ACCESS_ERROR if either file cannot be opened and
// U_ZERO_ERROR on success. Unexpected conversion problems trip an assert;
// in builds where asserts are compiled out, the failing status is returned
// instead.
UErrorCode convsample_46()
{
  printf("\n\n==============================================\n"
         "Sample 46: C: convert data40.utf16 from UTF16 to latin2 [data46.out]\n");

  FILE *f;
  FILE *out;
  int32_t count;
  UChar inBuf[BUFFERSIZE];
  const UChar *source;
  const UChar *sourceLimit;
  char *buf;
  char *target;
  char *targetLimit;

  int32_t bufSize = 0;
  UConverter *conv = NULL;
  UErrorCode status = U_ZERO_ERROR;
  uint32_t inchars=0, total=0;

  f = fopen("data40.utf16", "rb");
  if(!f)
  {
    fprintf(stderr, "Couldn't open file 'data40.utf16' (did you run convsample_40() ?)\n");
    return U_FILE_ACCESS_ERROR;
  }

  out = fopen("data46.out", "wb");
  if(!out)
  {
    fprintf(stderr, "Couldn't create file 'data46.out'.\n");
    fclose(f);
    return U_FILE_ACCESS_ERROR;
  }

  // **************************** START SAMPLE *******************
  conv = ucnv_open( "iso-8859-2", &status);
  assert(U_SUCCESS(status));
  if(U_FAILURE(status))
  {
    // only reachable when asserts are compiled out
    fclose(f);
    fclose(out);
    return status;
  }

  bufSize = (BUFFERSIZE*ucnv_getMaxCharSize(conv));
  printf("input UChars[16] %d * max charsize %d = %d bytes output buffer\n",
         BUFFERSIZE, ucnv_getMaxCharSize(conv), bufSize);
  buf = (char*)malloc(bufSize * sizeof(char));
  assert(buf!=NULL);
  if(buf == NULL)
  {
    // only reachable when asserts are compiled out
    ucnv_close(conv);
    fclose(f);
    fclose(out);
    return U_MEMORY_ALLOCATION_ERROR;
  }

  // grab another buffer's worth
  while(U_SUCCESS(status) &&
        (!feof(f)) &&
        ((count=fread(inBuf, sizeof(UChar), BUFFERSIZE , f)) > 0) )
  {
    inchars += count;

    // Convert Unicode to bytes
    source = inBuf;
    sourceLimit = inBuf + count;

    do
    {
        target = buf;
        targetLimit = buf + bufSize;

        ucnv_fromUnicode( conv, &target, targetLimit,
                       &source, sourceLimit, NULL,
                       feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
                                   /* is true (when no more data will come) */
                         &status);

        if(status == U_BUFFER_OVERFLOW_ERROR)
        {
          // simply ran out of space - we'll reset the target ptr the next
          // time through the loop.
          status = U_ZERO_ERROR;
        }
        else
        {
          // Check other errors here.
          assert(U_SUCCESS(status));
          // Without asserts a failing status would make every further
          // call return immediately and this loop spin forever.
          if(U_FAILURE(status))
          {
            break;
          }
        }

        // Process the converted bytes. The write must not live inside the
        // assert(): with NDEBUG the whole expression would be dropped and
        // nothing would reach the file.
        const size_t produced = (size_t)(target-buf);
        const size_t stored = fwrite(buf, sizeof(buf[0]), produced, out);
        assert(stored == produced);
        (void)stored; // unused when asserts are compiled out
        total += (target-buf);
    } while (source < sourceLimit); // while simply out of space
  }

  // the casts make the arguments match the %d conversions
  printf("%d Uchars (%d bytes) in, %d chars out.\n",
         (int)inchars, (int)(inchars * sizeof(UChar)), (int)total);

  // ***************************** END SAMPLE ********************
  free(buf);
  ucnv_close(conv);

  fclose(f);
  fclose(out);
  printf("\n");

  return U_FAILURE(status) ? status : U_ZERO_ERROR;
}