/*
 * Convert the bytes described by "src" from the encoding of converter "from"
 * to the encoding of converter "to" using ucnv_convertEx().
 *
 * dstbuff == NULL : the output is written at stringpool.free after reserving
 *                   space with ENSURE_STP_FREE_SPACE; this routine does not
 *                   itself advance stringpool.free.
 * dstbuff != NULL : the output is written to dstbuff, whose capacity is read
 *                   from *bufflen.
 *
 * Returns the number of bytes written, 0 when src is empty, or -1 when the
 * caller-supplied buffer is too small (in which case *bufflen is overwritten
 * with an upper-bound estimate of the space needed).
 */
int gtm_conv(UConverter* from, UConverter* to, mstr *src, char* dstbuff, int* bufflen)
{
	char		*out_start, *out_cur, *in_cur;
	const char	*src_chset_name;
	int		out_capacity, uchar_estimate, in_len;
	UErrorCode	conv_status, name_status;

	if (0 == src->len)
		return 0;
	if (NULL != dstbuff)
	{	/* Caller owns the destination and tells us how big it is */
		out_start = dstbuff;
		out_capacity = *bufflen;
	} else
	{	/* Size a stringpool area: worst-case number of UChars in the source (smallest
		 * source character size) times the largest character size of the target.
		 */
		uchar_estimate = (src->len / ucnv_getMinCharSize(from)) + 1;
		out_capacity = UCNV_GET_MAX_BYTES_FOR_STRING(uchar_estimate, ucnv_getMaxCharSize(to));
		if (out_capacity > MAX_STRLEN)
			out_capacity = MAX_STRLEN;
		ENSURE_STP_FREE_SPACE(out_capacity);
		out_start = (char *)stringpool.free;
	}
	in_cur = src->addr;
	in_len = (int)src->len;
	out_cur = out_start;
	conv_status = U_ZERO_ERROR;	/* ICU requires the status to start out as "success" */
	ucnv_convertEx(to, from, &out_cur, out_cur + out_capacity, (const char**)&in_cur, in_cur + in_len,
		NULL, NULL, NULL, NULL, TRUE, TRUE, &conv_status);
	if (!U_FAILURE(conv_status))
		return (int) (out_cur - out_start);
	if (U_BUFFER_OVERFLOW_ERROR == conv_status)
	{
		if (NULL != dstbuff)
		{	/* Caller's buffer was too small: report how much would be enough */
			uchar_estimate = (in_len / ucnv_getMinCharSize(from)) + 1;
			*bufflen = UCNV_GET_MAX_BYTES_FOR_STRING(uchar_estimate, ucnv_getMaxCharSize(to));
			return -1;
		}
		/* Even the capped stringpool area was not enough */
		rts_error_csa(NULL, VARLSTCNT(1) ERR_MAXSTRLEN);
	}
	/* Any other failure is reported as a bad character in the source character set */
	name_status = U_ZERO_ERROR;
	src_chset_name = ucnv_getName(from, &name_status);
	assert(U_SUCCESS(name_status));
	UTF8_BADCHAR(1,(unsigned char *) (in_cur - 1), NULL,STRLEN(src_chset_name), src_chset_name);
	return (int) (out_cur - out_start);
}
// Fill in the charset descriptor `cs` for the ICU converter called
// `charSetName`.
//
// Returns false when ICU cannot open a converter of that name (in which case
// `cs` is left untouched); otherwise returns whether the final ICU status is
// a success after the space character has been encoded.
bool CSICU_charset_init(charset* cs, const ASCII* charSetName)
{
	UErrorCode icuStatus = U_ZERO_ERROR;
	UConverter* converter = ucnv_open(charSetName, &icuStatus);

	if (!U_SUCCESS(icuStatus))
		return false;

	// charSetName comes from stack. Keep a private heap copy of it.
	const size_t nameBytes = strlen(charSetName) + 1;
	ASCII* nameCopy = new ASCII[nameBytes];
	memcpy(nameCopy, charSetName, nameBytes);
	cs->charset_name = nameCopy;

	cs->charset_version = CHARSET_VERSION_1;
	cs->charset_flags |= CHARSET_ASCII_BASED;
	cs->charset_min_bytes_per_char = ucnv_getMinCharSize(converter);
	cs->charset_max_bytes_per_char = ucnv_getMaxCharSize(converter);
	cs->charset_fn_destroy = charset_destroy;
	cs->charset_fn_well_formed = NULL;

	// Encode U+0020 in the target charset and record its byte sequence.
	const UChar unicodeSpace = 32;
	BYTE* spaceBytes = new BYTE[cs->charset_max_bytes_per_char];
	cs->charset_space_character = spaceBytes;
	cs->charset_space_length = ucnv_fromUChars(
		converter,
		reinterpret_cast<char*>(spaceBytes),
		cs->charset_max_bytes_per_char,
		&unicodeSpace,
		1,
		&icuStatus);
	fb_assert(U_SUCCESS(icuStatus));

	ucnv_close(converter);

	CVICU_convert_init(cs);

	return U_SUCCESS(icuStatus);
}
/*
 * Convert srcsiz bytes at src from the encoding of `from` to the encoding of
 * `to`, going through a temporary UTF-16 buffer.
 *
 * When `to` is NULL no conversion is done and the input is simply duplicated.
 *
 * On success returns 0, stores a newly malloc'ed, NUL-terminated buffer in
 * *dst (the caller must free() it) and its length in bytes, excluding the
 * terminator, in *dstsiz.
 *
 * Returns -1 on empty input, NULL arguments, allocation failure or ICU
 * conversion failure. If conversion fails after the output buffer has been
 * allocated, the buffer is released and *dst is set to NULL.
 */
inline int mod_websocket_conv(UConverter *to, UConverter *from,
                              char **dst, size_t *dstsiz,
                              const char *src, size_t srcsiz)
{
    UErrorCode err = U_ZERO_ERROR;
    size_t unisiz;
    UChar *unibuf;
    UChar *uniend;
    const UChar *uniread;
    char *pdst;

    /* Validate the output/input pointers before ANY path dereferences them. */
    if (srcsiz == 0 || !dst || !dstsiz || !src) {
        return -1;
    }

    if (!to) {
        /* No target converter: hand back a plain copy. */
        *dst = (char *)malloc(srcsiz + 1);
        if (*dst == NULL) {
            return -1;
        }
        memcpy(*dst, src, srcsiz);
        (*dst)[srcsiz] = '\0';
        *dstsiz = srcsiz;
        return 0;
    }
    if (!from) {
        return -1;
    }

    /* Upper bound on the number of UChars the source can decode to. */
    unisiz = srcsiz / ucnv_getMinCharSize(from);
    /* One extra UChar (not one extra byte) for the terminator written below. */
    unibuf = (UChar *)malloc(sizeof(UChar) * (unisiz + 1));
    if (!unibuf) {
        return -1;
    }

    uniend = unibuf;
    /* NOTE(review): flush stays FALSE as in the original, so a truncated
     * trailing sequence is kept in the converter rather than reported. */
    ucnv_toUnicode(from, &uniend, uniend + unisiz,
                   &src, src + srcsiz, 0, 0, &err);
    if (U_FAILURE(err)) {
        free(unibuf);
        return -1;
    }
    *uniend = 0;

    /* Worst case: every UChar expands to the target's largest character. */
    *dstsiz = (size_t)(uniend - unibuf) * ucnv_getMaxCharSize(to);
    *dst = (char *)malloc(*dstsiz + 1);
    if (!*dst) {
        free(unibuf);
        return -1;
    }

    pdst = *dst;
    uniread = unibuf;
    ucnv_fromUnicode(to, &pdst, pdst + *dstsiz,
                     &uniread, uniend, 0, 0, &err);
    free(unibuf);
    if (U_FAILURE(err)) {
        free(*dst);
        *dst = NULL;    /* do not leave a dangling pointer with the caller */
        return -1;
    }

    *pdst = '\0';
    *dstsiz = (size_t)(pdst - *dst);
    return 0;
}
/*
 * Report the size of a UCHARBUF.
 *
 * NULL buffer      : 0
 * not "isBuffered" : number of UChars between buffer and bufLimit
 * "isBuffered"     : (file size - signature length) divided by the
 *                    converter's minimum bytes per character
 */
U_CAPI int32_t U_EXPORT2
ucbuf_size(UCHARBUF* buf)
{
    if (!buf) {
        return 0;
    }
    if (!buf->isBuffered) {
        return (int32_t)(buf->bufLimit - buf->buffer);
    }
    return (T_FileStream_size(buf->in) - buf->signatureLength) / ucnv_getMinCharSize(buf->conv);
}
// --------------------------------------------------------------------------- // ICUTranscoder: Constructors and Destructor // --------------------------------------------------------------------------- ICUTranscoder::ICUTranscoder(const XMLCh* const encodingName , UConverter* const toAdopt , const unsigned int blockSize , MemoryManager* const manager) : XMLTranscoder(encodingName, blockSize, manager) , fConverter(toAdopt) , fFixed(false) , fSrcOffsets(0) { // If there is a block size, then allocate our source offset array if (blockSize) fSrcOffsets = (XMLUInt32*) manager->allocate ( blockSize * sizeof(XMLUInt32) );//new XMLUInt32[blockSize]; // Remember if its a fixed size encoding fFixed = (ucnv_getMaxCharSize(fConverter) == ucnv_getMinCharSize(fConverter)); }
/*
 * Private helper used for buffering input.
 *
 * Moves any unread UChars to the front of f->fUCBuffer, reads more bytes
 * from f->fFile (one line at a time when the file number is 0) and converts
 * them to UChars behind the retained data. Does nothing when f has no FILE,
 * or when reading from file number 0 while unread data is still present.
 */
void
ufile_fill_uchar_buffer(UFILE *f)
{
    UErrorCode convStatus;
    const char *bytes;
    const char *bytesEnd;
    UChar *dest;
    int32_t ucharCapacity;
    int32_t bytesWanted;
    int32_t bytesPerSlot;
    int32_t nRead;
    int32_t freeUChars;
    int32_t pending;
    char charBuffer[UFILE_CHARBUFFER_SIZE];
    u_localized_string *str;

    /* There is nothing to do. It's a string. */
    if (f->fFile == NULL) {
        return;
    }

    str = &f->str;
    pending = (int32_t)(str->fLimit - str->fPos);

    /* Don't read from stdin too many times. There is still some data. */
    if (f->fFileno == 0 && pending > 0) {
        return;
    }

    /* slide the unread UChars down to the start of the buffer */
    if (pending != 0) {
        uprv_memmove(f->fUCBuffer, str->fPos, pending * sizeof(UChar));
    }

    /* UChar slots still free after the retained data */
    freeUChars = UFILE_UCHARBUFFER_SIZE - pending;

    /* Number of codepage bytes to request for those slots. */
    /* weiv: if converter is NULL, we use invariant converter with charwidth = 1 */
    if (f->fConverter != NULL) {
        bytesPerSlot = 2 * ucnv_getMinCharSize(f->fConverter);
    }
    else {
        bytesPerSlot = 1;
    }
    bytesWanted = freeUChars / bytesPerSlot;

    /* Read in the data to convert */
    if (f->fFileno != 0) {
        /* A normal file */
        nRead = (int32_t)fread(charBuffer,
                               sizeof(char),
                               ufmt_min(bytesWanted, UFILE_CHARBUFFER_SIZE),
                               f->fFile);
    }
    else {
        /* Special case. Read from stdin one line at a time. */
        char *line = fgets(charBuffer, ufmt_min(bytesWanted, UFILE_CHARBUFFER_SIZE), f->fFile);
        nRead = (int32_t)(line ? uprv_strlen(charBuffer) : 0);
    }

    /* Set up conversion parameters */
    convStatus = U_ZERO_ERROR;
    bytes = charBuffer;
    bytesEnd = charBuffer + nRead;
    dest = f->fUCBuffer + pending;
    ucharCapacity = UFILE_UCHARBUFFER_SIZE;

    if (f->fConverter == NULL) {
        /* weiv: do the invariant conversion */
        u_charsToUChars(bytes, dest, nRead);
        dest += nRead;
    }
    else {
        /* We have a valid converter: flush it once the file is at EOF */
        ucnv_toUnicode(f->fConverter,
                       &dest,
                       f->fUCBuffer + ucharCapacity,
                       &bytes,
                       bytesEnd,
                       NULL,
                       (UBool)(feof(f->fFile) != 0),
                       &convStatus);
    }

    /* update the pointers into our array */
    str->fPos = str->fBuffer;
    str->fLimit = dest;
}
/*
 * Sample 40: convert the cp37 file "data02.bin" to UTF-16 and write the
 * result to "data40.utf16".
 *
 * Returns U_FILE_ACCESS_ERROR when either file cannot be opened, otherwise
 * U_ZERO_ERROR.
 */
UErrorCode convsample_40()
{
  printf("\n\n==============================================\n"
         "Sample 40: C: convert data02.bin from cp37 to UTF16 [data40.utf16]\n");

  FILE *f;
  FILE *out;
  int32_t count;
  char inBuf[BUFFERSIZE];
  const char *source;
  const char *sourceLimit;
  UChar *uBuf;
  UChar *target;
  UChar *targetLimit;
  int32_t uBufSize = 0;
  UConverter *conv = NULL;
  UErrorCode status = U_ZERO_ERROR;
  uint32_t inbytes=0, total=0;

  f = fopen("data02.bin", "rb");
  if(!f)
  {
    fprintf(stderr, "Couldn't open file 'data02.bin' (cp37 data file).\n");
    return U_FILE_ACCESS_ERROR;
  }

  out = fopen("data40.utf16", "wb");
  if(!out)
  {
    fprintf(stderr, "Couldn't create file 'data40.utf16'.\n");
    fclose(f);
    return U_FILE_ACCESS_ERROR;
  }

  // **************************** START SAMPLE *******************
  conv = ucnv_openCCSID(37, UCNV_IBM, &status);
  assert(U_SUCCESS(status));

  uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
  printf("input bytes %d / min chars %d = %d UChars\n",
         BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
  uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
  assert(uBuf!=NULL);

  // grab another buffer's worth
  while((!feof(f)) &&
        ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
  {
    inbytes += count;

    // Convert bytes to unicode
    source = inBuf;
    sourceLimit = inBuf + count;

    do
    {
        target = uBuf;
        targetLimit = uBuf + uBufSize;

        ucnv_toUnicode( conv, &target, targetLimit,
                       &source, sourceLimit, NULL,
                       feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
                                   /* is true (when no more data will come) */
                         &status);

        if(status == U_BUFFER_OVERFLOW_ERROR)
        {
          // simply ran out of space - we'll reset the target ptr the next
          // time through the loop.
          status = U_ZERO_ERROR;
        }
        else
        {
          // Check other errors here.
          assert(U_SUCCESS(status));
          // Break out of the loop (by force)
        }

        // Process the Unicode
        // Todo: handle UTF-16/surrogates
        // The write must happen outside assert(): with NDEBUG defined the
        // whole assert expression is removed and nothing would be written.
        const size_t unitsToWrite = (size_t)(target-uBuf);
        const size_t unitsWritten = fwrite(uBuf, sizeof(uBuf[0]), unitsToWrite, out);
        assert(unitsWritten == unitsToWrite);
        (void)unitsWritten;   // only read by the assert above
        total += (target-uBuf);
    } while (source < sourceLimit); // while simply out of space
  }

  printf("%u bytes in, %u UChars out.\n", (unsigned)inbytes, (unsigned)total);

  // ***************************** END SAMPLE ********************
  free(uBuf);
  ucnv_close(conv);

  fclose(f);
  fclose(out);
  printf("\n");

  return U_ZERO_ERROR;
}
/*
 * Sample 06: print a frequency table of the code points found in the UTF-8
 * file "data06.txt", plus counts of letters and of "ie" / "gh" digraphs.
 *
 * Returns U_FILE_ACCESS_ERROR when the file cannot be opened,
 * U_MEMORY_ALLOCATION_ERROR when the frequency table cannot be allocated,
 * U_UNSUPPORTED_ERROR when a code point outside the table (>= charCount) is
 * met, otherwise U_ZERO_ERROR. Every exit path releases the file, converter
 * and buffers it acquired.
 */
UErrorCode convsample_06()
{
  printf("\n\n==============================================\n"
         "Sample 06: C: frequency distribution of letters in a UTF-8 document\n");

  FILE *f;
  int32_t count;
  char inBuf[BUFFERSIZE];
  const char *source;
  const char *sourceLimit;
  UChar *uBuf;
  int32_t uBufSize = 0;
  UConverter *conv;
  UErrorCode status = U_ZERO_ERROR;
  uint32_t letters=0, total=0;

  CharFreqInfo   *info;
  UChar32   charCount = 0x10000;  /* increase this if you want to handle non bmp.. todo: automatically bump it.. */
  UChar32   p;

  uint32_t ie = 0;
  uint32_t gh = 0;
  UChar32 l = 0;

  f = fopen("data06.txt", "r");
  if(!f)
  {
    fprintf(stderr, "Couldn't open file 'data06.txt' (UTF-8 data file).\n");
    return U_FILE_ACCESS_ERROR;
  }

  info = (CharFreqInfo*)malloc(sizeof(CharFreqInfo) * charCount);
  if(!info)
  {
    // Bail out: continuing would dereference the NULL table below.
    fprintf(stderr, " Couldn't allocate %lu bytes for freq counter\n",
            (unsigned long)(sizeof(CharFreqInfo)*charCount));
    fclose(f);
    return U_MEMORY_ALLOCATION_ERROR;
  }

  /* reset frequencies */
  for(p=0;p<charCount;p++)
  {
    info[p].codepoint = p;
    info[p].frequency = 0;
  }

  // **************************** START SAMPLE *******************
  conv = ucnv_open("utf-8", &status);
  assert(U_SUCCESS(status));

  uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
  printf("input bytes %d / min chars %d = %d UChars\n",
         BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
  uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
  assert(uBuf!=NULL);

  // grab another buffer's worth
  while((!feof(f)) &&
        ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
  {
    // Convert bytes to unicode
    source = inBuf;
    sourceLimit = inBuf + count;

    while(source < sourceLimit)
    {
      p = ucnv_getNextUChar(conv, &source, sourceLimit, &status);
      if(U_FAILURE(status))
      {
        fprintf(stderr, "%s @ %d\n", u_errorName(status), total);
        status = U_ZERO_ERROR;
        continue;
      }
      // U_ZERO_ERROR is 0, so the status itself must not be asserted.
      assert(U_SUCCESS(status));
      total++;

      if(u_isalpha(p))
        letters++;

      if((u_tolower(l) == 'i') && (u_tolower(p) == 'e'))
        ie++;

      if((u_tolower(l) == 'g') && (u_tolower(p) == 0x0127))
        gh++;

      // Valid table indices are 0 .. charCount-1.
      if(p >= charCount)
      {
        fprintf(stderr, "U+%06X: oh.., we only handle BMP characters so far.. redesign!\n", p);
        free(uBuf);
        free(info);
        ucnv_close(conv);
        fclose(f);
        return U_UNSUPPORTED_ERROR;
      }
      info[p].frequency++;
      l = p;
    }
  }

  fclose(f);
  free(uBuf);
  ucnv_close(conv);

  printf("%d letters out of %d total UChars.\n", letters, total);
  printf("%d ie digraphs, %d gh digraphs.\n", ie, gh);

  // now, we could sort it..

  //  qsort(info, charCount, sizeof(info[0]), charfreq_compare);

  for(p=0;p<charCount;p++)
  {
    if(info[p].frequency)
    {
      printf("% 5d U+%06X ", info[p].frequency, p);
      if(p <= 0xFFFF)
      {
        prettyPrintUChar((UChar)p);
      }
      printf("\n");
    }
  }
  free(info);
  // ***************************** END SAMPLE ********************

  printf("\n");

  return U_ZERO_ERROR;
}
/*
 * Sample 05: count the letters among the UChars obtained by converting the
 * UTF-8 file "data01.txt".
 *
 * Returns U_FILE_ACCESS_ERROR when the file cannot be opened, otherwise
 * U_ZERO_ERROR.
 */
UErrorCode convsample_05()
{
  printf("\n\n==============================================\n"
         "Sample 05: C: count the number of letters in a UTF-8 document\n");

  FILE *f;
  int32_t count;
  char inBuf[BUFFERSIZE];
  const char *source;
  const char *sourceLimit;
  UChar *uBuf;
  UChar *target;
  UChar *targetLimit;
  UChar *p;
  int32_t uBufSize = 0;
  UConverter *conv;
  UErrorCode status = U_ZERO_ERROR;
  uint32_t letters=0, total=0;

  f = fopen("data01.txt", "r");
  if(!f)
  {
    fprintf(stderr, "Couldn't open file 'data01.txt' (UTF-8 data file).\n");
    return U_FILE_ACCESS_ERROR;
  }

  // **************************** START SAMPLE *******************
  conv = ucnv_open("utf-8", &status);
  assert(U_SUCCESS(status));

  uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv));
  printf("input bytes %d / min chars %d = %d UChars\n",
         BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize);
  uBuf = (UChar*)malloc(uBufSize * sizeof(UChar));
  assert(uBuf!=NULL);

  // grab another buffer's worth
  while((!feof(f)) &&
        ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) )
  {
    // Convert bytes to unicode
    source = inBuf;
    sourceLimit = inBuf + count;

    do
    {
        target = uBuf;
        targetLimit = uBuf + uBufSize;

        ucnv_toUnicode(conv, &target, targetLimit,
                       &source, sourceLimit, NULL,
                       feof(f)?TRUE:FALSE,         /* pass 'flush' when eof */
                                   /* is true (when no more data will come) */
                       &status);

        if(status == U_BUFFER_OVERFLOW_ERROR)
        {
          // simply ran out of space - we'll reset the target ptr the next
          // time through the loop.
          status = U_ZERO_ERROR;
        }
        else
        {
          // Check other errors here.
          assert(U_SUCCESS(status));
          // Break out of the loop (by force)
        }

        // Process the Unicode
        // Todo: handle UTF-16/surrogates
        for(p = uBuf; p<target; p++)
        {
          if(u_isalpha(*p))
            letters++;
          total++;
        }
    } while (source < sourceLimit); // while simply out of space
  }

  printf("%u letters out of %u total UChars.\n", (unsigned)letters, (unsigned)total);

  // ***************************** END SAMPLE ********************
  free(uBuf);   // the conversion buffer was previously never released
  ucnv_close(conv);

  printf("\n");

  fclose(f);

  return U_ZERO_ERROR;
}
/*
 * Convert the contents of `buffer`, interpreted in the named `encoding`,
 * to UTF-16.
 *
 * buffer        text value holding the bytes to convert
 * encoding      text value naming the ICU converter to open
 * uBuf          receives a palloc0'ed UChar buffer; the caller must pfree it
 * uBuf_len      on success, converted length plus 1 for the NUL terminator
 * force         when true, install callbacks so that bytes that cannot be
 *               converted are skipped and flagged
 * dropped_bytes on success, the flag recorded by the flagging callback
 *               (false when `force` is not set)
 *
 * Returns the final ICU status. Failures are reported with ereport(WARNING);
 * *uBuf_len and *dropped_bytes are only written on success.
 */
UErrorCode
convert_to_unicode(const text* buffer, const text* encoding, UChar** uBuf,
                   int32_t *uBuf_len, bool force, bool* dropped_bytes)
{
    UErrorCode      status = U_ZERO_ERROR;
    UConverter     *conv;
    int32_t         uConvertedLen = 0;
    /* only set when force is true; carries the "bytes were dropped" flag */
    ToUFLAGContext *context = NULL;
    size_t          uBufSize = 0;
    const char     *encoding_cstr = text_to_cstring(encoding);

    /* open converter for detected encoding */
    conv = ucnv_open(encoding_cstr, &status);
    if (U_FAILURE(status))
    {
        ereport(WARNING,
                (errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION),
                 errmsg("Cannot open %s converter - error: %s.\n",
                        (const char *) encoding_cstr, u_errorName(status))));
        goto done;
    }

    if (force)
    {
        /*
         * First make SKIP the converter's callback, so that illegal,
         * irregular or unassigned bytes are skipped. The flagging callback
         * installed next stores the previous callback in the context.
         */
        ucnv_setToUCallBack(conv, UCNV_TO_U_CALLBACK_SKIP, NULL, NULL, NULL, &status);
        /* TODO: refactor warning and error message reporting */
        if (U_FAILURE(status))
        {
            ereport(WARNING,
                    (errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION),
                     errmsg("Cannot set callback on converter - error: %s.\n",
                            u_errorName(status))));
            goto done;
        }

        /* initialize flagging callback */
        context = flagCB_toU_openContext();

        /* Set our special callback */
        ucnv_setToUCallBack(conv,
                            flagCB_toU,
                            context,
                            &(context->subCallback),
                            &(context->subContext),
                            &status
                           );
        if (U_FAILURE(status))
        {
            ereport(WARNING,
                    (errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION),
                     errmsg("Cannot set callback on converter - error: %s.\n",
                            u_errorName(status))));
            goto done;
        }
    }

    /*
     * allocate unicode buffer
     * must pfree before exiting calling function
     */
    uBufSize = (VARSIZE_ANY_EXHDR(buffer)/ucnv_getMinCharSize(conv) + 1);
    *uBuf = (UChar*) palloc0(uBufSize * sizeof(UChar));
    if (*uBuf == NULL)
    {
        status = U_MEMORY_ALLOCATION_ERROR;
        ereport(WARNING,
                (errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION),
                 errmsg("Cannot allocate %d bytes for Unicode pivot buffer - error: %s.\n",
                        (int) uBufSize, u_errorName(status))));
        goto done;
    }

    ereport(DEBUG1,
            (errcode(ERRCODE_SUCCESSFUL_COMPLETION),
             errmsg("Original string: %s\n", (const char*) text_to_cstring(buffer))));

    /*
     * convert to Unicode
     * returns length of converted string, not counting NUL-terminator
     */
    uConvertedLen = ucnv_toUChars(conv,
                                  *uBuf,
                                  uBufSize,
                                  (const char*) text_to_cstring(buffer),
                                  STRING_IS_NULL_TERMINATED,
                                  &status
                                 );

    if (U_FAILURE(status))
    {
        ereport(WARNING,
                (errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION),
                 errmsg("ICU conversion from %s to Unicode failed - error: %s.\n",
                        encoding_cstr, u_errorName(status))));
    }
    else
    {
        /* add 1 for NUL terminator */
        *uBuf_len = uConvertedLen + 1;

        ereport(DEBUG1,
                (errcode(ERRCODE_SUCCESSFUL_COMPLETION),
                 errmsg("Converted string: %s\n", (const char*) *uBuf)));

        /*
         * see if any bytes were dropped
         * context struct will go away when the converter is closed
         */
        if (NULL == context)
            *dropped_bytes = false;
        else
            *dropped_bytes = context->flag;
    }

done:
    /* single exit: release the encoding name and the converter */
    if (NULL != encoding_cstr)
        pfree((void *) encoding_cstr);
    ucnv_close(conv);

    return status;
}
// Returns the minimum number of bytes per character reported by ICU for the
// converter behind `address`, or -1 when the address does not yield a
// converter.
static jint NativeConverter_getMinBytesPerChar(JNIEnv*, jclass, jlong address) {
    UConverter* converter = toUConverter(address);
    if (converter == NULL) {
        return -1;
    }
    return ucnv_getMinCharSize(converter);
}
/** Describe a character encoding
 *
 * Builds a named list with the friendly name, the ICU canonical name, one
 * name per available standard, and five properties: ASCII.subset,
 * Unicode.1to1, CharSize.8bit, CharSize.min and CharSize.max.
 * If the canonical name cannot be fetched, only the "Name.ICU" slot is
 * filled (with NA) and a warning is issued.
 *
 * @param enc either NULL or "" for default encoding,
 *    or one string with encoding name
 * @return R list object with many components (see R doc for details)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *
 * @version 0.2-1 (Marek Gagolewski)
 *          use StriUcnv; make StriException-friendly
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_enc_info(SEXP enc)
{
   const char* selected_enc
      = stri__prepare_arg_enc(enc, "enc", true/*default ok*/); /* this is R_alloc'ed */

   STRI__ERROR_HANDLER_BEGIN(0)
   StriUcnv uconv_obj(selected_enc);
   //uconv_obj.setCallBackSubstitute(); // restore default callbacks (no warning)
   UConverter* uconv = uconv_obj.getConverter(false);
   UErrorCode status = U_ZERO_ERROR;

   // the list of available standards
   vector<const char*> standards = StriUcnv::getStandards();
   R_len_t standards_n = (R_len_t)standards.size();

   // layout: [0] friendly name, [1] ICU name, [2..] one slot per standard,
   // then the five property slots at the very end
   const int nval = standards_n+2+5;
   const int pos_ascii  = nval-5;
   const int pos_1to1   = nval-4;
   const int pos_8bit   = nval-3;
   const int pos_minsz  = nval-2;
   const int pos_maxsz  = nval-1;

   SEXP vals;
   SEXP names;

   STRI__PROTECT(names = Rf_allocVector(STRSXP, nval));
   SET_STRING_ELT(names, 0, Rf_mkChar("Name.friendly"));
   SET_STRING_ELT(names, 1, Rf_mkChar("Name.ICU"));
   for (R_len_t k=0; k<standards_n; ++k) {
      if (!standards[k])
         continue;
      SET_STRING_ELT(names, k+2, Rf_mkChar((string("Name.")+standards[k]).c_str()));
   }
   SET_STRING_ELT(names, pos_ascii, Rf_mkChar("ASCII.subset"));
   SET_STRING_ELT(names, pos_1to1,  Rf_mkChar("Unicode.1to1"));
   SET_STRING_ELT(names, pos_8bit,  Rf_mkChar("CharSize.8bit"));
   SET_STRING_ELT(names, pos_minsz, Rf_mkChar("CharSize.min"));
   SET_STRING_ELT(names, pos_maxsz, Rf_mkChar("CharSize.max"));

   STRI__PROTECT(vals = Rf_allocVector(VECSXP, nval));

   // canonical (ICU) name
   status = U_ZERO_ERROR;
   const char* canname = ucnv_getName(uconv, &status);
   const bool canname_ok = !(U_FAILURE(status) || !canname);

   if (!canname_ok) {
      SET_VECTOR_ELT(vals, 1, Rf_ScalarString(NA_STRING));
      Rf_warning(MSG__ENC_ERROR_GETNAME);
   }
   else {
      SET_VECTOR_ELT(vals, 1, stri__make_character_vector_char_ptr(1, canname));

      // friendly name (NA when there is none)
      const char* frname = StriUcnv::getFriendlyName(canname);
      if (!frname)
         SET_VECTOR_ELT(vals, 0, Rf_ScalarString(NA_STRING));
      else
         SET_VECTOR_ELT(vals, 0, stri__make_character_vector_char_ptr(1, frname));

      // has ASCII as its subset?
      SET_VECTOR_ELT(vals, pos_ascii, Rf_ScalarLogical((int)uconv_obj.hasASCIIsubset()));

      // min,max character size, is 8bit?
      int mincharsize = (int)ucnv_getMinCharSize(uconv);
      int maxcharsize = (int)ucnv_getMaxCharSize(uconv);
      int is8bit = (mincharsize==1 && maxcharsize == 1);
      SET_VECTOR_ELT(vals, pos_8bit,  Rf_ScalarLogical(is8bit));
      SET_VECTOR_ELT(vals, pos_minsz, Rf_ScalarInteger(mincharsize));
      SET_VECTOR_ELT(vals, pos_maxsz, Rf_ScalarInteger(maxcharsize));

      // one-to-one correspondence with Unicode is only determined for
      // 8-bit encodings; otherwise it is reported as NA
      if (is8bit)
         SET_VECTOR_ELT(vals, pos_1to1, Rf_ScalarLogical((int)uconv_obj.is1to1Unicode()));
      else
         SET_VECTOR_ELT(vals, pos_1to1, Rf_ScalarLogical(NA_LOGICAL));

      // other standard names
      for (R_len_t k=0; k<standards_n; ++k) {
         if (!standards[k])
            continue;
         status = U_ZERO_ERROR;
         const char* stdname = ucnv_getStandardName(canname, standards[k], &status);
         if (!U_FAILURE(status) && stdname)
            SET_VECTOR_ELT(vals, k+2, stri__make_character_vector_char_ptr(1, stdname));
         else
            SET_VECTOR_ELT(vals, k+2, Rf_ScalarString(NA_STRING));
      }
   }
   Rf_setAttrib(vals, R_NamesSymbol, names);
   STRI__UNPROTECT_ALL
   return vals;
   STRI__ERROR_HANDLER_END({/* no special action on error */})
}
void charsetConverter_icu::convert (utility::inputStream& in, utility::outputStream& out, status* st) { UErrorCode err = U_ZERO_ERROR; ucnv_reset(m_from); ucnv_reset(m_to); if (st) new (st) status(); // From buffers byte_t cpInBuffer[16]; // stream data put here const size_t outSize = ucnv_getMinCharSize(m_from) * sizeof(cpInBuffer) * sizeof(UChar); std::vector <UChar> uOutBuffer(outSize); // Unicode chars end up here // To buffers // converted (char) data end up here const size_t cpOutBufferSz = ucnv_getMaxCharSize(m_to) * outSize; std::vector <char> cpOutBuffer(cpOutBufferSz); // Tell ICU what to do when encountering an illegal byte sequence if (m_options.silentlyReplaceInvalidSequences) { // Set replacement chars for when converting from Unicode to codepage icu::UnicodeString substString(m_options.invalidSequence.c_str()); ucnv_setSubstString(m_to, substString.getTerminatedBuffer(), -1, &err); if (U_FAILURE(err)) throw exceptions::charset_conv_error("[ICU] Error when setting substitution string."); } else { // Tell ICU top stop (and return an error) on illegal byte sequences ucnv_setToUCallBack (m_from, UCNV_TO_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err); if (U_FAILURE(err)) throw exceptions::charset_conv_error("[ICU] Error when setting ToU callback."); ucnv_setFromUCallBack (m_to, UCNV_FROM_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err); if (U_FAILURE(err)) throw exceptions::charset_conv_error("[ICU] Error when setting FromU callback."); } // Input data available while (!in.eof()) { // Read input data into buffer size_t inLength = in.read(cpInBuffer, sizeof(cpInBuffer)); // Beginning of read data const char* source = reinterpret_cast <const char*>(&cpInBuffer[0]); const char* sourceLimit = source + inLength; // end + 1 UBool flush = in.eof(); // is this last run? 
UErrorCode toErr; // Loop until all source has been processed do { // Set up target pointers UChar* target = &uOutBuffer[0]; UChar* targetLimit = &target[0] + outSize; toErr = U_ZERO_ERROR; ucnv_toUnicode(m_from, &target, targetLimit, &source, sourceLimit, NULL, flush, &toErr); if (st) st->inputBytesRead += (source - reinterpret_cast <const char*>(&cpInBuffer[0])); if (toErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(toErr)) { if (toErr == U_INVALID_CHAR_FOUND || toErr == U_TRUNCATED_CHAR_FOUND || toErr == U_ILLEGAL_CHAR_FOUND) { // Error will be thrown later (*) } else { throw exceptions::charset_conv_error("[ICU] Error converting to Unicode from " + m_source.getName()); } } // The Unicode source is the buffer just written and the limit // is where the previous conversion stopped (target is moved in the conversion) const UChar* uSource = &uOutBuffer[0]; UChar* uSourceLimit = &target[0]; UErrorCode fromErr; // Loop until converted chars are fully written do { char* cpTarget = &cpOutBuffer[0]; const char* cpTargetLimit = &cpOutBuffer[0] + cpOutBufferSz; fromErr = U_ZERO_ERROR; // Write converted bytes (Unicode) to destination codepage ucnv_fromUnicode(m_to, &cpTarget, cpTargetLimit, &uSource, uSourceLimit, NULL, flush, &fromErr); if (st) { // Decrement input bytes count by the number of input bytes in error char errBytes[16]; int8_t errBytesLen = sizeof(errBytes); UErrorCode errBytesErr = U_ZERO_ERROR; ucnv_getInvalidChars(m_from, errBytes, &errBytesLen, &errBytesErr); st->inputBytesRead -= errBytesLen; st->outputBytesWritten += cpTarget - &cpOutBuffer[0]; } // (*) If an error occurred while converting from input charset, throw it now if (toErr == U_INVALID_CHAR_FOUND || toErr == U_TRUNCATED_CHAR_FOUND || toErr == U_ILLEGAL_CHAR_FOUND) { throw exceptions::illegal_byte_sequence_for_charset(); } if (fromErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(fromErr)) { if (fromErr == U_INVALID_CHAR_FOUND || fromErr == U_TRUNCATED_CHAR_FOUND || fromErr == U_ILLEGAL_CHAR_FOUND) { 
throw exceptions::illegal_byte_sequence_for_charset(); } else { throw exceptions::charset_conv_error("[ICU] Error converting from Unicode to " + m_dest.getName()); } } // Write to destination stream out.write(&cpOutBuffer[0], (cpTarget - &cpOutBuffer[0])); } while (fromErr == U_BUFFER_OVERFLOW_ERROR); } while (toErr == U_BUFFER_OVERFLOW_ERROR); } }
void charsetFilteredOutputStream_icu::flush() { if (m_from == NULL || m_to == NULL) throw exceptions::charset_conv_error("Cannot initialize converters."); // Allocate buffer for Unicode chars const size_t uniSize = ucnv_getMinCharSize(m_from) * 1024 * sizeof(UChar); std::vector <UChar> uniBuffer(uniSize); // Conversion loop (with flushing) UErrorCode toErr = U_ZERO_ERROR; const char* uniSource = 0; const char* uniSourceLimit = 0; do { // Convert from source charset to Unicode UChar* uniTarget = &uniBuffer[0]; UChar* uniTargetLimit = &uniBuffer[0] + uniSize; toErr = U_ZERO_ERROR; ucnv_toUnicode(m_from, &uniTarget, uniTargetLimit, &uniSource, uniSourceLimit, NULL, /* flush */ TRUE, &toErr); if (U_FAILURE(toErr) && toErr != U_BUFFER_OVERFLOW_ERROR) { throw exceptions::charset_conv_error ("[ICU] Error converting to Unicode from '" + m_sourceCharset.getName() + "'."); } const size_t uniLength = uniTarget - &uniBuffer[0]; // Allocate buffer for destination charset const size_t cpSize = ucnv_getMinCharSize(m_to) * uniLength; std::vector <char> cpBuffer(cpSize); // Convert from Unicode to destination charset UErrorCode fromErr = U_ZERO_ERROR; const UChar* cpSource = &uniBuffer[0]; const UChar* cpSourceLimit = &uniBuffer[0] + uniLength; do { char* cpTarget = &cpBuffer[0]; char* cpTargetLimit = &cpBuffer[0] + cpSize; fromErr = U_ZERO_ERROR; ucnv_fromUnicode(m_to, &cpTarget, cpTargetLimit, &cpSource, cpSourceLimit, NULL, /* flush */ TRUE, &fromErr); if (fromErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(fromErr)) { throw exceptions::charset_conv_error ("[ICU] Error converting from Unicode to '" + m_destCharset.getName() + "'."); } const size_t cpLength = cpTarget - &cpBuffer[0]; // Write successfully converted bytes m_stream.write(&cpBuffer[0], cpLength); } while (fromErr == U_BUFFER_OVERFLOW_ERROR); } while (toErr == U_BUFFER_OVERFLOW_ERROR); m_stream.flush(); }
void charsetFilteredOutputStream_icu::writeImpl (const byte_t* const data, const size_t count) { if (m_from == NULL || m_to == NULL) throw exceptions::charset_conv_error("Cannot initialize converters."); // Allocate buffer for Unicode chars const size_t uniSize = ucnv_getMinCharSize(m_from) * count * sizeof(UChar); std::vector <UChar> uniBuffer(uniSize); // Conversion loop UErrorCode toErr = U_ZERO_ERROR; const char* uniSource = reinterpret_cast <const char*>(data); const char* uniSourceLimit = uniSource + count; do { // Convert from source charset to Unicode UChar* uniTarget = &uniBuffer[0]; UChar* uniTargetLimit = &uniBuffer[0] + uniSize; toErr = U_ZERO_ERROR; ucnv_toUnicode(m_from, &uniTarget, uniTargetLimit, &uniSource, uniSourceLimit, NULL, /* flush */ FALSE, &toErr); if (U_FAILURE(toErr) && toErr != U_BUFFER_OVERFLOW_ERROR) { if (toErr == U_INVALID_CHAR_FOUND || toErr == U_TRUNCATED_CHAR_FOUND || toErr == U_ILLEGAL_CHAR_FOUND) { throw exceptions::illegal_byte_sequence_for_charset(); } else { throw exceptions::charset_conv_error ("[ICU] Error converting to Unicode from '" + m_sourceCharset.getName() + "'."); } } const size_t uniLength = uniTarget - &uniBuffer[0]; // Allocate buffer for destination charset const size_t cpSize = ucnv_getMinCharSize(m_to) * uniLength; std::vector <char> cpBuffer(cpSize); // Convert from Unicode to destination charset UErrorCode fromErr = U_ZERO_ERROR; const UChar* cpSource = &uniBuffer[0]; const UChar* cpSourceLimit = &uniBuffer[0] + uniLength; do { char* cpTarget = &cpBuffer[0]; char* cpTargetLimit = &cpBuffer[0] + cpSize; fromErr = U_ZERO_ERROR; ucnv_fromUnicode(m_to, &cpTarget, cpTargetLimit, &cpSource, cpSourceLimit, NULL, /* flush */ FALSE, &fromErr); if (fromErr != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(fromErr)) { if (fromErr == U_INVALID_CHAR_FOUND || fromErr == U_TRUNCATED_CHAR_FOUND || fromErr == U_ILLEGAL_CHAR_FOUND) { throw exceptions::illegal_byte_sequence_for_charset(); } else { throw exceptions::charset_conv_error 
("[ICU] Error converting from Unicode to '" + m_destCharset.getName() + "'."); } } const size_t cpLength = cpTarget - &cpBuffer[0]; // Write successfully converted bytes m_stream.write(&cpBuffer[0], cpLength); } while (fromErr == U_BUFFER_OVERFLOW_ERROR); } while (toErr == U_BUFFER_OVERFLOW_ERROR); }
/*
 * Private helper used for buffering input.
 *
 * Moves any unread UChars to the front of f->fUCBuffer, reads more bytes
 * from f->fFile into f->fCharBuffer and converts them to UChars behind the
 * retained data, then resets f->fUCPos / f->fUCLimit to describe the result.
 */
void
ufile_fill_uchar_buffer(UFILE *f)
{
    UErrorCode convStatus;
    const char *rawPos;
    const char *rawEnd;
    UChar *writePos;
    int32_t ucharCapacity;
    int32_t byteQuota;
    int32_t gotBytes;
    int32_t spareUChars;
    int32_t leftover;
    int32_t perUCharCost;

    /* slide the unread UChars down to the start of the buffer */
    leftover = (int32_t)(f->fUCLimit - f->fUCPos);
    if (leftover != 0) {
        memmove(f->fUCBuffer, f->fUCPos, leftover * sizeof(UChar));
    }

    /* UChar slots still free after the retained data */
    spareUChars = UFILE_UCHARBUFFER_SIZE - leftover;

    /* Number of codepage bytes to request for those slots. */
    /* weiv: if converter is NULL, we use invariant converter with charwidth = 1 */
    perUCharCost = 1;
    if (f->fConverter != NULL) {
        perUCharCost = 2 * ucnv_getMinCharSize(f->fConverter);
    }
    byteQuota = spareUChars / perUCharCost;

    /* Read in the data to convert */
    gotBytes = (int32_t)fread(f->fCharBuffer,
                              sizeof(char),
                              ufmt_min(byteQuota, UFILE_CHARBUFFER_SIZE),
                              f->fFile);

    /* Set up conversion parameters */
    convStatus = U_ZERO_ERROR;
    rawPos = f->fCharBuffer;
    rawEnd = f->fCharBuffer + gotBytes;
    writePos = f->fUCBuffer + leftover;
    ucharCapacity = UFILE_UCHARBUFFER_SIZE;

    if (f->fConverter == NULL) {
        /* weiv: do the invariant conversion */
        u_charsToUChars(rawPos, writePos, gotBytes);
        writePos += gotBytes;
    }
    else {
        /* We have a valid converter: flush it once the file is at EOF */
        ucnv_toUnicode(f->fConverter,
                       &writePos,
                       f->fUCBuffer + ucharCapacity,
                       &rawPos,
                       rawEnd,
                       NULL,
                       (UBool)(feof(f->fFile) != 0),
                       &convStatus);
    }

    /* update the pointers into our array */
    f->fUCPos = f->fUCBuffer;
    f->fUCLimit = writePos;
}
// Returns the mean of the converter's maximum and minimum bytes per
// character for the converter behind `address`, or -1 when the address does
// not yield a converter.
static jfloat NativeConverter_getAveBytesPerChar(JNIEnv*, jclass, jlong address) {
    UConverter* converter = toUConverter(address);
    if (converter == NULL) {
        return -1;
    }
    return (ucnv_getMaxCharSize(converter) + ucnv_getMinCharSize(converter)) / 2.0;
}