/* Convert the bytes described by src from the character set of converter "from" to the character
 * set of converter "to" using ICU's ucnv_convertEx().
 *
 * from    - ICU converter for the input character set (ichset)
 * to      - ICU converter for the output character set (ochset)
 * src     - source string (address and length are read, never modified)
 * dstbuff - caller supplied output buffer, or NULL to have the output written at stringpool.free
 * bufflen - only used when dstbuff is non-NULL: on entry the capacity of dstbuff in bytes; when the
 *           buffer turns out to be too small it is overwritten with an estimate of the needed size
 *
 * Returns 0 for an empty source, -1 when a caller supplied buffer is too small, and otherwise the
 * number of bytes written to the output. Note that this function does not itself advance
 * stringpool.free.
 */
int gtm_conv(UConverter* from, UConverter* to, mstr *src, char* dstbuff, int* bufflen)
{
    char        *out_cur, *out_start, *out_limit, *in_cur, *in_limit;
    const char  *ichset;
    int         out_cap, in_chars, in_len;
    UErrorCode  status, status1;

    if (0 == src->len)
        return 0;
    if (NULL != dstbuff)
    {   /* Caller owns the output buffer and tells us how big it is */
        out_start = dstbuff;
        out_cap = *bufflen;
    } else
    {   /* Compute the stringpool buffer space needed for the conversion. ucnv_getMinCharSize() of the
         * input converter bounds the number of UChars the source can hold, and ucnv_getMaxCharSize()
         * of the output converter bounds the number of bytes each of those can expand to.
         */
        in_chars = (src->len / ucnv_getMinCharSize(from)) + 1;  /* number of UChar's from ichset */
        out_cap = UCNV_GET_MAX_BYTES_FOR_STRING(in_chars, ucnv_getMaxCharSize(to));
        if (out_cap > MAX_STRLEN)
            out_cap = MAX_STRLEN;
        ENSURE_STP_FREE_SPACE(out_cap);
        out_start = (char *)stringpool.free;
    }
    in_cur = src->addr;
    in_len = (int)src->len;
    in_limit = in_cur + in_len;
    out_cur = out_start;
    out_limit = out_cur + out_cap;
    status = U_ZERO_ERROR;      /* initialization to "success" is required by ICU */
    ucnv_convertEx(to, from, &out_cur, out_limit, (const char**)&in_cur, in_limit,
        NULL, NULL, NULL, NULL, TRUE, TRUE, &status);
    if (U_FAILURE(status))
    {
        if (U_BUFFER_OVERFLOW_ERROR == status)
        {
            if (NULL != dstbuff)
            {   /* Insufficient buffer passed. Return the required buffer length */
                in_chars = (in_len / ucnv_getMinCharSize(from)) + 1;
                *bufflen = UCNV_GET_MAX_BYTES_FOR_STRING(in_chars, ucnv_getMaxCharSize(to));
                return -1;
            } else
            {   /* translation requires more space than the maximum allowed GT.M string size */
                rts_error_csa(NULL, VARLSTCNT(1) ERR_MAXSTRLEN);
            }
        }
        /* Any other failure is reported as a bad character at the byte preceding the
         * position where ICU stopped reading the source.
         */
        status1 = U_ZERO_ERROR;
        ichset = ucnv_getName(from, &status1);
        assert(U_SUCCESS(status1));
        UTF8_BADCHAR(1,(unsigned char *) (in_cur - 1), NULL,STRLEN(ichset), ichset);
    }
    return (int) (out_cur - out_start);
}
/*
    Encodes the \a length UTF-16 code units starting at \a unicode with the ICU
    converter obtained from getConverter(\a state) and returns the encoded bytes.

    The output buffer starts at ICU's UCNV_GET_MAX_BYTES_FOR_STRING estimate and is
    doubled whenever ICU reports that it ran out of room. Any other ICU failure is
    logged and ends the conversion; the bytes produced so far are returned.

    When \a state is null the converter is closed before returning.
*/
QByteArray QIcuCodec::convertFromUnicode(const QChar *unicode, int length, QTextCodec::ConverterState *state) const
{
    UConverter *conv = getConverter(state);

    int requiredLength = UCNV_GET_MAX_BYTES_FOR_STRING(length, ucnv_getMaxCharSize(conv));
    QByteArray string(requiredLength, Qt::Uninitialized);

    const UChar *uc = (const UChar *)unicode;
    const UChar *end = uc + length;
    int convertedChars = 0;
    while (1) {
        // Re-derive the pointers on every pass: resize() may have moved the data.
        char *ch = (char *)string.data();
        char *chEnd = ch + string.length();
        ch += convertedChars;

        UErrorCode error = U_ZERO_ERROR;
        ucnv_fromUnicode(conv, &ch, chEnd, &uc, end, 0, false, &error);
        if (!U_SUCCESS(error))
            qDebug() << "convertFromUnicode failed:" << u_errorName(error);

        convertedChars = ch - string.data();
        if (uc >= end)
            break;

        // Only a full output buffer is cured by growing it. After any other
        // failure ICU does not consume more input, so retrying would just keep
        // doubling the buffer without ever reaching the end of the source.
        if (U_FAILURE(error) && error != U_BUFFER_OVERFLOW_ERROR)
            break;

        string.resize(string.length()*2);
    }
    string.resize(convertedChars);

    if (!state)
        ucnv_close(conv);
    return string;
}
/**
 * Converts SourceLen UTF-16 code units of Source, starting at SourceStartIndex, with
 * ICUConverter and stores the result, null terminated, in Destination.
 *
 * Destination's own character array is used as the output buffer for the conversion.
 * If Source is empty, Destination is emptied instead.
 */
void FStringConverter::ConvertString(const icu::UnicodeString& Source, const int32 SourceStartIndex, const int32 SourceLen, FString& Destination)
{
	if (Source.length() > 0)
	{
		UErrorCode ICUStatus = U_ZERO_ERROR;

		ucnv_reset(ICUConverter);

		// Get the internal buffer of the string, we're going to use it as scratch space
		TArray<TCHAR>& InternalStringBuffer = Destination.GetCharArray();

		// Work out the maximum size required and resize the buffer so it can hold enough data
		const int32_t DestinationCapacityBytes = UCNV_GET_MAX_BYTES_FOR_STRING(SourceLen, ucnv_getMaxCharSize(ICUConverter));
		const int32 DestinationCapacityTCHARs = DestinationCapacityBytes / sizeof(TCHAR);
		InternalStringBuffer.SetNumUninitialized(DestinationCapacityTCHARs);

		// Perform the conversion into the string buffer
		const int32_t DestinationSizeBytes = ucnv_fromUChars(ICUConverter, reinterpret_cast<char*>(InternalStringBuffer.GetData()), DestinationCapacityBytes, Source.getBuffer() + SourceStartIndex, SourceLen, &ICUStatus);

		// Validate the conversion before trusting the returned size: on failure the reported
		// length is not guaranteed to lie within the buffer we allocated, and it is about to
		// be used as an index into that buffer.
		check(U_SUCCESS(ICUStatus));

		// Null terminate the FString and size it back down to the correct size
		const int32 DestinationSizeTCHARs = DestinationSizeBytes / sizeof(TCHAR);
		InternalStringBuffer[DestinationSizeTCHARs] = 0;
		InternalStringBuffer.SetNum(DestinationSizeTCHARs + 1, /*bAllowShrinking*/false); // the array size includes null
	}
	else
	{
		Destination.Empty();
	}
}
/**
 * Convert character vector between marked encodings and the encoding provided
 *
 * Each string is read as UTF-16 (via StriContainerUTF16) and then encoded
 * with the target converter. One output buffer, sized for the longest input
 * string, is shared by all elements.
 *
 * @param str     input character vector or list of raw vectors
 * @param to    target encoding, \code{NULL} or \code{""} for default enc
 * @param to_raw single logical, should list of raw vectors be returned?
 * @return a converted character vector or list of raw vectors;
 *    missing inputs yield \code{NULL} elements (raw output) or \code{NA} strings
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-11-12)
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-28)
 *          use StriUcnv
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-01)
 *          calc required buf size a priori
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_encode_from_marked(SEXP str, SEXP to, SEXP to_raw)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   const char* selected_to   = stri__prepare_arg_enc(to, "to", true); /* this is R_alloc'ed */
   bool to_raw_logical = stri__prepare_arg_logical_1_notNA(to_raw, "to_raw");

   STRI__ERROR_HANDLER_BEGIN(1)
   R_len_t str_n = LENGTH(str);
   StriContainerUTF16 str_cont(str, str_n);

   // get the number of strings to convert; if == 0, then you know what's the result
   if (str_n <= 0) {
      // str is still on the protection stack here; release it before
      // leaving so that this early exit is balanced like the normal one
      STRI__UNPROTECT_ALL
      return Rf_allocVector(to_raw_logical?VECSXP:STRSXP, 0);
   }

   // Open converters
   StriUcnv ucnv(selected_to);
   UConverter* uconv_to = ucnv.getConverter(true /*register_callbacks*/);

   // Get target encoding mark
   cetype_t encmark_to = to_raw_logical?CE_BYTES:ucnv.getCE();

   // Prepare out val
   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(to_raw_logical?VECSXP:STRSXP, str_n));

   // calculate required buf size: start from the longest non-missing input
   R_len_t bufsize = 0;
   for (R_len_t i=0; i<str_n; ++i) {
      if (!str_cont.isNA(i) && str_cont.get(i).length() > bufsize)
         bufsize = str_cont.get(i).length();
   }
   bufsize = UCNV_GET_MAX_BYTES_FOR_STRING(bufsize, ucnv_getMaxCharSize(uconv_to));
   // "The calculated size is guaranteed to be sufficient for this conversion."
   String8buf buf(bufsize);

   for (R_len_t i=0; i<str_n; ++i) {
      if (str_cont.isNA(i)) {
         if (to_raw_logical) SET_VECTOR_ELT(ret, i, R_NilValue);
         else                SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      R_len_t curn_tmp = str_cont.get(i).length();
      const UChar* curs_tmp = str_cont.get(i).getBuffer(); // The buffer content is (probably) not NUL-terminated.
      if (!curs_tmp)
         throw StriException(MSG__INTERNAL_ERROR);

      UErrorCode status = U_ZERO_ERROR;
      ucnv_resetFromUnicode(uconv_to);
      R_len_t bufneed = ucnv_fromUChars(uconv_to, buf.data(), buf.size(),
            curs_tmp, curn_tmp, &status);
      if (bufneed <= buf.size()) {
         STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
      }
      else {// larger buffer needed [this shouldn't happen?]
         buf.resize(bufneed, false/*destroy contents*/);
         status = U_ZERO_ERROR;
         bufneed = ucnv_fromUChars(uconv_to, buf.data(), buf.size(),
               curs_tmp, curn_tmp, &status);
         STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
      }

      if (to_raw_logical) {
         SEXP outobj;
         STRI__PROTECT(outobj = Rf_allocVector(RAWSXP, bufneed));
         memcpy(RAW(outobj), buf.data(), (size_t)bufneed);
         SET_VECTOR_ELT(ret, i, outobj);
         STRI__UNPROTECT(1);
      }
      else {
         SET_STRING_ELT(ret, i,
            Rf_mkCharLenCE(buf.data(), bufneed, encmark_to));
      }
   }

   STRI__UNPROTECT_ALL
   return ret;

   STRI__ERROR_HANDLER_END({/* nothing special on error */})
}
/**
 * Convert character vector between given encodings
 *
 * Each input is decoded from the source encoding to UTF-16 and then encoded
 * with the target converter. If no source encoding is selected and
 * \code{str} is an atomic vector, the call is forwarded to
 * \code{stri_encode_from_marked}.
 *
 * @param str     input character vector or list of raw vectors
 * @param from  source encoding, \code{NULL} or \code{""} for default enc
 * @param to    target encoding, \code{NULL} or \code{""} for default enc
 * @param to_raw single logical, should list of raw vectors be returned?
 * @return a converted character vector or list of raw vectors;
 *    missing inputs yield \code{NULL} elements (raw output) or \code{NA} strings
 *
 * @version 0.1 (Marek Gagolewski)
 * @version 0.2 (Marek Gagolewski) arg to_raw_added, encoding marking
 * @version 0.3 (Marek Gagolewski, 2013-06-16) make StriException-friendly
 * @version 0.4 (Marek Gagolewski, 2013-08-08) use StriContainerListRaw
 * @version 0.5 (Marek Gagolewski, 2013-11-20) BUGFIX call stri_encode_from_marked if necessary
 */
SEXP stri_encode(SEXP str, SEXP from, SEXP to, SEXP to_raw)
{
   const char* selected_from = stri__prepare_arg_enc(from, "from", true);
   if (!selected_from && Rf_isVectorAtomic(str))
      return stri_encode_from_marked(str, to, to_raw);

   // The prepared argument may be a new object; keep it alive across the
   // allocations below. Both normal return paths UNPROTECT it again.
   PROTECT(str = stri_prepare_arg_list_raw(str, "str"));
   const char* selected_to   = stri__prepare_arg_enc(to, "to", true);
   bool to_raw_logical = stri__prepare_arg_logical_1_notNA(to_raw, "to_raw");

   UConverter* uconv_from = NULL;
   UConverter* uconv_to = NULL;

   STRI__ERROR_HANDLER_BEGIN
   StriContainerListRaw str_cont(str);
   R_len_t str_n = str_cont.get_n();

   // get the number of strings to convert; if == 0, then you know what's the result
   if (str_n <= 0) {
      UNPROTECT(1); // str
      return Rf_allocVector(to_raw_logical?VECSXP:STRSXP, 0);
   }

   // Open converters
   uconv_from = stri__ucnv_open(selected_from);
   uconv_to = stri__ucnv_open(selected_to);

   // Get target encoding mark
   UErrorCode err = U_ZERO_ERROR;
   const char* uconv_to_name = ucnv_getName(uconv_to, &err);
   if (U_FAILURE(err))
      throw StriException(err);
   cetype_t encmark_to = CE_BYTES; // all other cases than the below ones
      // - bytes enc (this is reasonable, isn't it?)
   if (!to_raw_logical) { // otherwise not needed
      if (!strcmp(uconv_to_name, "US-ASCII") || !strcmp(uconv_to_name, "UTF-8"))
         encmark_to = CE_UTF8; // no CE for ASCII, will be auto-detected by mkCharLenCE
      else if (!strcmp(uconv_to_name, "ISO-8859-1"))
         encmark_to = CE_LATIN1;
      else if (!strcmp(uconv_to_name, ucnv_getDefaultName()))
         encmark_to = CE_NATIVE;
   }

   // Prepare out val
   SEXP ret;
   PROTECT(ret = Rf_allocVector(to_raw_logical?VECSXP:STRSXP, str_n));

   String8 buf(0); // will be extended in a moment

   for (R_len_t i=0; i<str_n; ++i) {
      if (str_cont.isNA(i)) {
         if (to_raw_logical) SET_VECTOR_ELT(ret, i, R_NilValue);
         else                SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      const char* curd = str_cont.get(i).c_str();
      R_len_t curn     = str_cont.get(i).length();

      err = U_ZERO_ERROR;
      UnicodeString encs(curd, curn, uconv_from, err); // FROM -> UTF-16 [this is the slow part]
      if (U_FAILURE(err))
         throw StriException(err);

      R_len_t curn_tmp = encs.length();
      const UChar* curs_tmp = encs.getBuffer(); // The buffer contents is (probably) not NUL-terminated.
      if (!curs_tmp)
         throw StriException(MSG__INTERNAL_ERROR);

      R_len_t bufneed = UCNV_GET_MAX_BYTES_FOR_STRING(curn_tmp, ucnv_getMaxCharSize(uconv_to));
      // "The calculated size is guaranteed to be sufficient for this conversion."
      buf.resize(bufneed);

      err = U_ZERO_ERROR;
      // bufneed = encs.extract(buf.data(), buf.size(), uconv_to, err); // UTF-16 -> TO
      ucnv_resetFromUnicode(uconv_to);
      bufneed = ucnv_fromUChars(uconv_to, buf.data(), buf.size(),
            curs_tmp, curn_tmp, &err);
      if (bufneed <= buf.size()) {
         if (U_FAILURE(err))
            throw StriException(err);
      }
      else {// larger buffer needed [this shouldn't happen?]
         buf.resize(bufneed);
         err = U_ZERO_ERROR;
         bufneed = ucnv_fromUChars(uconv_to, buf.data(), buf.size(),
               curs_tmp, curn_tmp, &err);
         if (U_FAILURE(err))
            throw StriException(err);
         if (bufneed > buf.size())
            throw StriException(MSG__INTERNAL_ERROR);
      }

      if (to_raw_logical) {
         // nothing allocates between creating outobj and storing it in ret
         SEXP outobj = Rf_allocVector(RAWSXP, bufneed);
         memcpy(RAW(outobj), buf.data(), (size_t)bufneed);
         SET_VECTOR_ELT(ret, i, outobj);
      }
      else {
         SET_STRING_ELT(ret, i,
            Rf_mkCharLenCE(buf.data(), bufneed, encmark_to));
      }
   }

   if (uconv_from) {
      ucnv_close(uconv_from);
      uconv_from = NULL;
   }
   if (uconv_to) {
      ucnv_close(uconv_to);
      uconv_to = NULL;
   }

   UNPROTECT(2); // str, ret
   return ret;

   STRI__ERROR_HANDLER_END({
      if (uconv_from)
         ucnv_close(uconv_from);
      if (uconv_to)
         ucnv_close(uconv_to);
   })
}
/* Function wrapper around ICU's UCNV_GET_MAX_BYTES_FOR_STRING macro: returns the macro's
 * value for a source of src_length units and the maximum character size reported by
 * ucnv_getMaxCharSize() for cnv.
 */
int __get_max_bytes_for_string(UConverter *cnv, int src_length)
{
    const int max_char_size = ucnv_getMaxCharSize(cnv);

    return UCNV_GET_MAX_BYTES_FOR_STRING(src_length, max_char_size);
}