/** * Convert character vector between given encodings * * @param str input character/raw vector or list of raw vectors * @param from source encoding, \code{NULL} or \code{""} for default enc * @param to target encoding, \code{NULL} or \code{""} for default enc * @param to_raw single logical, should list of raw vectors be returned? * @return a converted character vector or list of raw vectors * * @version 0.1-?? (Marek Gagolewski) * * @version 0.1-?? (Marek Gagolewski) * arg to_raw_added, encoding marking * * @version 0.1-?? (Marek Gagolewski, 2013-06-16) * make StriException-friendly * * @version 0.1-?? (Marek Gagolewski, 2013-08-08) * use StriContainerListRaw * * @version 0.1-?? (Marek Gagolewski, 2013-11-20) * BUGFIX call stri_encode_from_marked if necessary * * @version 0.2-1 (Marek Gagolewski, 2014-03-28) * use StriUcnv * * @version 0.2-1 (Marek Gagolewski, 2014-04-01) * estimate required buf size a priori * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc */ SEXP stri_encode(SEXP str, SEXP from, SEXP to, SEXP to_raw) { const char* selected_from = stri__prepare_arg_enc(from, "from", true); /* this is R_alloc'ed */ if (!selected_from && Rf_isVectorAtomic(str)) return stri_encode_from_marked(str, to, to_raw); const char* selected_to = stri__prepare_arg_enc(to, "to", true); /* this is R_alloc'ed */ bool to_raw_logical = stri__prepare_arg_logical_1_notNA(to_raw, "to_raw"); // raw vector, character vector, or list of raw vectors: PROTECT(str = stri_prepare_arg_list_raw(str, "str")); STRI__ERROR_HANDLER_BEGIN(1) StriContainerListRaw str_cont(str); R_len_t str_n = str_cont.get_n(); // get the number of strings to convert; if == 0, then you know what's the result if (str_n <= 0) { STRI__UNPROTECT_ALL return Rf_allocVector(to_raw_logical?VECSXP:STRSXP, 0); }
/**
 * Change the current (default) ICU character encoding
 *
 * An error is raised when the requested charset is not available.
 *
 * @param enc new charset (single string)
 * @return nothing (\code{R_NilValue})
 *
 * @version 0.1-?? (Marek Gagolewski)
 *
 * @version 0.2-1 (Marek Gagolewski)
 *          use StriUcnv; make StriException-friendly
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_enc_set(SEXP enc)
{
   // asking for "the default encoding" is not allowed in this function
   const char* enc_name =
      stri__prepare_arg_enc(enc, "enc", false/*no default*/); /* this is R_alloc'ed */

   STRI__ERROR_HANDLER_BEGIN(0)
   StriUcnv converter(enc_name);

   // an unsupported encoding name results in an error at this point
   UConverter* handle = converter.getConverter();

   // ask ICU for the "official" name of the encoding
   UErrorCode icu_status = U_ZERO_ERROR;
   const char* official_name = ucnv_getName(handle, &icu_status);
   STRI__CHECKICUSTATUS_THROW(icu_status, {/* do nothing special on err */})

   // from now on this is the default encoding
   ucnv_setDefaultName(official_name);
   return R_NilValue;
   STRI__ERROR_HANDLER_END({/* no special action on error */})
}
/**
 * Convert character vector between marked encodings and the encoding provided
 *
 * Each input string is obtained as UTF-16 (via StriContainerUTF16) and then
 * converted to the target encoding with a single ICU converter. Missing
 * values are passed through as \code{NA_STRING} (character output) or
 * \code{NULL} (raw output).
 *
 * @param str     input character vector or list of raw vectors
 * @param to    target encoding, \code{NULL} or \code{""} for default enc
 * @param to_raw single logical, should list of raw vectors be returned?
 * @return a converted character vector or list of raw vectors
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-11-12)
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-28)
 *          use StriUcnv
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-01)
 *          calc required buf size a priori
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_encode_from_marked(SEXP str, SEXP to, SEXP to_raw)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   const char* selected_to   = stri__prepare_arg_enc(to, "to", true); /* this is R_alloc'ed */
   bool to_raw_logical = stri__prepare_arg_logical_1_notNA(to_raw, "to_raw");

   STRI__ERROR_HANDLER_BEGIN(1)
   R_len_t str_n = LENGTH(str);
   StriContainerUTF16 str_cont(str, str_n);

   // get the number of strings to convert; if == 0, then you know what's the result
   if (str_n <= 0) {
      // `str` is still on the protection stack: release it before leaving,
      // otherwise the early return leaves the stack unbalanced
      STRI__UNPROTECT_ALL
      return Rf_allocVector(to_raw_logical?VECSXP:STRSXP, 0);
   }

   // Open converters
   StriUcnv ucnv(selected_to);
   UConverter* uconv_to = ucnv.getConverter(true /*register_callbacks*/);

   // Get target encoding mark
   cetype_t encmark_to = to_raw_logical?CE_BYTES:ucnv.getCE();

   // Prepare out val
   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(to_raw_logical?VECSXP:STRSXP, str_n));

   // calculate required buf size: find the longest (in UTF-16 code units)
   // non-missing input string and derive the worst-case output size from it
   R_len_t bufsize = 0;
   for (R_len_t i=0; i<str_n; ++i) {
      if (!str_cont.isNA(i) && str_cont.get(i).length() > bufsize)
         bufsize = str_cont.get(i).length();
   }
   bufsize = UCNV_GET_MAX_BYTES_FOR_STRING(bufsize, ucnv_getMaxCharSize(uconv_to));
   // "The calculated size is guaranteed to be sufficient for this conversion."
   String8buf buf(bufsize);

   for (R_len_t i=0; i<str_n; ++i) {
      if (str_cont.isNA(i)) {
         if (to_raw_logical) SET_VECTOR_ELT(ret, i, R_NilValue);
         else                SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      R_len_t curn_tmp = str_cont.get(i).length();
      const UChar* curs_tmp = str_cont.get(i).getBuffer(); // The buffer content is (probably) not NUL-terminated.
      if (!curs_tmp)
         throw StriException(MSG__INTERNAL_ERROR);

      // UTF-16 -> TO
      UErrorCode status = U_ZERO_ERROR;
      ucnv_resetFromUnicode(uconv_to);
      R_len_t bufneed = ucnv_fromUChars(uconv_to, buf.data(), buf.size(),
            curs_tmp, curn_tmp, &status);
      if (bufneed <= buf.size()) {
         STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
      }
      else {// larger buffer needed [this shouldn't happen?]
         buf.resize(bufneed, false/*destroy contents*/);
         status = U_ZERO_ERROR;
         bufneed = ucnv_fromUChars(uconv_to, buf.data(), buf.size(),
               curs_tmp, curn_tmp, &status);
         STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
         // never copy more bytes than the buffer actually holds
         if (bufneed > buf.size())
            throw StriException(MSG__INTERNAL_ERROR);
      }

      if (to_raw_logical) {
         SEXP outobj;
         STRI__PROTECT(outobj = Rf_allocVector(RAWSXP, bufneed));
         memcpy(RAW(outobj), buf.data(), (size_t)bufneed);
         SET_VECTOR_ELT(ret, i, outobj);
         STRI__UNPROTECT(1);
      }
      else {
         SET_STRING_ELT(ret, i,
            Rf_mkCharLenCE(buf.data(), bufneed, encmark_to));
      }
   }

   STRI__UNPROTECT_ALL
   return ret;

   STRI__ERROR_HANDLER_END({/* nothing special on error */})
}
/**
 * Convert character vector between given encodings
 *
 * Each input element is first decoded from the source encoding to UTF-16
 * and then encoded to the target encoding. If no source encoding is given
 * and \code{str} is an atomic vector, the call is delegated to
 * \code{stri_encode_from_marked}.
 *
 * @param str     input character vector or list of raw vectors
 * @param from  source encoding, \code{NULL} or \code{""} for default enc
 * @param to    target encoding, \code{NULL} or \code{""} for default enc
 * @param to_raw single logical, should list of raw vectors be returned?
 * @return a converted character vector or list of raw vectors
 *
 * @version 0.1 (Marek Gagolewski)
 * @version 0.2 (Marek Gagolewski) arg to_raw_added, encoding marking
 * @version 0.3 (Marek Gagolewski, 2013-06-16) make StriException-friendly
 * @version 0.4 (Marek Gagolewski, 2013-08-08) use StriContainerListRaw
 * @version 0.5 (Marek Gagolewski, 2013-11-20) BUGFIX call stri_encode_from_marked if necessary
 */
SEXP stri_encode(SEXP str, SEXP from, SEXP to, SEXP to_raw)
{
   const char* selected_from = stri__prepare_arg_enc(from, "from", true);
   if (!selected_from && Rf_isVectorAtomic(str))
      return stri_encode_from_marked(str, to, to_raw);

   // The prepared object may be freshly allocated; the argument-preparing
   // calls below can allocate as well, so keep it safe from the gc.
   PROTECT(str = stri_prepare_arg_list_raw(str, "str"));
   const char* selected_to   = stri__prepare_arg_enc(to, "to", true);
   bool to_raw_logical = stri__prepare_arg_logical_1_notNA(to_raw, "to_raw");

   UConverter* uconv_from = NULL;
   UConverter* uconv_to = NULL;

   // NOTE(review): on the exception path the handler below is presumed to
   // raise an R error, which resets the protection stack -- confirm.
   STRI__ERROR_HANDLER_BEGIN
   StriContainerListRaw str_cont(str);
   R_len_t str_n = str_cont.get_n();

   // get the number of strings to convert; if == 0, then you know what's the result
   if (str_n <= 0) {
      UNPROTECT(1); // str
      return Rf_allocVector(to_raw_logical?VECSXP:STRSXP, 0);
   }

   // Open converters
   uconv_from = stri__ucnv_open(selected_from);
   uconv_to   = stri__ucnv_open(selected_to);

   // Get target encoding mark
   UErrorCode err = U_ZERO_ERROR;
   const char* uconv_to_name = ucnv_getName(uconv_to, &err);
   if (U_FAILURE(err))
      throw StriException(err);
   cetype_t encmark_to = CE_BYTES; // all other cases than the below ones
      // - bytes enc (this is reasonable, isn't it?)
   if (!to_raw_logical) { // otherwise not needed
      if (!strcmp(uconv_to_name, "US-ASCII") || !strcmp(uconv_to_name, "UTF-8"))
         encmark_to = CE_UTF8; // no CE for ASCII, will be auto-detected by mkCharLenCE
      else if (!strcmp(uconv_to_name, "ISO-8859-1"))
         encmark_to = CE_LATIN1;
      else if (!strcmp(uconv_to_name, ucnv_getDefaultName()))
         encmark_to = CE_NATIVE;
   }

   // Prepare out val
   SEXP ret;
   PROTECT(ret = Rf_allocVector(to_raw_logical?VECSXP:STRSXP, str_n));

   String8 buf(0); // will be extended in a moment

   for (R_len_t i=0; i<str_n; ++i) {
      if (str_cont.isNA(i)) {
         if (to_raw_logical) SET_VECTOR_ELT(ret, i, R_NilValue);
         else                SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      const char* curd = str_cont.get(i).c_str();
      R_len_t curn     = str_cont.get(i).length();

      err = U_ZERO_ERROR;
      UnicodeString encs(curd, curn, uconv_from, err); // FROM -> UTF-16 [this is the slow part]
      if (U_FAILURE(err))
         throw StriException(err);

      R_len_t curn_tmp = encs.length();
      const UChar* curs_tmp = encs.getBuffer(); // The buffer contents is (probably) not NUL-terminated.
      if (!curs_tmp)
         throw StriException(MSG__INTERNAL_ERROR);

      R_len_t bufneed = UCNV_GET_MAX_BYTES_FOR_STRING(curn_tmp, ucnv_getMaxCharSize(uconv_to));
      // "The calculated size is guaranteed to be sufficient for this conversion."
      buf.resize(bufneed);

      // UTF-16 -> TO
      err = U_ZERO_ERROR;
      ucnv_resetFromUnicode(uconv_to);
      bufneed = ucnv_fromUChars(uconv_to, buf.data(), buf.size(),
            curs_tmp, curn_tmp, &err);
      if (bufneed <= buf.size()) {
         if (U_FAILURE(err))
            throw StriException(err);
      }
      else {// larger buffer needed [this shouldn't happen?]
         buf.resize(bufneed);
         err = U_ZERO_ERROR;
         bufneed = ucnv_fromUChars(uconv_to, buf.data(), buf.size(),
               curs_tmp, curn_tmp, &err);
         if (U_FAILURE(err))
            throw StriException(err);
         if (bufneed > buf.size())
            throw StriException(MSG__INTERNAL_ERROR);
      }

      if (to_raw_logical) {
         // no allocation happens between creating outobj and storing it
         // in the (protected) result list
         SEXP outobj = Rf_allocVector(RAWSXP, bufneed);
         memcpy(RAW(outobj), buf.data(), (size_t)bufneed);
         SET_VECTOR_ELT(ret, i, outobj);
      }
      else {
         SET_STRING_ELT(ret, i,
            Rf_mkCharLenCE(buf.data(), bufneed, encmark_to));
      }
   }

   if (uconv_from) {
      ucnv_close(uconv_from);
      uconv_from = NULL;
   }
   if (uconv_to) {
      ucnv_close(uconv_to);
      uconv_to = NULL;
   }

   UNPROTECT(2); // ret, str
   return ret;

   STRI__ERROR_HANDLER_END({
      if (uconv_from)
         ucnv_close(uconv_from);
      if (uconv_to)
         ucnv_close(uconv_to);
   })
}
/** Gather information on a character encoding
 *
 * The result is a named list laid out as follows: friendly name, ICU
 * (canonical) name, one name per known standard, and five trailing
 * properties (ASCII subset, 1:1 Unicode mapping, 8-bit flag, min and max
 * character size).
 *
 * @param enc either NULL or "" for default encoding,
 *        or one string with encoding name
 * @return R list object with many components (see R doc for details)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *
 * @version 0.2-1 (Marek Gagolewski)
 *          use StriUcnv; make StriException-friendly
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_enc_info(SEXP enc)
{
   const char* enc_name =
      stri__prepare_arg_enc(enc, "enc", true/*default ok*/); /* this is R_alloc'ed */

   STRI__ERROR_HANDLER_BEGIN(0)
   StriUcnv converter(enc_name);
   //converter.setCallBackSubstitute(); // restore default callbacks (no warning)
   UConverter* handle = converter.getConverter(false);

   // all standards for which an encoding name may be reported
   vector<const char*> std_ids = StriUcnv::getStandards();
   const R_len_t n_std = (R_len_t)std_ids.size();

   // slots: 2 leading names, n_std standard names, 5 trailing properties
   const int n_slots     = n_std+2+5;
   const int slot_ascii  = n_slots-5;
   const int slot_1to1   = n_slots-4;
   const int slot_8bit   = n_slots-3;
   const int slot_min    = n_slots-2;
   const int slot_max    = n_slots-1;

   // --- element names ---
   SEXP vals;
   SEXP names;
   STRI__PROTECT(names = Rf_allocVector(STRSXP, n_slots));
   SET_STRING_ELT(names, 0, Rf_mkChar("Name.friendly"));
   SET_STRING_ELT(names, 1, Rf_mkChar("Name.ICU"));
   for (R_len_t j=0; j<n_std; ++j) {
      if (!std_ids[j]) continue;
      string label = string("Name.")+std_ids[j];
      SET_STRING_ELT(names, j+2, Rf_mkChar(label.c_str()));
   }
   static const char* const trailing[5] = {
      "ASCII.subset", "Unicode.1to1", "CharSize.8bit", "CharSize.min", "CharSize.max"
   };
   for (int k=0; k<5; ++k)
      SET_STRING_ELT(names, slot_ascii+k, Rf_mkChar(trailing[k]));

   // --- element values ---
   STRI__PROTECT(vals = Rf_allocVector(VECSXP, n_slots));

   // canonical (ICU) name; everything else depends on it
   UErrorCode icu_status = U_ZERO_ERROR;
   const char* icu_name = ucnv_getName(handle, &icu_status);

   if (!U_FAILURE(icu_status) && icu_name) {
      SET_VECTOR_ELT(vals, 1, stri__make_character_vector_char_ptr(1, icu_name));

      // friendly name (may be unavailable)
      const char* friendly = StriUcnv::getFriendlyName(icu_name);
      SET_VECTOR_ELT(vals, 0, friendly
         ? stri__make_character_vector_char_ptr(1, friendly)
         : Rf_ScalarString(NA_STRING));

      // does the encoding extend ASCII?
      SET_VECTOR_ELT(vals, slot_ascii, Rf_ScalarLogical((int)converter.hasASCIIsubset()));

      // character sizes; 8-bit means every character takes exactly one byte
      const int size_min = (int)ucnv_getMinCharSize(handle);
      const int size_max = (int)ucnv_getMaxCharSize(handle);
      const int single_byte = (size_min==1 && size_max == 1);
      SET_VECTOR_ELT(vals, slot_8bit, Rf_ScalarLogical(single_byte));
      SET_VECTOR_ELT(vals, slot_min,  Rf_ScalarInteger(size_min));
      SET_VECTOR_ELT(vals, slot_max,  Rf_ScalarInteger(size_max));

      // 1:1 mapping with Unicode is only determined for 8-bit encodings
      SET_VECTOR_ELT(vals, slot_1to1, Rf_ScalarLogical(
         single_byte ? (int)converter.is1to1Unicode() : NA_LOGICAL));

      // names of this encoding according to each standard
      for (R_len_t j=0; j<n_std; ++j) {
         if (std_ids[j]) {
            icu_status = U_ZERO_ERROR;
            const char* std_name = ucnv_getStandardName(icu_name, std_ids[j], &icu_status);
            if (!U_FAILURE(icu_status) && std_name)
               SET_VECTOR_ELT(vals, j+2, stri__make_character_vector_char_ptr(1, std_name));
            else
               SET_VECTOR_ELT(vals, j+2, Rf_ScalarString(NA_STRING));
         }
      }
   }
   else {
      // canonical name could not be determined: report NA and warn
      SET_VECTOR_ELT(vals, 1, Rf_ScalarString(NA_STRING));
      Rf_warning(MSG__ENC_ERROR_GETNAME);
   }

   Rf_setAttrib(vals, R_NamesSymbol, names);
   STRI__UNPROTECT_ALL
   return vals;
   STRI__ERROR_HANDLER_END({/* no special action on error */})
}