/** * Sets current (default) ICU locale * * @param loc new locale (a single character string) * @return nothing (\code{R_NilValue}) * * @version 0.1-?? (Marek Gagolewski) */ SEXP stri_locale_set(SEXP loc) { const char* qloc = stri__prepare_arg_locale(loc, "locale", false); /* this is R_alloc'ed */ UErrorCode status = U_ZERO_ERROR; uloc_setDefault(qloc, &status); STRI__CHECKICUSTATUS_RFERROR(status, {/* do nothing special on err */}) // error() allowed here return R_NilValue;
/** List Localizable Date-Time Formatting Data * * @param locale single string or NULL * @param context single string * @param width single string * @return list * * @version 0.5-1 (Marek Gagolewski, 2014-12-25) * * @version 0.5-1 (Marek Gagolewski, 2015-01-01) * use calendar keyword in locale */ SEXP stri_datetime_symbols(SEXP locale, SEXP context, SEXP width) { const char* qloc = stri__prepare_arg_locale(locale, "locale", true); /* this is R_alloc'ed */ const char* context_str = stri__prepare_arg_string_1_notNA(context, "context"); const char* context_opts[] = {"format", "standalone", NULL}; int context_cur = stri__match_arg(context_str, context_opts); const char* width_str = stri__prepare_arg_string_1_notNA(width, "width"); const char* width_opts[] = {"abbreviated", "wide", "narrow", NULL}; int width_cur = stri__match_arg(width_str, width_opts); DateFormatSymbols::DtContextType context_val; if (context_cur == 0) context_val = DateFormatSymbols::FORMAT; else if (context_cur == 1) context_val = DateFormatSymbols::STANDALONE; else Rf_error(MSG__INCORRECT_MATCH_OPTION, "context"); DateFormatSymbols::DtWidthType width_val; if (width_cur == 0) width_val = DateFormatSymbols::ABBREVIATED; else if (width_cur == 1) width_val = DateFormatSymbols::WIDE; else if (width_cur == 2) width_val = DateFormatSymbols::NARROW; else Rf_error(MSG__INCORRECT_MATCH_OPTION, "width"); UErrorCode status = U_ZERO_ERROR; String8buf calendar_type(128); Locale loc = Locale::createFromName(qloc); int32_t kvlen = loc.getKeywordValue("calendar", calendar_type.data(), calendar_type.size(), status); STRI__CHECKICUSTATUS_RFERROR(status, {/* do nothing special on err */}) status = U_ZERO_ERROR; DateFormatSymbols sym(status); status = U_ZERO_ERROR; if (kvlen == 0) sym = DateFormatSymbols(loc, status); else sym = DateFormatSymbols(loc, calendar_type.data(), status); STRI__CHECKICUSTATUS_RFERROR(status, {/* do nothing special on err */})
/**
 * Get localized time zone info
 *
 * @param tz single string or NULL
 * @param locale single string or NULL
 * @param display_type single string; one of "short", "long",
 *    "generic_short", "generic_long", "gmt_short", "gmt_long",
 *    "common", "generic_location"
 * @return list with 6 elements named ID, Name, Name.Daylight,
 *    Name.Windows, RawOffset (in hours), and UsesDaylightTime
 *
 * @version 0.5-1 (Marek Gagolewski, 2014-12-24)
 *
 * @version 0.5-1 (Marek Gagolewski, 2015-03-01)
 *    new out: WindowsID, NameDaylight, new in: display_type
 */
SEXP stri_timezone_info(SEXP tz, SEXP locale, SEXP display_type) {
   // Validate all the "plain" arguments BEFORE the TimeZone object is created:
   // Rf_error() performs a long jump, hence a TimeZone allocated earlier
   // would never be deleted if any of the checks below failed.
   const char* qloc = stri__prepare_arg_locale(locale, "locale", true); /* this is R_alloc'ed */
   const char* dtype_str = stri__prepare_arg_string_1_notNA(display_type, "display_type"); /* this is R_alloc'ed */
   const char* dtype_opts[] = {
      "short", "long", "generic_short", "generic_long", "gmt_short", "gmt_long",
      "common", "generic_location", NULL};
   int dtype_cur = stri__match_arg(dtype_str, dtype_opts);

   TimeZone::EDisplayType dtype;
   switch (dtype_cur) {
      case 0:  dtype = TimeZone::SHORT; break;
      case 1:  dtype = TimeZone::LONG; break;
      case 2:  dtype = TimeZone::SHORT_GENERIC; break;
      case 3:  dtype = TimeZone::LONG_GENERIC; break;
      case 4:  dtype = TimeZone::SHORT_GMT; break;
      case 5:  dtype = TimeZone::LONG_GMT; break;
      case 6:  dtype = TimeZone::SHORT_COMMONLY_USED; break;
      case 7:  dtype = TimeZone::GENERIC_LOCATION; break;
      default:
         Rf_error(MSG__INCORRECT_MATCH_OPTION, "display_type");
         break;
   }

   // This object is owned by us and must be deleted before returning.
   // NOTE(review): the flag was formerly given as R_NilValue (a non-null
   // pointer, i.e., `true` after conversion to bool); the other callers in this
   // file pass `true/*allowdefault*/` -- confirm against the declaration.
   TimeZone* curtz = stri__prepare_arg_timezone(tz, "tz", true/*allowdefault*/);

   const R_len_t infosize = 6;
   SEXP vals;
   PROTECT(vals = Rf_allocVector(VECSXP, infosize));
   for (int i=0; i<infosize; ++i)
      SET_VECTOR_ELT(vals, i, R_NilValue);

   R_len_t curidx = -1;

   // [0] ID
   ++curidx;
   UnicodeString val_ID;
   curtz->getID(val_ID);
   SET_VECTOR_ELT(vals, curidx, stri__make_character_vector_UnicodeString_ptr(1, &val_ID));

   // [1] Name: display name for standard (non-daylight) time
   ++curidx;
   UnicodeString val_name;
   curtz->getDisplayName(false, dtype, Locale::createFromName(qloc), val_name);
   SET_VECTOR_ELT(vals, curidx, stri__make_character_vector_UnicodeString_ptr(1, &val_name));

   // [2] Name.Daylight: NA if the time zone does not use daylight time
   const bool uses_daylight = (bool)curtz->useDaylightTime();
   ++curidx;
   if (uses_daylight) {
      UnicodeString val_name2;
      curtz->getDisplayName(true, dtype, Locale::createFromName(qloc), val_name2);
      SET_VECTOR_ELT(vals, curidx, stri__make_character_vector_UnicodeString_ptr(1, &val_name2));
   }
   else
      SET_VECTOR_ELT(vals, curidx, Rf_ScalarString(NA_STRING));

   // [3] Name.Windows: NA if unavailable (ICU < 52, lookup error, or empty result)
   ++curidx;
   UnicodeString val_windows;
   UErrorCode status = U_ZERO_ERROR;
#if U_ICU_VERSION_MAJOR_NUM>=52
   TimeZone::getWindowsID(val_ID, val_windows, status); // Stable since ICU 52
#endif
   if (U_SUCCESS(status) && val_windows.length() > 0)
      SET_VECTOR_ELT(vals, curidx, stri__make_character_vector_UnicodeString_ptr(1, &val_windows));
   else
      SET_VECTOR_ELT(vals, curidx, Rf_ScalarString(NA_STRING));

   // [4] RawOffset: getRawOffset() is divided by 1000 and by 3600 (ms -> hours)
   ++curidx;
   SET_VECTOR_ELT(vals, curidx, Rf_ScalarReal(curtz->getRawOffset()/1000.0/3600.0));

   // [5] UsesDaylightTime
   ++curidx;
   SET_VECTOR_ELT(vals, curidx, Rf_ScalarLogical(uses_daylight));

   delete curtz;
   stri__set_names(vals, infosize, "ID", "Name", "Name.Daylight", "Name.Windows", "RawOffset", "UsesDaylightTime");
   UNPROTECT(1);
   return vals;
}
/** * Convert case (upper, lowercase) * * * @param str character vector * @param locale single string identifying * the locale ("" or NULL for default locale) * @return character vector * * * @version 0.1-?? (Marek Gagolewski) * * @version 0.1-?? (Marek Gagolewski) * use StriContainerUTF16 * * @version 0.1-?? (Marek Gagolewski, 2013-06-16) * make StriException-friendly * * @version 0.1-?? (Marek Gagolewski, 2013-11-19) * use UCaseMap + StriContainerUTF8 * **THIS DOES NOT WORK WITH ICU 4.8**, we have to revert the changes * ** BTW, since stringi_0.1-25 we require ICU>=50 ** * * @version 0.2-1 (Marek Gagolewski, 2014-03-18) * use UCaseMap + StriContainerUTF8 * (this is much faster for UTF-8 and slightly faster for 8bit enc) * Estimates minimal buffer size. * * @version 0.3-1 (Marek Gagolewski, 2014-10-24) * Use a custom BreakIterator with stri_trans_totitle * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.4-1 (Marek Gagolewski, 2014-12-03) * use StriUBreakIterator * * @version 0.6-1 (Marek Gagolewski, 2015-07-11) * now this is an internal function */ SEXP stri_trans_casemap(SEXP str, int _type, SEXP locale) { if (_type < 1 || _type > 2) Rf_error(MSG__INCORRECT_INTERNAL_ARG); const char* qloc = stri__prepare_arg_locale(locale, "locale", true); /* this is R_alloc'ed */ PROTECT(str = stri_prepare_arg_string(str, "str")); // prepare string argument // version 0.2-1 - Does not work with ICU 4.8 (but we require ICU >= 50) UCaseMap* ucasemap = NULL; STRI__ERROR_HANDLER_BEGIN(1) UErrorCode status = U_ZERO_ERROR; ucasemap = ucasemap_open(qloc, U_FOLD_CASE_DEFAULT, &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) R_len_t str_n = LENGTH(str); StriContainerUTF8 str_cont(str, str_n); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, str_n)); // STEP 1. 
// Estimate the required buffer length // Notice: The resulting number of codepoints may be larger or smaller than // the number before casefolding R_len_t bufsize = str_cont.getMaxNumBytes(); bufsize += 10; // a small margin String8buf buf(bufsize); // STEP 2. // Do case folding for (R_len_t i = str_cont.vectorize_init(); i != str_cont.vectorize_end(); i = str_cont.vectorize_next(i)) { if (str_cont.isNA(i)) { SET_STRING_ELT(ret, i, NA_STRING); continue; } R_len_t str_cur_n = str_cont.get(i).length(); const char* str_cur_s = str_cont.get(i).c_str(); status = U_ZERO_ERROR; int buf_need; if (_type == 1) buf_need = ucasemap_utf8ToLower(ucasemap, buf.data(), buf.size(), (const char*)str_cur_s, str_cur_n, &status); else buf_need = ucasemap_utf8ToUpper(ucasemap, buf.data(), buf.size(), (const char*)str_cur_s, str_cur_n, &status); if (U_FAILURE(status)) { /* retry */ buf.resize(buf_need, false/*destroy contents*/); status = U_ZERO_ERROR; if (_type == 1) buf_need = ucasemap_utf8ToLower(ucasemap, buf.data(), buf.size(), (const char*)str_cur_s, str_cur_n, &status); else buf_need = ucasemap_utf8ToUpper(ucasemap, buf.data(), buf.size(), (const char*)str_cur_s, str_cur_n, &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) // this shouldn't happen // we do have the buffer size required to complete this op } SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), buf_need, CE_UTF8)); } if (ucasemap) { ucasemap_close(ucasemap); ucasemap = NULL;} STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({ if (ucasemap) { ucasemap_close(ucasemap); ucasemap = NULL; } }) }
/**
 * Date-time arithmetic
 *
 * Adds `value` units of the requested kind to each element of `time`,
 * using an ICU Calendar created for the given locale and time zone.
 *
 * @param time date-time vector (converted with stri_prepare_arg_POSIXct)
 * @param value integer vector
 * @param units single string; one of "years", "months", "weeks", "days",
 *    "hours", "minutes", "seconds", "milliseconds"
 * @param tz single string or NULL
 * @param locale single string or NULL
 *
 * @return numeric vector with the POSIXct class set; if `tz` is not NULL,
 *    it is stored in the "tzone" attribute
 *
 * @version 0.5-1 (Marek Gagolewski, 2014-12-30)
 * @version 0.5-1 (Marek Gagolewski, 2015-03-06) tz arg added
 */
SEXP stri_datetime_add(SEXP time, SEXP value, SEXP units, SEXP tz, SEXP locale) {
   PROTECT(time = stri_prepare_arg_POSIXct(time, "time"));
   PROTECT(value = stri_prepare_arg_integer(value, "value"));

   // tz is PROTECTed in either case: it is needed to set the tzone attrib
   if (!isNull(tz))
      tz = stri_prepare_arg_string_1(tz, "tz");
   PROTECT(tz);

   R_len_t vectorize_length = stri__recycling_rule(true, 2, LENGTH(time), LENGTH(value));

   const char* units_val = stri__prepare_arg_string_1_notNA(units, "units");
   const char* units_opts[] = {"years", "months", "weeks", "days", "hours",
      "minutes", "seconds", "milliseconds", NULL};
   int units_cur = stri__match_arg(units_val, units_opts);

   const char* locale_val = stri__prepare_arg_locale(locale, "locale", true);
   TimeZone* tz_val = stri__prepare_arg_timezone(tz, "tz", true/*allowdefault*/);
   Calendar* cal = NULL;

   STRI__ERROR_HANDLER_BEGIN(3)
   StriContainerDouble time_cont(time, vectorize_length);
   StriContainerInteger value_cont(value, vectorize_length);

   // calendar fields, listed in the very same order as units_opts above
   const UCalendarDateFields fields_by_unit[] = {
      UCAL_YEAR, UCAL_MONTH, UCAL_WEEK_OF_YEAR, UCAL_DAY_OF_MONTH,
      UCAL_HOUR_OF_DAY, UCAL_MINUTE, UCAL_SECOND, UCAL_MILLISECOND
   };
   const int fields_by_unit_n = (int)(sizeof(fields_by_unit)/sizeof(fields_by_unit[0]));
   if (units_cur < 0 || units_cur >= fields_by_unit_n)
      throw StriException(MSG__INCORRECT_MATCH_OPTION, "units");
   const UCalendarDateFields units_field = fields_by_unit[units_cur];

   UErrorCode status = U_ZERO_ERROR;
   cal = Calendar::createInstance(locale_val, status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

   cal->adoptTimeZone(tz_val);
   tz_val = NULL; /* The Calendar takes ownership of the TimeZone. */

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(REALSXP, vectorize_length));
   double* out = REAL(ret);

   for (R_len_t i = 0; i < vectorize_length; ++i) {
      const bool is_missing = time_cont.isNA(i) || value_cont.isNA(i);
      if (is_missing) {
         out[i] = NA_REAL;
         continue;
      }

      // input is multiplied by 1000 before it is passed to ICU as a UDate
      status = U_ZERO_ERROR;
      cal->setTime((UDate)(time_cont.get(i)*1000.0), status);
      STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

      status = U_ZERO_ERROR;
      cal->add(units_field, value_cont.get(i), status);
      STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

      // ... and the resulting UDate is divided by 1000 on the way back
      status = U_ZERO_ERROR;
      out[i] = ((double)cal->getTime(status))/1000.0;
      STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
   }

   if (!isNull(tz))
      Rf_setAttrib(ret, Rf_ScalarString(Rf_mkChar("tzone")), tz);
   stri__set_class_POSIXct(ret);

   if (tz_val) {
      delete tz_val;
      tz_val = NULL;
   }
   if (cal) {
      delete cal;
      cal = NULL;
   }
   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END({
      if (tz_val) {
         delete tz_val;
         tz_val = NULL;
      }
      if (cal) {
         delete cal;
         cal = NULL;
      }
   })
}
/** * Get values of date-time fields * * @param time * @param locale * @param tz * * @return list * * @version 0.5-1 (Marek Gagolewski, 2015-01-01) * @version 0.5-1 (Marek Gagolewski, 2015-03-03) tz arg added */ SEXP stri_datetime_fields(SEXP time, SEXP tz, SEXP locale) { PROTECT(time = stri_prepare_arg_POSIXct(time, "time")); const char* locale_val = stri__prepare_arg_locale(locale, "locale", true); if (!isNull(tz)) PROTECT(tz = stri_prepare_arg_string_1(tz, "tz")); else PROTECT(tz); /* needed to set tzone attrib */ TimeZone* tz_val = stri__prepare_arg_timezone(tz, "tz", true/*allowdefault*/); Calendar* cal = NULL; STRI__ERROR_HANDLER_BEGIN(2) R_len_t vectorize_length = LENGTH(time); StriContainerDouble time_cont(time, vectorize_length); UErrorCode status = U_ZERO_ERROR; cal = Calendar::createInstance(locale_val, status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) cal->adoptTimeZone(tz_val); tz_val = NULL; /* The Calendar takes ownership of the TimeZone. */ SEXP ret; #define STRI__FIELDS_NUM 14 STRI__PROTECT(ret = Rf_allocVector(VECSXP, STRI__FIELDS_NUM)); for (R_len_t j=0; j<STRI__FIELDS_NUM; ++j) SET_VECTOR_ELT(ret, j, Rf_allocVector(INTSXP, vectorize_length)); for (R_len_t i=0; i<vectorize_length; ++i) { if (time_cont.isNA(i)) { for (R_len_t j=0; j<STRI__FIELDS_NUM; ++j) INTEGER(VECTOR_ELT(ret, j))[i] = NA_INTEGER; continue; } status = U_ZERO_ERROR; cal->setTime((UDate)(time_cont.get(i)*1000.0), status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) for (R_len_t j=0; j<STRI__FIELDS_NUM; ++j) { UCalendarDateFields units_field; switch (j) { case 0: units_field = UCAL_EXTENDED_YEAR; break; case 1: units_field = UCAL_MONTH; break; case 2: units_field = UCAL_DAY_OF_MONTH; break; case 3: units_field = UCAL_HOUR_OF_DAY; break; case 4: units_field = UCAL_MINUTE; break; case 5: units_field = UCAL_SECOND; break; case 6: units_field = UCAL_MILLISECOND; break; case 7: units_field = UCAL_WEEK_OF_YEAR; break; case 8: 
units_field = UCAL_WEEK_OF_MONTH; break; case 9: units_field = UCAL_DAY_OF_YEAR; break; case 10: units_field = UCAL_DAY_OF_WEEK; break; case 11: units_field = UCAL_HOUR; break; case 12: units_field = UCAL_AM_PM; break; case 13: units_field = UCAL_ERA; break; default: throw StriException(MSG__INCORRECT_MATCH_OPTION, "units"); } //UCAL_IS_LEAP_MONTH //UCAL_MILLISECONDS_IN_DAY -> SecondsInDay // UCAL_AM_PM -> "AM" or "PM" (localized? or factor?+index in stri_datetime_symbols) add arg use_symbols???? // UCAL_DAY_OF_WEEK -> (localized? or factor?) SUNDAY, MONDAY // UCAL_DAY_OF_YEAR ' // isWekend status = U_ZERO_ERROR; INTEGER(VECTOR_ELT(ret, j))[i] = cal->get(units_field, status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) if (units_field == UCAL_MONTH) ++INTEGER(VECTOR_ELT(ret, j))[i]; // month + 1 else if (units_field == UCAL_AM_PM) ++INTEGER(VECTOR_ELT(ret, j))[i]; // ampm + 1 else if (units_field == UCAL_ERA) ++INTEGER(VECTOR_ELT(ret, j))[i]; // era + 1 }
/**
 * Word wrap text
 *
 * Each input string is split at the boundaries reported by an ICU line
 * BreakIterator and the resulting "words" are distributed into lines
 * by either the greedy (cost_exponent <= 0) or the dynamic algorithm.
 *
 * @param str character vector
 * @param width single integer (values <= 0 are treated as 0)
 * @param cost_exponent single double
 * @param indent single integer (must not be negative)
 * @param exdent single integer (must not be negative)
 * @param prefix single string
 * @param initial single string
 * @param whitespace_only single logical value; if TRUE, an interior break
 *    position is accepted only if it is preceded by a White_Space code point
 * @param use_length single logical value; if TRUE, code point counts
 *    are used instead of widths
 * @param locale locale identifier or NULL for default locale
 *
 * @return list
 *
 * @version 0.1-?? (Bartek Tartanus)
 *
 * @version 0.2-2 (Marek Gagolewski, 2014-04-27)
 *          single function for wrap_greedy and wrap_dynamic
 *          (dispatch inside);
 *          use BreakIterator
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-06)
 *    new args: indent, exdent, prefix, initial
 *
 * @version 0.5-1 (Marek Gagolewski, 2014-12-19)
 *    #133 allow width <= 0
 *
 * @version 0.5-1 (Marek Gagolewski, 2015-02-28)
 *    don't trim so many white spaces at the end of each word (normalize arg does that)
 *    #139: allow a "whitespace" break iterator
 *
 * @version 0.5-1 (Marek Gagolewski, 2015-04-23)
 *    `use_length` arg added
 *
 * @version 0.5-1 (Marek Gagolewski, 2015-06-09)
 *    BIGSKIP: no more CHARSXP on out on "" input
 */
SEXP stri_wrap(SEXP str, SEXP width, SEXP cost_exponent,
   SEXP indent, SEXP exdent, SEXP prefix, SEXP initial, SEXP whitespace_only,
   SEXP use_length, SEXP locale)
{
   bool use_length_val      = stri__prepare_arg_logical_1_notNA(use_length, "use_length");
   double exponent_val      = stri__prepare_arg_double_1_notNA(cost_exponent, "cost_exponent");
   bool whitespace_only_val = stri__prepare_arg_logical_1_notNA(whitespace_only, "whitespace_only");

   int width_val = stri__prepare_arg_integer_1_notNA(width, "width");
   if (width_val <= 0) width_val = 0;

   int indent_val = stri__prepare_arg_integer_1_notNA(indent, "indent");
   if (indent_val < 0) Rf_error(MSG__EXPECTED_POSITIVE, "indent");

   int exdent_val = stri__prepare_arg_integer_1_notNA(exdent, "exdent");
   if (exdent_val < 0) Rf_error(MSG__EXPECTED_POSITIVE, "exdent");

   const char* qloc = stri__prepare_arg_locale(locale, "locale", true); /* this is R_alloc'ed */

   PROTECT(str     = stri_prepare_arg_string(str, "str"));
   PROTECT(prefix  = stri_prepare_arg_string_1(prefix, "prefix"));
   PROTECT(initial = stri_prepare_arg_string_1(initial, "initial"));

   BreakIterator* briter = NULL;
   UText* str_text = NULL;

   STRI__ERROR_HANDLER_BEGIN(3)
   // The Locale object is created only here, i.e., after all the argument
   // checks above: Rf_error() performs a long jump, which would skip
   // the destructor of a C++ object constructed before those checks.
   Locale loc = Locale::createFromName(qloc);

   UErrorCode status = U_ZERO_ERROR;
   briter = BreakIterator::createLineInstance(loc, status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

   R_len_t str_length = LENGTH(str);
   StriContainerUTF8_indexable str_cont(str, str_length);
   StriContainerUTF8 prefix_cont(prefix, 1);
   StriContainerUTF8 initial_cont(initial, 1);

   // prepare indent/exdent/prefix/initial stuff:
   // 1st line, 1st para (i==0, u==0): initial+indent
   // nth line, 1st para (i==0, u> 0): prefix +exdent
   // 1st line, nth para (i> 0, u==0): prefix +indent
   // nth line, nth para (i> 0, u> 0): prefix +exdent
   StriWrapLineStart ii(initial_cont.get(0), indent_val);
   StriWrapLineStart pi(prefix_cont.get(0), indent_val);
   StriWrapLineStart pe(prefix_cont.get(0), exdent_val);

   status = U_ZERO_ERROR;
   // Unicode Newline Guidelines - Unicode Technical Report #13
   UnicodeSet uset_linebreaks(UnicodeString::fromUTF8("[\\u000A-\\u000D\\u0085\\u2028\\u2029]"), status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
   uset_linebreaks.freeze();

   status = U_ZERO_ERROR;
   UnicodeSet uset_whitespaces(UnicodeString::fromUTF8("\\p{White_space}"), status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
   uset_whitespaces.freeze();

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(VECSXP, str_length));
   for (R_len_t i = 0; i < str_length; ++i)
   {
      if (str_cont.isNA(i) || prefix_cont.isNA(0) || initial_cont.isNA(0)) {
         SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1));
         continue;
      }

      status = U_ZERO_ERROR;
      const char* str_cur_s = str_cont.get(i).c_str();
      R_len_t str_cur_n = str_cont.get(i).length();
      str_text = utext_openUTF8(str_text, str_cur_s, str_cur_n, &status);
      STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

      status = U_ZERO_ERROR;
      briter->setText(str_text, status);
      STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

      // all right, first let's generate a list of places at which we may do line breaks
      std::deque< R_len_t > occurrences_list; // this could be an R_len_t queue
      R_len_t match = briter->first();
      while (match != BreakIterator::DONE) {
         if (!whitespace_only_val)
            occurrences_list.push_back(match);
         else {
            if (match > 0 && match < str_cur_n) {
               // an interior boundary: keep it only if the code point
               // right before it is a white space
               UChar32 c;
               U8_GET((const uint8_t*)str_cur_s, 0, match-1, str_cur_n, c);
               if (uset_whitespaces.contains(c))
                  occurrences_list.push_back(match);
            }
            else // the boundaries at 0 and at str_cur_n are always kept
               occurrences_list.push_back(match);
         }
         match = briter->next();
      }

      R_len_t noccurrences = (R_len_t)occurrences_list.size(); // number of boundaries
      if (noccurrences <= 1) { // no match (1 boundary == 0)
         SET_VECTOR_ELT(ret, i, Rf_ScalarString(str_cont.toR(i)));
         continue;
      }

      // the number of "words" is:
      R_len_t nwords = noccurrences - 1;

      // convert occurrences_list to a vector
      // in order to obtain end positions (in a string) of each "words",
      // noting that occurrences_list.at(0) == 0
#ifndef NDEBUG
      if (occurrences_list.at(0) != 0)
         throw StriException("NDEBUG: stri_wrap: (occurrences_list.at(0) != 0)");
#endif

      std::vector<R_len_t> end_pos_orig(nwords);
      std::deque<R_len_t>::iterator iter = ++(occurrences_list.begin());
      for (R_len_t j = 0; iter != occurrences_list.end(); ++iter, ++j) {
         end_pos_orig[j] = (*iter); // this is a UTF-8 index
      }

      // now:
      // we'll get the total widths/number of code points in each "word"
      std::vector<R_len_t> widths_orig(nwords);
      // we'll get the total widths/number of code points without trailing whitespaces
      std::vector<R_len_t> widths_trim(nwords);
      // we'll get the end positions without trailing whitespaces
      std::vector<R_len_t> end_pos_trim(nwords);
      // detect line endings (fail on a match)
      UChar32 c = 0;
      R_len_t j = 0;
      R_len_t cur_block = 0;
      R_len_t cur_width_orig = 0;
      R_len_t cur_width_trim = 0;
      R_len_t cur_count_orig = 0;
      R_len_t cur_count_trim = 0;
      R_len_t cur_end_pos_trim = 0;
      while (j < str_cur_n) {
         R_len_t jlast = j;
         U8_NEXT(str_cur_s, j, str_cur_n, c);
         if (c < 0) // invalid utf-8 sequence
            throw StriException(MSG__INVALID_UTF8);

         if (uset_linebreaks.contains(c))
            throw StriException(MSG__NEWLINE_FOUND);

         cur_width_orig += stri__width_char(c);
         ++cur_count_orig;
         if (uset_whitespaces.contains(c)) {
            // OLD: trim all white spaces from the end:
            // ++cur_count_trim;
            // [we have the normalize arg for that]

            // NEW: trim just one white space at the end:
            cur_width_trim = stri__width_char(c);
            cur_count_trim = 1;
            cur_end_pos_trim = jlast;
         }
         else {
            cur_width_trim = 0;
            cur_count_trim = 0;
            cur_end_pos_trim = j;
         }

         if (j >= str_cur_n || end_pos_orig[cur_block] <= j) {
            // we'll start a new block in a moment
            if (use_length_val) {
               widths_orig[cur_block] = cur_count_orig;
               widths_trim[cur_block] = cur_count_orig-cur_count_trim;
            }
            else {
               widths_orig[cur_block] = cur_width_orig;
               widths_trim[cur_block] = cur_width_orig-cur_width_trim;
            }
            end_pos_trim[cur_block] = cur_end_pos_trim;
            cur_block++;
            cur_width_orig = 0;
            cur_width_trim = 0;
            cur_count_orig = 0;
            cur_count_trim = 0;
            cur_end_pos_trim = j;
         }
      }

      // do wrap
      std::deque<R_len_t> wrap_after; // wrap line after which word in {0..nwords-1}?
      if (exponent_val <= 0.0) {
         stri__wrap_greedy(wrap_after, nwords, width_val,
            widths_orig, widths_trim,
            (use_length_val)?((i==0)?ii.count:pi.count):((i==0)?ii.width:pi.width),
            (use_length_val)?pe.count:pe.width);
      }
      else {
         stri__wrap_dynamic(wrap_after, nwords, width_val, exponent_val,
            widths_orig, widths_trim,
            (use_length_val)?((i==0)?ii.count:pi.count):((i==0)?ii.width:pi.width),
            (use_length_val)?pe.count:pe.width);
      }

      // wrap_after.size() line breaks => wrap_after.size()+1 lines
      R_len_t nlines = (R_len_t)wrap_after.size()+1;
      R_len_t last_pos = 0;
      SEXP ans;
      STRI__PROTECT(ans = Rf_allocVector(STRSXP, nlines));
      std::deque<R_len_t>::iterator iter_wrap = wrap_after.begin();
      for (R_len_t u = 0; iter_wrap != wrap_after.end(); ++iter_wrap, ++u) {
         R_len_t wrap_after_cur = *iter_wrap;
         R_len_t cur_pos = end_pos_trim[wrap_after_cur];

         // line start: see the initial/prefix/indent/exdent table above
         std::string cs;
         if (i == 0 && u == 0)     cs = ii.str;
         else if (i > 0 && u == 0) cs = pi.str;
         else                      cs = pe.str;
         cs.append(str_cur_s+last_pos, cur_pos-last_pos);
         SET_STRING_ELT(ans, u, Rf_mkCharLenCE(cs.c_str(), cs.size(), CE_UTF8));

         // the next line starts right after the (untrimmed) end of this word
         last_pos = end_pos_orig[wrap_after_cur];
      }

      // last line goes here:
      std::string cs;
      if (i == 0 && nlines-1 == 0)     cs = ii.str;
      else if (i > 0 && nlines-1 == 0) cs = pi.str;
      else                             cs = pe.str;
      cs.append(str_cur_s+last_pos, end_pos_trim[nwords-1]-last_pos);
      SET_STRING_ELT(ans, nlines-1, Rf_mkCharLenCE(cs.c_str(), cs.size(), CE_UTF8));

      SET_VECTOR_ELT(ret, i, ans);
      STRI__UNPROTECT(1);
   }

   if (briter) { delete briter; briter = NULL; }
   if (str_text) { utext_close(str_text); str_text = NULL; }
   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END({
      if (briter) { delete briter; briter = NULL; }
      if (str_text) { utext_close(str_text); str_text = NULL; }
   })
}
/**
 * Format date-time objects
 *
 * @param time date-time vector (converted with stri_prepare_arg_POSIXct)
 * @param format single string; either one of the predefined style names
 *    ("date_full", "time_short", "datetime_relative_medium", etc. --
 *    see format_opts below) or a pattern passed to SimpleDateFormat
 * @param tz single string or NULL
 * @param locale single string or NULL
 *
 * @return character vector
 *
 * @version 0.5-1 (Marek Gagolewski, 2015-01-05)
 *
 * @version 0.5-1 (Marek Gagolewski, 2015-02-22)
 *    use tz
 */
SEXP stri_datetime_format(SEXP time, SEXP format, SEXP tz, SEXP locale) {
   PROTECT(time = stri_prepare_arg_POSIXct(time, "time"));
   const char* locale_val = stri__prepare_arg_locale(locale, "locale", true);
   const char* format_val = stri__prepare_arg_string_1_notNA(format, "format");

   // "format" may be one of:
   // (index%8 selects the style, index/8 selects date, time, or date+time)
   const char* format_opts[] = {
      "date_full", "date_long", "date_medium", "date_short",
      "date_relative_full", "date_relative_long", "date_relative_medium", "date_relative_short",
      "time_full", "time_long", "time_medium", "time_short",
      "time_relative_full", "time_relative_long", "time_relative_medium", "time_relative_short",
      "datetime_full", "datetime_long", "datetime_medium", "datetime_short",
      "datetime_relative_full", "datetime_relative_long", "datetime_relative_medium", "datetime_relative_short",
      NULL};
   int format_cur = stri__match_arg(format_val, format_opts);

   TimeZone* tz_val = stri__prepare_arg_timezone(tz, "tz", true/*allowdefault*/);
   Calendar* cal = NULL;
   DateFormat* fmt = NULL;

   STRI__ERROR_HANDLER_BEGIN(1)
   R_len_t vectorize_length = LENGTH(time);
   StriContainerDouble time_cont(time, vectorize_length);
   UnicodeString format_str(format_val);

   UErrorCode status = U_ZERO_ERROR;
   if (format_cur >= 0) {
      // a predefined style name was given
      DateFormat::EStyle style = DateFormat::kNone;
      switch (format_cur % 8) {
         case 0:  style = DateFormat::kFull; break;
         case 1:  style = DateFormat::kLong; break;
         case 2:  style = DateFormat::kMedium; break;
         case 3:  style = DateFormat::kShort; break;
         case 4:  style = DateFormat::kFullRelative; break;
         case 5:  style = DateFormat::kLongRelative; break;
         case 6:  style = DateFormat::kMediumRelative; break;
         case 7:  style = DateFormat::kShortRelative; break;
         default: style = DateFormat::kNone; break;
      }

      /* ICU 54.1: Relative time styles are not currently supported. */
      // (hence the kRelative bit is cleared in every time style below)
      switch (format_cur / 8) {
         case 0:
            fmt = DateFormat::createDateInstance(style, Locale::createFromName(locale_val));
            break;

         case 1:
            fmt = DateFormat::createTimeInstance(
               (DateFormat::EStyle)(style & ~DateFormat::kRelative),
               Locale::createFromName(locale_val));
            break;

         case 2:
            fmt = DateFormat::createDateTimeInstance(style,
               (DateFormat::EStyle)(style & ~DateFormat::kRelative),
               Locale::createFromName(locale_val));
            break;

         default:
            fmt = NULL;
            break;
      }
   }
   else // otherwise "format" is used as a SimpleDateFormat pattern
      fmt = new SimpleDateFormat(format_str, Locale::createFromName(locale_val), status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

   // The factory calls above do not report failures via `status`:
   // make sure that we will not dereference a NULL pointer in the loop below.
   if (!fmt)
      throw StriException("stri_datetime_format: could not create a DateFormat instance");

   status = U_ZERO_ERROR;
   cal = Calendar::createInstance(locale_val, status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

   cal->adoptTimeZone(tz_val);
   tz_val = NULL; /* The Calendar takes ownership of the TimeZone. */

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, vectorize_length));
   for (R_len_t i=0; i<vectorize_length; ++i) {
      if (time_cont.isNA(i)) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      // input is multiplied by 1000 before it is passed to ICU as a UDate
      status = U_ZERO_ERROR;
      cal->setTime((UDate)(time_cont.get(i)*1000.0), status);
      STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

      FieldPosition pos;
      UnicodeString out;
      fmt->format(*cal, out, pos);

      std::string s;
      out.toUTF8String(s);
      SET_STRING_ELT(ret, i, Rf_mkCharLenCE(s.c_str(), (int)s.length(), (cetype_t)CE_UTF8));
   }

   if (tz_val) { delete tz_val; tz_val = NULL; }
   if (fmt) { delete fmt; fmt = NULL; }
   if (cal) { delete cal; cal = NULL; }
   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END({
      if (tz_val) { delete tz_val; tz_val = NULL; }
      if (fmt) { delete fmt; fmt = NULL; }
      if (cal) { delete cal; cal = NULL; }
   })
}
/**
 * Parse date-time objects
 *
 * @param str character vector
 * @param format single string; either one of the predefined style names
 *    ("date_full", "time_short", "datetime_relative_medium", etc. --
 *    see format_opts below) or a pattern passed to SimpleDateFormat
 * @param lenient single logical value; passed to Calendar::setLenient
 * @param tz single string or NULL
 * @param locale single string or NULL
 *
 * @return numeric vector with the POSIXct class set (NA for missing inputs
 *    and for strings that could not be parsed); if `tz` is not NULL,
 *    it is stored in the "tzone" attribute
 *
 * @version 0.5-1 (Marek Gagolewski, 2015-01-08)
 * @version 0.5-1 (Marek Gagolewski, 2015-01-11) lenient arg added
 * @version 0.5-1 (Marek Gagolewski, 2015-02-22) use tz
 * @version 0.5-1 (Marek Gagolewski, 2015-03-01) set tzone attrib on retval
 */
SEXP stri_datetime_parse(SEXP str, SEXP format, SEXP lenient, SEXP tz, SEXP locale) {
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   const char* locale_val = stri__prepare_arg_locale(locale, "locale", true);
   const char* format_val = stri__prepare_arg_string_1_notNA(format, "format");
   bool lenient_val = stri__prepare_arg_logical_1_notNA(lenient, "lenient");
   if (!isNull(tz)) PROTECT(tz = stri_prepare_arg_string_1(tz, "tz"));
   else             PROTECT(tz); /* needed to set tzone attrib */

   // "format" may be one of:
   // (index%8 selects the style, index/8 selects date, time, or date+time)
   const char* format_opts[] = {
      "date_full", "date_long", "date_medium", "date_short",
      "date_relative_full", "date_relative_long", "date_relative_medium", "date_relative_short",
      "time_full", "time_long", "time_medium", "time_short",
      "time_relative_full", "time_relative_long", "time_relative_medium", "time_relative_short",
      "datetime_full", "datetime_long", "datetime_medium", "datetime_short",
      "datetime_relative_full", "datetime_relative_long", "datetime_relative_medium", "datetime_relative_short",
      NULL};
   int format_cur = stri__match_arg(format_val, format_opts);

   TimeZone* tz_val = stri__prepare_arg_timezone(tz, "tz", true/*allowdefault*/);
   Calendar* cal = NULL;
   DateFormat* fmt = NULL;

   STRI__ERROR_HANDLER_BEGIN(2)
   R_len_t vectorize_length = LENGTH(str);
   StriContainerUTF16 str_cont(str, vectorize_length);
   UnicodeString format_str(format_val);

   UErrorCode status = U_ZERO_ERROR;
   if (format_cur >= 0) {
      // a predefined style name was given
      DateFormat::EStyle style = DateFormat::kNone;
      switch (format_cur % 8) {
         case 0:  style = DateFormat::kFull; break;
         case 1:  style = DateFormat::kLong; break;
         case 2:  style = DateFormat::kMedium; break;
         case 3:  style = DateFormat::kShort; break;
         case 4:  style = DateFormat::kFullRelative; break;
         case 5:  style = DateFormat::kLongRelative; break;
         case 6:  style = DateFormat::kMediumRelative; break;
         case 7:  style = DateFormat::kShortRelative; break;
         default: style = DateFormat::kNone; break;
      }

      /* ICU 54.1: Relative time styles are not currently supported. */
      // (hence the kRelative bit is cleared in every time style below)
      switch (format_cur / 8) {
         case 0:
            fmt = DateFormat::createDateInstance(style, Locale::createFromName(locale_val));
            break;

         case 1:
            fmt = DateFormat::createTimeInstance(
               (DateFormat::EStyle)(style & ~DateFormat::kRelative),
               Locale::createFromName(locale_val));
            break;

         case 2:
            fmt = DateFormat::createDateTimeInstance(style,
               (DateFormat::EStyle)(style & ~DateFormat::kRelative),
               Locale::createFromName(locale_val));
            break;

         default:
            fmt = NULL;
            break;
      }
   }
   else // otherwise "format" is used as a SimpleDateFormat pattern
      fmt = new SimpleDateFormat(format_str, Locale::createFromName(locale_val), status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

   // The factory calls above do not report failures via `status`:
   // make sure that we will not dereference a NULL pointer in the loop below.
   if (!fmt)
      throw StriException("stri_datetime_parse: could not create a DateFormat instance");

   status = U_ZERO_ERROR;
   cal = Calendar::createInstance(locale_val, status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

   cal->adoptTimeZone(tz_val);
   tz_val = NULL; /* The Calendar takes ownership of the TimeZone. */

   cal->setLenient(lenient_val);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(REALSXP, vectorize_length));
   for (R_len_t i=0; i<vectorize_length; ++i) {
      if (str_cont.isNA(i)) {
         REAL(ret)[i] = NA_REAL;
         continue;
      }

      ParsePosition pos;
      fmt->parse(str_cont.get(i), *cal, pos);

      if (pos.getErrorIndex() >= 0) // parse error => NA
         REAL(ret)[i] = NA_REAL;
      else {
         // the UDate returned by the Calendar is divided by 1000
         status = U_ZERO_ERROR;
         REAL(ret)[i] = ((double)cal->getTime(status))/1000.0;
         if (U_FAILURE(status)) REAL(ret)[i] = NA_REAL;
      }
   }

   if (!isNull(tz)) Rf_setAttrib(ret, Rf_ScalarString(Rf_mkChar("tzone")), tz);
   stri__set_class_POSIXct(ret);
   if (tz_val) { delete tz_val; tz_val = NULL; }
   if (fmt) { delete fmt; fmt = NULL; }
   if (cal) { delete cal; cal = NULL; }
   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END({
      if (tz_val) { delete tz_val; tz_val = NULL; }
      if (fmt) { delete fmt; fmt = NULL; }
      if (cal) { delete cal; cal = NULL; }
   })
}