/** * Split a string into parts [byte compare] * * The pattern matches identify delimiters that separate the input into fields. * The input data between the matches becomes the fields themselves. * * @param str character vector * @param pattern character vector * @param n_max integer vector * @param omit_empty logical vector * * * @version 0.1 (Bartek Tartanus) * @version 0.2 (Marek Gagolewski, 2013-06-25) StriException friendly, use StriContainerUTF8 * @version 0.3 (Marek Gagolewski, 2013-07-10) - BUGFIX: wrong behavior on empty str */ SEXP stri__split_fixed_byte(SEXP str, SEXP pattern, SEXP n_max, SEXP omit_empty) { str = stri_prepare_arg_string(str, "str"); pattern = stri_prepare_arg_string(pattern, "pattern"); n_max = stri_prepare_arg_integer(n_max, "n_max"); omit_empty = stri_prepare_arg_logical(omit_empty, "omit_empty"); STRI__ERROR_HANDLER_BEGIN R_len_t vectorize_length = stri__recycling_rule(true, 4, LENGTH(str), LENGTH(pattern), LENGTH(n_max), LENGTH(omit_empty)); StriContainerUTF8 str_cont(str, vectorize_length); StriContainerByteSearch pattern_cont(pattern, vectorize_length); StriContainerInteger n_max_cont(n_max, vectorize_length); StriContainerLogical omit_empty_cont(omit_empty, vectorize_length); SEXP ret; PROTECT(ret = Rf_allocVector(VECSXP, vectorize_length)); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { if (n_max_cont.isNA(i) || omit_empty_cont.isNA(i)) { SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1)); continue; } int n_max_cur = n_max_cont.get(i); int omit_empty_cur = omit_empty_cont.get(i); STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1)); , SET_VECTOR_ELT(ret, i, stri__vector_empty_strings((omit_empty_cur || n_max_cur == 0)?0:1));)
/** * Split a string into parts. * * The pattern matches identify delimiters that separate the input into fields. * The input data between the matches becomes the fields themselves. * * @param str character vector * @param pattern character vector * @param n_max integer vector * @param opts_regex * @return list of character vectors * * @version 0.1 (Marek Gagolewski, 2013-06-21) * @version 0.2 (Marek Gagolewski, 2013-07-10) - BUGFIX: wrong behavior on empty str */ SEXP stri_split_regex(SEXP str, SEXP pattern, SEXP n_max, SEXP omit_empty, SEXP opts_regex) { str = stri_prepare_arg_string(str, "str"); pattern = stri_prepare_arg_string(pattern, "pattern"); n_max = stri_prepare_arg_integer(n_max, "n_max"); omit_empty = stri_prepare_arg_logical(omit_empty, "omit_empty"); R_len_t vectorize_length = stri__recycling_rule(true, 4, LENGTH(str), LENGTH(pattern), LENGTH(n_max), LENGTH(omit_empty)); uint32_t pattern_flags = StriContainerRegexPattern::getRegexFlags(opts_regex); UText* str_text = NULL; // may potentially be slower, but definitely is more convenient! STRI__ERROR_HANDLER_BEGIN StriContainerUTF8 str_cont(str, vectorize_length); StriContainerInteger n_max_cont(n_max, vectorize_length); StriContainerLogical omit_empty_cont(omit_empty, vectorize_length); StriContainerRegexPattern pattern_cont(pattern, vectorize_length, pattern_flags); SEXP ret; PROTECT(ret = Rf_allocVector(VECSXP, vectorize_length)); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { if (n_max_cont.isNA(i) || omit_empty_cont.isNA(i)) { SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1)); continue; } int n_max_cur = n_max_cont.get(i); int omit_empty_cur = omit_empty_cont.get(i); STRI__CONTINUE_ON_EMPTY_OR_NA_STR_PATTERN(str_cont, pattern_cont, SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1));, SET_VECTOR_ELT(ret, i, stri__vector_empty_strings((omit_empty_cur || n_max_cur == 0)?0:1));)
/** * Split a string into text lines * * @param str character vector * @param n_max integer vector * @param omit_empty logical vector * * @return list of character vectors * * @version 0.1 (Marek Gagolewski, 2013-08-04) */ SEXP stri_split_lines(SEXP str, SEXP n_max, SEXP omit_empty) { str = stri_prepare_arg_string(str, "str"); n_max = stri_prepare_arg_integer(n_max, "n_max"); omit_empty = stri_prepare_arg_logical(omit_empty, "omit_empty"); R_len_t vectorize_length = stri__recycling_rule(true, 3, LENGTH(str), LENGTH(n_max), LENGTH(omit_empty)); STRI__ERROR_HANDLER_BEGIN StriContainerUTF8 str_cont(str, vectorize_length); StriContainerInteger n_max_cont(n_max, vectorize_length); StriContainerLogical omit_empty_cont(omit_empty, vectorize_length); SEXP ret; PROTECT(ret = Rf_allocVector(VECSXP, vectorize_length)); for (R_len_t i = str_cont.vectorize_init(); i != str_cont.vectorize_end(); i = str_cont.vectorize_next(i)) { if (str_cont.isNA(i)) { SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1)); continue; } const char* str_cur_s = str_cont.get(i).c_str(); R_len_t str_cur_n = str_cont.get(i).length(); int n_max_cur = n_max_cont.get(i); int omit_empty_cur = omit_empty_cont.get(i); if (n_max_cur < 0) n_max_cur = INT_MAX; else if (n_max_cur == 0) { SET_VECTOR_ELT(ret, i, Rf_allocVector(STRSXP, 0)); continue; } //#define STRI_INDEX_NEWLINE_CR 0 //#define STRI_INDEX_NEWLINE_LF 1 //#define STRI_INDEX_NEWLINE_CRLF 2 //#define STRI_INDEX_NEWLINE_NEL 3 //#define STRI_INDEX_NEWLINE_VT 4 //#define STRI_INDEX_NEWLINE_FF 5 //#define STRI_INDEX_NEWLINE_LS 6 //#define STRI_INDEX_NEWLINE_PS 7 //#define STRI_INDEX_NEWLINE_LAST 8 // int counts[STRI_INDEX_NEWLINE_LAST]; // for (R_len_t j=0; j<STRI_INDEX_NEWLINE_LAST; ++j) // counts[j] = 0; UChar32 c; R_len_t jlast, k=1; deque<R_len_t_x2> occurences; occurences.push_back(R_len_t_x2(0, 0)); for (R_len_t j=0; j < str_cur_n && k < n_max_cur; /* null */) { jlast = j; U8_NEXT(str_cur_s, j, str_cur_n, c); switch (c) { case ASCII_CR: /* CR */ // counts[STRI_INDEX_NEWLINE_CR]++; /* check if next is LF */ if (str_cur_s[j] == ASCII_LF) { // look ahead one byte // counts[STRI_INDEX_NEWLINE_LF]++; // counts[STRI_INDEX_NEWLINE_CRLF]++; j++; // just one byte } break; case ASCII_LF: /* LF */ // counts[STRI_INDEX_NEWLINE_LF]++; break; case UCHAR_NEL: /* NEL */ // counts[STRI_INDEX_NEWLINE_NEL]++; break; case ASCII_VT: /* VT */ // counts[STRI_INDEX_NEWLINE_VT]++; break; case ASCII_FF: /* FF */ // counts[STRI_INDEX_NEWLINE_FF]++; break; case UCHAR_LS: /* LS */ // counts[STRI_INDEX_NEWLINE_LS]++; break; case UCHAR_PS: /* PS */ // counts[STRI_INDEX_NEWLINE_PS]++; break; default: /* not a newline character */ occurences.back().v2 = j; continue; } // if here, then at newline if (omit_empty_cur && occurences.back().v2 == occurences.back().v1) occurences.back().v1 = occurences.back().v2 = j; // don't start new field else { occurences.back().v2 = jlast; occurences.push_back(R_len_t_x2(j, j)); ++k; // another field } } if (k == n_max_cur) occurences.back().v2 = str_cur_n; if (omit_empty_cur && occurences.back().v1 == occurences.back().v2) occurences.pop_back(); SEXP ans; PROTECT(ans = Rf_allocVector(STRSXP, (R_len_t)occurences.size())); deque<R_len_t_x2>::iterator iter = occurences.begin(); for (R_len_t l = 0; iter != occurences.end(); ++iter, ++l) { R_len_t_x2 curoccur = *iter; SET_STRING_ELT(ans, l, Rf_mkCharLenCE(str_cur_s+curoccur.v1, curoccur.v2-curoccur.v1, CE_UTF8)); } SET_VECTOR_ELT(ret, i, ans); UNPROTECT(1); } UNPROTECT(1); return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) }
/** * Extract all occurences of a character class in each string * * @param str character vector * @param pattern character vector * @return list of character vectors * * @version 0.1 (Marek Gagolewski, 2013-06-08) * @version 0.2 (Marek Gagolewski, 2013-06-15) Use StrContainerCharClass * @version 0.3 (Marek Gagolewski, 2013-06-16) make StriException-friendly */ SEXP stri_extract_all_charclass(SEXP str, SEXP pattern, SEXP merge) { str = stri_prepare_arg_string(str, "str"); pattern = stri_prepare_arg_string(pattern, "pattern"); merge = stri_prepare_arg_logical(merge, "merge"); R_len_t vectorize_length = stri__recycling_rule(true, 3, LENGTH(str), LENGTH(pattern), LENGTH(merge)); STRI__ERROR_HANDLER_BEGIN StriContainerUTF8 str_cont(str, vectorize_length); StriContainerCharClass pattern_cont(pattern, vectorize_length); StriContainerLogical merge_cont(merge, vectorize_length); SEXP notfound; // this vector will be set iff not found or NA PROTECT(notfound = stri__vector_NA_strings(1)); SEXP ret; PROTECT(ret = Rf_allocVector(VECSXP, vectorize_length)); for (R_len_t i = pattern_cont.vectorize_init(); i != pattern_cont.vectorize_end(); i = pattern_cont.vectorize_next(i)) { if (pattern_cont.isNA(i) || str_cont.isNA(i) || merge_cont.isNA(i)) { SET_VECTOR_ELT(ret, i, notfound); continue; } bool merge_cur = merge_cont.get(i); CharClass pattern_cur = pattern_cont.get(i); R_len_t str_cur_n = str_cont.get(i).length(); const char* str_cur_s = str_cont.get(i).c_str(); R_len_t j, jlast; UChar32 chr; deque<R_len_t_x2> occurences; // codepoint based-indices for (jlast=j=0; j<str_cur_n; ) { U8_NEXT(str_cur_s, j, str_cur_n, chr); if (pattern_cur.test(chr)) { occurences.push_back(R_len_t_x2(jlast, j)); } jlast = j; } R_len_t noccurences = (R_len_t)occurences.size(); if (noccurences == 0) SET_VECTOR_ELT(ret, i, notfound); else if (merge_cur && noccurences > 1) { // do merge deque<R_len_t_x2> occurences2; deque<R_len_t_x2>::iterator iter = occurences.begin(); occurences2.push_back(*iter); for (++iter; iter != occurences.end(); ++iter) { R_len_t_x2 curoccur = *iter; if (occurences2.back().v2 == curoccur.v1) { // continue seq occurences2.back().v2 = curoccur.v2; // change `end` } else { // new seq occurences2.push_back(curoccur); } } // create resulting matrix from occurences2 R_len_t noccurences2 = (R_len_t)occurences2.size(); SEXP cur_res; PROTECT(cur_res = Rf_allocVector(STRSXP, noccurences2)); iter = occurences2.begin(); for (R_len_t f = 0; iter != occurences2.end(); ++iter, ++f) { R_len_t_x2 curo = *iter; SET_STRING_ELT(cur_res, f, Rf_mkCharLenCE(str_cur_s+curo.v1, curo.v2-curo.v1, CE_UTF8)); } SET_VECTOR_ELT(ret, i, cur_res); UNPROTECT(1); } else { // do not merge SEXP cur_res; PROTECT(cur_res = Rf_allocVector(STRSXP, noccurences)); deque<R_len_t_x2>::iterator iter = occurences.begin(); for (R_len_t f = 0; iter != occurences.end(); ++iter, ++f) { R_len_t_x2 curo = *iter; SET_STRING_ELT(cur_res, f, Rf_mkCharLenCE(str_cur_s+curo.v1, curo.v2-curo.v1, CE_UTF8)); } SET_VECTOR_ELT(ret, i, cur_res); UNPROTECT(1); } } UNPROTECT(2); return ret; STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */) }
/** Word wrap text * * @param str character vector * @param width single integer * @param cost_exponent single double * @param indent single integer * @param exdent single integer * @param prefix single string * @param initial single string * @param locale locale identifier or NULL for default locale * @param use_length single logical value * * @return list * * @version 0.1-?? (Bartek Tartanus) * * @version 0.2-2 (Marek Gagolewski, 2014-04-27) * single function for wrap_greedy and wrap_dynamic * (dispatch inside); * use BreakIterator * * @version 0.3-1 (Marek Gagolewski, 2014-11-04) * Issue #112: str_prepare_arg* retvals were not PROTECTed from gc * * @version 0.4-1 (Marek Gagolewski, 2014-12-06) * new args: indent, exdent, prefix, initial * * @version 0.5-1 (Marek Gagolewski, 2014-12-19) * #133 allow width <= 0 * * @version 0.5-1 (Marek Gagolewski, 2015-02-28) * don't trim so many white spaces at the end of each word (normalize arg does that) * #139: allow a "whitespace" break iterator * * @version 0.5-1 (Marek Gagolewski, 2015-04-23) * `use_length` arg added * * * @version 0.5-1 (Marek Gagolewski, 2015-06-09) * BIGSKIP: no more CHARSXP on out on "" input */ SEXP stri_wrap(SEXP str, SEXP width, SEXP cost_exponent, SEXP indent, SEXP exdent, SEXP prefix, SEXP initial, SEXP whitespace_only, SEXP use_length, SEXP locale) { bool use_length_val = stri__prepare_arg_logical_1_notNA(use_length, "use_length"); double exponent_val = stri__prepare_arg_double_1_notNA(cost_exponent, "cost_exponent"); bool whitespace_only_val = stri__prepare_arg_logical_1_notNA(whitespace_only, "whitespace_only"); int width_val = stri__prepare_arg_integer_1_notNA(width, "width"); if (width_val <= 0) width_val = 0; int indent_val = stri__prepare_arg_integer_1_notNA(indent, "indent"); if (indent_val < 0) Rf_error(MSG__EXPECTED_POSITIVE, "indent"); int exdent_val = stri__prepare_arg_integer_1_notNA(exdent, "exdent"); if (exdent_val < 0) Rf_error(MSG__EXPECTED_POSITIVE, "exdent"); const char* qloc = stri__prepare_arg_locale(locale, "locale", true); /* this is R_alloc'ed */ Locale loc = Locale::createFromName(qloc); PROTECT(str = stri_prepare_arg_string(str, "str")); PROTECT(prefix = stri_prepare_arg_string_1(prefix, "prefix")); PROTECT(initial = stri_prepare_arg_string_1(initial, "initial")); BreakIterator* briter = NULL; UText* str_text = NULL; STRI__ERROR_HANDLER_BEGIN(3) UErrorCode status = U_ZERO_ERROR; briter = BreakIterator::createLineInstance(loc, status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) R_len_t str_length = LENGTH(str); StriContainerUTF8_indexable str_cont(str, str_length); StriContainerUTF8 prefix_cont(prefix, 1); StriContainerUTF8 initial_cont(initial, 1); // prepare indent/exdent/prefix/initial stuff: // 1st line, 1st para (i==0, u==0): initial+indent // nth line, 1st para (i==0, u> 0): prefix +exdent // 1st line, nth para (i> 0, u==0): prefix +indent // nth line, nth para (i> 0, u> 0): prefix +exdent StriWrapLineStart ii(initial_cont.get(0), indent_val); StriWrapLineStart pi(prefix_cont.get(0), indent_val); StriWrapLineStart pe(prefix_cont.get(0), exdent_val); status = U_ZERO_ERROR; //Unicode Newline Guidelines - Unicode Technical Report #13 UnicodeSet uset_linebreaks(UnicodeString::fromUTF8("[\\u000A-\\u000D\\u0085\\u2028\\u2029]"), status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) uset_linebreaks.freeze(); status = U_ZERO_ERROR; UnicodeSet uset_whitespaces(UnicodeString::fromUTF8("\\p{White_space}"), status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) uset_whitespaces.freeze(); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(VECSXP, str_length)); for (R_len_t i = 0; i < str_length; ++i) { if (str_cont.isNA(i) || prefix_cont.isNA(0) || initial_cont.isNA(0)) { SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1)); continue; } status = U_ZERO_ERROR; const char* str_cur_s = str_cont.get(i).c_str(); R_len_t str_cur_n = str_cont.get(i).length(); str_text = utext_openUTF8(str_text, str_cur_s, str_cont.get(i).length(), &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) status = U_ZERO_ERROR; briter->setText(str_text, status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) // all right, first let's generate a list of places at which we may do line breaks deque< R_len_t > occurrences_list; // this could be an R_len_t queue R_len_t match = briter->first(); while (match != BreakIterator::DONE) { if (!whitespace_only_val) occurrences_list.push_back(match); else { if (match > 0 && match < str_cur_n) { UChar32 c; U8_GET((const uint8_t*)str_cur_s, 0, match-1, str_cur_n, c); if (uset_whitespaces.contains(c)) occurrences_list.push_back(match); } else occurrences_list.push_back(match); } match = briter->next(); } R_len_t noccurrences = (R_len_t)occurrences_list.size(); // number of boundaries if (noccurrences <= 1) { // no match (1 boundary == 0) SET_VECTOR_ELT(ret, i, Rf_ScalarString(str_cont.toR(i))); continue; } // the number of "words" is: R_len_t nwords = noccurrences - 1; // convert occurrences_list to a vector // in order to obtain end positions (in a string) of each "words", // noting that occurrences_list.at(0) == 0 #ifndef NDEBUG if (occurrences_list.at(0) != 0) throw StriException("NDEBUG: stri_wrap: (occurrences_list.at(0) != 0)"); #endif std::vector<R_len_t> end_pos_orig(nwords); deque<R_len_t>::iterator iter = ++(occurrences_list.begin()); for (R_len_t j = 0; iter != occurrences_list.end(); ++iter, ++j) { end_pos_orig[j] = (*iter); // this is a UTF-8 index } // now: // we'll get the total widths/number of code points in each "word" std::vector<R_len_t> widths_orig(nwords); // we'll get the total widths/number of code points without trailing whitespaces std::vector<R_len_t> widths_trim(nwords); // we'll get the end positions without trailing whitespaces std::vector<R_len_t> end_pos_trim(nwords); // detect line endings (fail on a match) UChar32 c = 0; R_len_t j = 0; R_len_t cur_block = 0; R_len_t cur_width_orig = 0; R_len_t cur_width_trim = 0; R_len_t cur_count_orig = 0; R_len_t cur_count_trim = 0; R_len_t cur_end_pos_trim = 0; while (j < str_cur_n) { R_len_t jlast = j; U8_NEXT(str_cur_s, j, str_cur_n, c); if (c < 0) // invalid utf-8 sequence throw StriException(MSG__INVALID_UTF8); if (uset_linebreaks.contains(c)) throw StriException(MSG__NEWLINE_FOUND); cur_width_orig += stri__width_char(c); ++cur_count_orig; if (uset_whitespaces.contains(c)) { // OLD: trim all white spaces from the end: // ++cur_count_trim; // [we have the normalize arg for that] // NEW: trim just one white space at the end: cur_width_trim = stri__width_char(c); cur_count_trim = 1; cur_end_pos_trim = jlast; } else { cur_width_trim = 0; cur_count_trim = 0; cur_end_pos_trim = j; } if (j >= str_cur_n || end_pos_orig[cur_block] <= j) { // we'll start a new block in a moment if (use_length_val) { widths_orig[cur_block] = cur_count_orig; widths_trim[cur_block] = cur_count_orig-cur_count_trim; } else { widths_orig[cur_block] = cur_width_orig; widths_trim[cur_block] = cur_width_orig-cur_width_trim; } end_pos_trim[cur_block] = cur_end_pos_trim; cur_block++; cur_width_orig = 0; cur_width_trim = 0; cur_count_orig = 0; cur_count_trim = 0; cur_end_pos_trim = j; } } // do wrap std::deque<R_len_t> wrap_after; // wrap line after which word in {0..nwords-1}? if (exponent_val <= 0.0) { stri__wrap_greedy(wrap_after, nwords, width_val, widths_orig, widths_trim, (use_length_val)?((i==0)?ii.count:pi.count):((i==0)?ii.width:pi.width), (use_length_val)?pe.count:pe.width); } else { stri__wrap_dynamic(wrap_after, nwords, width_val, exponent_val, widths_orig, widths_trim, (use_length_val)?((i==0)?ii.count:pi.count):((i==0)?ii.width:pi.width), (use_length_val)?pe.count:pe.width); } // wrap_after.size() line breaks => wrap_after.size()+1 lines R_len_t nlines = (R_len_t)wrap_after.size()+1; R_len_t last_pos = 0; SEXP ans; STRI__PROTECT(ans = Rf_allocVector(STRSXP, nlines)); deque<R_len_t>::iterator iter_wrap = wrap_after.begin(); for (R_len_t u = 0; iter_wrap != wrap_after.end(); ++iter_wrap, ++u) { R_len_t wrap_after_cur = *iter_wrap; R_len_t cur_pos = end_pos_trim[wrap_after_cur]; std::string cs; if (i == 0 && u == 0) cs = ii.str; else if (i > 0 && u == 0) cs = pi.str; else cs = pe.str; cs.append(str_cur_s+last_pos, cur_pos-last_pos); SET_STRING_ELT(ans, u, Rf_mkCharLenCE(cs.c_str(), cs.size(), CE_UTF8)); last_pos = end_pos_orig[wrap_after_cur]; } // last line goes here: std::string cs; if (i == 0 && nlines-1 == 0) cs = ii.str; else if (i > 0 && nlines-1 == 0) cs = pi.str; else cs = pe.str; cs.append(str_cur_s+last_pos, end_pos_trim[nwords-1]-last_pos); SET_STRING_ELT(ans, nlines-1, Rf_mkCharLenCE(cs.c_str(), cs.size(), CE_UTF8)); SET_VECTOR_ELT(ret, i, ans); STRI__UNPROTECT(1); } if (briter) { delete briter; briter = NULL; } if (str_text) { utext_close(str_text); str_text = NULL; } STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({ if (briter) { delete briter; briter = NULL; } if (str_text) { utext_close(str_text); str_text = NULL; } }) }