/**
 * Locate all BreakIterator boundaries
 *
 * For each element of `str`, a StriRuleBasedBreakIterator built from
 * `opts_brkiter` (with "line_break" passed as the second constructor
 * argument of StriBrkIterOptions) is run from the beginning of the string
 * and every pair it reports becomes one row of a 2-column integer matrix.
 *
 * @param str character vector
 * @param omit_no_match logical; when true, a string for which the iterator
 *    reports nothing gets stri__matrix_NA_INTEGER(0, 2) instead of
 *    stri__matrix_NA_INTEGER(1, 2)
 * @param opts_brkiter named list
 * @return list with one 2-column integer matrix per element of `str`
 *
 * @version 0.2-2 (Marek Gagolewski, 2014-04-22)
 *
 * @version 0.2-2 (Marek Gagolewski, 2014-04-23)
 *          removed "title": For Unicode 4.0 and above title boundary
 *          iteration, please use Word Boundary iterator.
 *
 * @version 0.2-2 (Marek Gagolewski, 2014-04-25)
 *          use stri__split_or_locate_boundaries
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-10-29)
 *          use opts_brkiter
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-11-28)
 *          new args: omit_no_match
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-02)
 *          use StriRuleBasedBreakIterator
 */
SEXP stri_locate_all_boundaries(SEXP str, SEXP omit_no_match, SEXP opts_brkiter)
{
   bool drop_no_match = stri__prepare_arg_logical_1_notNA(omit_no_match, "omit_no_match");
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   StriBrkIterOptions brkiter_opts(opts_brkiter, "line_break");

   STRI__ERROR_HANDLER_BEGIN(1)
   R_len_t n = LENGTH(str);
   StriContainerUTF8_indexable str_cont(str, n);
   StriRuleBasedBreakIterator brkiter(brkiter_opts);

   SEXP result;
   STRI__PROTECT(result = Rf_allocVector(VECSXP, n));

   for (R_len_t i = 0; i < n; ++i)
   {
      if (str_cont.isNA(i)) {
         // missing input string: store a 1x2 all-NA matrix
         SET_VECTOR_ELT(result, i, stri__matrix_NA_INTEGER(1, 2));
         continue;
      }

      const char* cur_s = str_cont.get(i).c_str();
      R_len_t cur_n = str_cont.get(i).length();
      brkiter.setupMatcher(cur_s, cur_n);
      brkiter.first();

      // the number of rows is not known in advance, so gather the pairs first
      deque< pair<R_len_t,R_len_t> > spans;
      for (pair<R_len_t,R_len_t> span; brkiter.next(span); ) {
         spans.push_back(span);
      }

      R_len_t nspans = (R_len_t)spans.size();
      if (nspans <= 0) {
         SET_VECTOR_ELT(result, i,
            stri__matrix_NA_INTEGER(drop_no_match ? 0 : 1, 2));
         continue;
      }

      SEXP mat;
      STRI__PROTECT(mat = Rf_allocMatrix(INTSXP, nspans, 2));
      int* col1 = INTEGER(mat);   // first column of the matrix
      int* col2 = col1 + nspans;  // second column (column-major storage)

      R_len_t row = 0;
      deque< pair<R_len_t,R_len_t> >::iterator it;
      for (it = spans.begin(); it != spans.end(); ++it) {
         col1[row] = it->first;
         col2[row] = it->second;
         ++row;
      }

      // Adjust UTF-8 byte index -> UChar32 (code point) index
      str_cont.UTF8_to_UChar32_index(i, col1, col2, nspans,
         1, // 0-based index -> 1-based
         0  // end returns position of next character after match
      );

      SET_VECTOR_ELT(result, i, mat);
      STRI__UNPROTECT(1);
   }

   stri__locate_set_dimnames_list(result);
   STRI__UNPROTECT_ALL
   return result;
   STRI__ERROR_HANDLER_END({ /* nothing special t.b.d. on error */ })
}
/**
 * Count the number of BreakIterator boundaries
 *
 * For each element of `str`, counts how many times
 * StriRuleBasedBreakIterator::next() succeeds after first(); NA strings
 * yield NA_INTEGER.
 *
 * @param str character vector
 * @param opts_brkiter break iterator options, handed to StriBrkIterOptions
 *    (with "line_break" as the second constructor argument)
 * @return integer vector of the same length as `str`
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-10-30)
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *          Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-02)
 *          use StriRuleBasedBreakIterator
 */
SEXP stri_count_boundaries(SEXP str, SEXP opts_brkiter)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   StriBrkIterOptions brkiter_opts(opts_brkiter, "line_break");

   STRI__ERROR_HANDLER_BEGIN(1)
   R_len_t n = LENGTH(str);
   StriContainerUTF8_indexable str_cont(str, n);

   SEXP counts;
   STRI__PROTECT(counts = Rf_allocVector(INTSXP, n));
   StriRuleBasedBreakIterator brkiter(brkiter_opts);

   int* counts_tab = INTEGER(counts);
   for (R_len_t i = 0; i < n; ++i)
   {
      if (str_cont.isNA(i)) {
         counts_tab[i] = NA_INTEGER;
      }
      else {
         const char* cur_s = str_cont.get(i).c_str();
         R_len_t cur_n = str_cont.get(i).length();
         brkiter.setupMatcher(cur_s, cur_n);
         brkiter.first();

         // one increment per successful step of the iterator
         R_len_t total = 0;
         while (brkiter.next()) {
            ++total;
         }
         counts_tab[i] = total;
      }
   }

   STRI__UNPROTECT_ALL
   return counts;
   STRI__ERROR_HANDLER_END({ /* no action */ })
}
/**
 * Locate first or last boundaries
 *
 * Fills one row of a 2-column integer matrix per input string. A row stays
 * (NA, NA) when the string is NA, has length 0, or the iterator reports
 * nothing.
 *
 * @param str character vector
 * @param opts_brkiter list
 * @param first looking for first or last match?
 * @return integer matrix (2 columns)
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-05)
 */
SEXP stri__locate_firstlast_boundaries(SEXP str, SEXP opts_brkiter, bool first)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   StriBrkIterOptions brkiter_opts(opts_brkiter, "line_break");

   STRI__ERROR_HANDLER_BEGIN(1)
   R_len_t n = LENGTH(str);
   StriContainerUTF8_indexable str_cont(str, n);
   StriRuleBasedBreakIterator brkiter(brkiter_opts);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocMatrix(INTSXP, n, 2));
   stri__locate_set_dimnames_matrix(ret);

   int* col1 = INTEGER(ret);  // first column
   int* col2 = col1 + n;      // second column (column-major storage)

   for (R_len_t i = 0; i < n; ++i)
   {
      int* from = col1 + i;
      int* to = col2 + i;

      // default: no match
      *from = NA_INTEGER;
      *to = NA_INTEGER;

      if (str_cont.isNA(i)) {
         continue;
      }
      R_len_t cur_n = str_cont.get(i).length();
      if (cur_n == 0) {
         continue;
      }

      brkiter.setupMatcher(str_cont.get(i).c_str(), cur_n);

      // step forward from the start, or backward from the end
      pair<R_len_t,R_len_t> span;
      bool found;
      if (first) {
         brkiter.first();
         found = brkiter.next(span);
      }
      else {
         brkiter.last();
         found = brkiter.previous(span);
      }
      if (!found) {
         continue;
      }

      *from = span.first;
      *to = span.second;

      // Adjust UTF8 byte index -> UChar32 index
      str_cont.UTF8_to_UChar32_index(i, from, to, 1,
         1, // 0-based index -> 1-based
         0  // end returns position of next character after match
      );
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END( ;/* do nothing special on error */ )
}
/**
 * Convert case (TitleCase)
 *
 * Each string is passed through ICU's ucasemap_utf8ToTitle, using a break
 * iterator built from `opts_brkiter` (with "word" passed as the second
 * constructor argument of StriBrkIterOptions). NA strings stay NA.
 *
 * @param str character vector
 * @param opts_brkiter list
 * @return character vector
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-03)
 *    separated from stri_trans_casemap;
 *    use StriUBreakIterator
 */
SEXP stri_trans_totitle(SEXP str, SEXP opts_brkiter)
{
   StriBrkIterOptions brkiter_opts(opts_brkiter, "word");
   PROTECT(str = stri_prepare_arg_string(str, "str")); // prepare string argument

   // version 0.2-1 - Does not work with ICU 4.8 (but we require ICU >= 50)
   UCaseMap* ucasemap = NULL;

   STRI__ERROR_HANDLER_BEGIN(1)
   StriUBreakIterator brkiter(brkiter_opts);

   UErrorCode status = U_ZERO_ERROR;
   ucasemap = ucasemap_open(brkiter.getLocale(), U_FOLD_CASE_DEFAULT, &status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

   status = U_ZERO_ERROR;
   ucasemap_setBreakIterator(ucasemap, brkiter.getIterator(), &status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
   // From now on brkiter is owned by ucasemap;
   // it will be released on ucasemap_close
   // (checked with ICU man & src code)
   brkiter.free(false);
   // ucasemap_setOptions(ucasemap, U_TITLECASE_NO_LOWERCASE, &status); // to do?

   R_len_t vlen = LENGTH(str);
   StriContainerUTF8 str_cont(str, vlen);
   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, vlen));

   // STEP 1.
   // Estimate the required buffer length
   // Notice: The resulting number of codepoints may be larger or smaller than
   // the number before title-casing
   const R_len_t bufsize = str_cont.getMaxNumBytes() + 10; // a small margin
   String8buf buf(bufsize);

   // STEP 2.
   // Do the title-casing
   for (R_len_t i = str_cont.vectorize_init();
         i != str_cont.vectorize_end();
         i = str_cont.vectorize_next(i))
   {
      if (str_cont.isNA(i)) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      const char* src_s = str_cont.get(i).c_str();
      R_len_t src_n = str_cont.get(i).length();

      status = U_ZERO_ERROR;
      int out_n = ucasemap_utf8ToTitle(ucasemap, buf.data(), buf.size(),
         src_s, src_n, &status);

      if (U_FAILURE(status)) {
         // retry once with the buffer grown to the size reported by ICU
         buf.resize(out_n, false/*destroy contents*/);
         status = U_ZERO_ERROR;
         out_n = ucasemap_utf8ToTitle(ucasemap, buf.data(), buf.size(),
            src_s, src_n, &status);
         STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
         // this shouldn't happen
         // we do have the buffer size required to complete this op
      }

      SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), out_n, CE_UTF8));
   }

   if (ucasemap) {
      ucasemap_close(ucasemap);
      ucasemap = NULL;
   }
   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END({
      if (ucasemap) {
         ucasemap_close(ucasemap);
         ucasemap = NULL;
      }
   })
}