/** Locate every text segment delimited by BreakIterator boundaries
 *
 * For each element of `str` a 2-column integer matrix is built, holding
 * one row per (start, end) pair reported by the break iterator. An NA input
 * string gets the result of `stri__matrix_NA_INTEGER(1, 2)`; a string for
 * which the iterator reports nothing gets `stri__matrix_NA_INTEGER(r, 2)`
 * with r equal to 0 if `omit_no_match` is TRUE and 1 otherwise.
 *
 * @param str character vector
 * @param omit_no_match single logical value (must not be NA)
 * @param opts_brkiter named list with break iterator settings;
 *        "line_break" is passed as the second argument of StriBrkIterOptions
 * @return list with one 2-column integer matrix per element of `str`
 *
 * @version 0.2-2 (Marek Gagolewski, 2014-04-22)
 *
 * @version 0.2-2 (Marek Gagolewski, 2014-04-23)
 *          removed "title": For Unicode 4.0 and above title boundary
 *          iteration, please use Word Boundary iterator.
 *
 * @version 0.2-2 (Marek Gagolewski, 2014-04-25)
 *          use stri__split_or_locate_boundaries
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-10-29)
 *          use opts_brkiter
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-11-28)
 *          new args: omit_no_match
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-02)
 *          use StriRuleBasedBreakIterator
 */
SEXP stri_locate_all_boundaries(SEXP str, SEXP omit_no_match, SEXP opts_brkiter)
{
   bool skip_no_match = stri__prepare_arg_logical_1_notNA(omit_no_match, "omit_no_match");
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   StriBrkIterOptions brkiter_opts(opts_brkiter, "line_break");

   STRI__ERROR_HANDLER_BEGIN(1)
   R_len_t n = LENGTH(str);
   StriContainerUTF8_indexable str_cont(str, n);
   StriRuleBasedBreakIterator brkiter(brkiter_opts);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(VECSXP, n));

   // number of rows of the placeholder matrix used when nothing was found
   const R_len_t nomatch_rows = skip_no_match ? 0 : 1;

   for (R_len_t i = 0; i < n; ++i)
   {
      if (str_cont.isNA(i)) {
         SET_VECTOR_ELT(ret, i, stri__matrix_NA_INTEGER(1, 2));
         continue;
      }

      brkiter.setupMatcher(str_cont.get(i).c_str(), str_cont.get(i).length());
      brkiter.first();

      // gather all (start, end) pairs first: the row count of the
      // result matrix is not known until the iterator is exhausted
      deque< pair<R_len_t,R_len_t> > segments;
      for (pair<R_len_t,R_len_t> seg; brkiter.next(seg); )
         segments.push_back(seg);

      const R_len_t nsegments = static_cast<R_len_t>(segments.size());
      if (nsegments <= 0) {
         SET_VECTOR_ELT(ret, i, stri__matrix_NA_INTEGER(nomatch_rows, 2));
         continue;
      }

      SEXP ans;
      STRI__PROTECT(ans = Rf_allocMatrix(INTSXP, nsegments, 2));
      int* starts = INTEGER(ans);       // 1st column
      int* ends   = starts + nsegments; // 2nd column
      for (R_len_t j = 0; j < nsegments; ++j) {
         starts[j] = segments[j].first;
         ends[j]   = segments[j].second;
      }

      // Adjust UTF8 byte index -> UChar32 (code point) index
      str_cont.UTF8_to_UChar32_index(i, starts, ends, nsegments,
            1, // 0-based index -> 1-based
            0  // end returns position of next character after match
      );
      SET_VECTOR_ELT(ret, i, ans);
      STRI__UNPROTECT(1); // ans is now reachable through ret
   }

   stri__locate_set_dimnames_list(ret);
   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END({ /* nothing special t.b.d. on error */ })
}
/** Count the text segments delimited by BreakIterator boundaries
 *
 * For every non-NA element of `str` the break iterator is advanced with
 * `next()` until it reports no further item; the number of successful
 * steps is stored. NA input strings yield NA_INTEGER.
 *
 * @param str character vector
 * @param opts_brkiter break iterator settings; "line_break" is passed as
 *        the second argument of StriBrkIterOptions
 * @return integer vector of the same length as `str`
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-10-30)
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-02)
 *          use StriRuleBasedBreakIterator
 */
SEXP stri_count_boundaries(SEXP str, SEXP opts_brkiter)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   StriBrkIterOptions brkiter_opts(opts_brkiter, "line_break");

   STRI__ERROR_HANDLER_BEGIN(1)
   R_len_t n = LENGTH(str);
   StriContainerUTF8_indexable str_cont(str, n);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(INTSXP, n));
   StriRuleBasedBreakIterator brkiter(brkiter_opts);
   int* counts = INTEGER(ret);

   for (R_len_t i = 0; i < n; ++i)
   {
      if (!str_cont.isNA(i)) {
         brkiter.setupMatcher(str_cont.get(i).c_str(), str_cont.get(i).length());
         brkiter.first();

         // one increment per successful step of the iterator
         R_len_t found = 0;
         for (; brkiter.next(); ++found) { }

         counts[i] = found;
      }
      else {
         counts[i] = NA_INTEGER;
      }
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END({ /* no action */  })
}
/**
 * Locate the first or the last text segment delimited by
 * BreakIterator boundaries
 *
 * Every row of the result starts out as (NA, NA) and stays that way for
 * NA or empty input strings, and whenever the iterator reports no segment.
 *
 * @param str character vector
 * @param opts_brkiter list with break iterator settings; "line_break" is
 *        passed as the second argument of StriBrkIterOptions
 * @param first if true, step forward from the beginning of the text;
 *        otherwise step backward from its end
 * @return integer matrix (2 columns), one row per element of `str`
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-05)
 */
SEXP stri__locate_firstlast_boundaries(SEXP str, SEXP opts_brkiter, bool first)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   StriBrkIterOptions brkiter_opts(opts_brkiter, "line_break");

   STRI__ERROR_HANDLER_BEGIN(1)
   R_len_t n = LENGTH(str);
   StriContainerUTF8_indexable str_cont(str, n);
   StriRuleBasedBreakIterator brkiter(brkiter_opts);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocMatrix(INTSXP, n, 2));
   stri__locate_set_dimnames_matrix(ret);
   int* start_col = INTEGER(ret);    // 1st column
   int* end_col   = start_col + n;   // 2nd column

   for (R_len_t i = 0; i < n; ++i)
   {
      // default: no match
      start_col[i] = NA_INTEGER;
      end_col[i]   = NA_INTEGER;

      if (str_cont.isNA(i)) continue;
      if (str_cont.get(i).length() == 0) continue;

      brkiter.setupMatcher(str_cont.get(i).c_str(), str_cont.get(i).length());
      pair<R_len_t,R_len_t> segment;

      bool found;
      if (first) {
         brkiter.first();
         found = brkiter.next(segment);
      }
      else {
         brkiter.last();
         found = brkiter.previous(segment);
      }
      if (!found) continue;

      start_col[i] = segment.first;
      end_col[i]   = segment.second;

      // Adjust UTF8 byte index -> UChar32 index (a single pair, in place)
      str_cont.UTF8_to_UChar32_index(i,
            start_col+i, end_col+i, 1,
            1, // 0-based index -> 1-based
            0  // end returns position of next character after match
      );
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END( ;/* do nothing special on error */ )
}
// Example #4
// 0
/**
 *  Convert case (TitleCase)
 *
 *  Each non-NA string is title-cased with ICU's ucasemap_utf8ToTitle,
 *  using a break iterator configured via `opts_brkiter` ("word" is passed
 *  as the second argument of StriBrkIterOptions). NA strings stay NA.
 *
 *  @param str character vector
 *  @param opts_brkiter list with break iterator settings
 *  @return character vector (elements are marked as UTF-8)
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-03)
 *    separated from stri_trans_casemap;
 *    use StriUBreakIterator
 */
SEXP stri_trans_totitle(SEXP str, SEXP opts_brkiter) {
   StriBrkIterOptions brkiter_opts(opts_brkiter, "word");
   PROTECT(str = stri_prepare_arg_string(str, "str")); // prepare string argument

// version 0.2-1 - Does not work with ICU 4.8 (but we require ICU >= 50)
   UCaseMap* ucasemap = NULL; // declared outside of the handler so that
                              // the error branch below can close it

   STRI__ERROR_HANDLER_BEGIN(1)
   StriUBreakIterator brkiter(brkiter_opts);

   UErrorCode status = U_ZERO_ERROR;
   ucasemap = ucasemap_open(brkiter.getLocale(), U_FOLD_CASE_DEFAULT, &status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

   status = U_ZERO_ERROR;
   ucasemap_setBreakIterator(ucasemap, brkiter.getIterator(), &status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
   // from now on the iterator is owned by ucasemap and
   // it will be released on ucasemap_close
   // (checked with ICU man & src code)
   brkiter.free(false);
   // ucasemap_setOptions(ucasemap, U_TITLECASE_NO_LOWERCASE, &status); // to do?

   R_len_t n = LENGTH(str);
   StriContainerUTF8 str_cont(str, n);
   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, n));

   // STEP 1.
   // Estimate the required buffer length
   // Notice: The resulting number of bytes may be larger or smaller than
   // the number before the conversion, hence a small margin is added
   // and the buffer is enlarged on demand in STEP 2
   String8buf buf(str_cont.getMaxNumBytes() + 10);

   // STEP 2.
   // Do title casing
   for (R_len_t i = str_cont.vectorize_init();
         i != str_cont.vectorize_end();
         i = str_cont.vectorize_next(i))
   {
      if (str_cont.isNA(i)) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      const char* src   = str_cont.get(i).c_str();
      R_len_t     src_n = str_cont.get(i).length();

      status = U_ZERO_ERROR;
      int out_n = ucasemap_utf8ToTitle(ucasemap, buf.data(), buf.size(),
               src, src_n, &status);

      if (U_FAILURE(status)) {
         // NOTE(review): the retry is triggered by any ICU failure, not
         // only by a buffer overflow; presumably a different error simply
         // resurfaces on the second call and is thrown below -- confirm
         buf.resize(out_n, false/*destroy contents*/);
         status = U_ZERO_ERROR;
         out_n = ucasemap_utf8ToTitle(ucasemap, buf.data(), buf.size(),
               src, src_n, &status);

         // this shouldn't happen:
         // we do have the buffer size required to complete this op
         STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
      }

      SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), out_n, CE_UTF8));
   }

   if (ucasemap) {
      ucasemap_close(ucasemap);
      ucasemap = NULL;
   }
   STRI__UNPROTECT_ALL
   return ret;

   STRI__ERROR_HANDLER_END({
      if (ucasemap) { ucasemap_close(ucasemap); ucasemap = NULL; }
   })
}