Ejemplo n.º 1
0
/**
 * Reverse Each String
 *
 * Each non-missing element is decoded from its end towards its start,
 * one code point at a time, and re-encoded front to back into a shared
 * scratch buffer. Missing elements stay missing. A StriException is
 * thrown as soon as a malformed UTF-8 sequence is met.
 *
 * @param str character vector
 * @return character vector with every string reversed
 *
 *
 * @version 0.1-?? (Bartek Tartanus)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          use StriContainerUTF16
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly + StriContainerUTF8 (bug fix, do reversing manually)
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-01)
 *          detect incorrect utf8 byte stream
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_reverse(SEXP str)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));

   STRI__ERROR_HANDLER_BEGIN(1)
   R_len_t n = LENGTH(str);
   StriContainerUTF8 str_cont(str, n);

   // Pass 1: the scratch buffer has to fit the longest non-missing string
   // (reversing code points does not change the byte count).
   R_len_t longest = 0;
   for (R_len_t i = 0; i < n; ++i) {
      if (!str_cont.isNA(i)) {
         R_len_t len_i = str_cont.get(i).length();
         if (longest < len_i)
            longest = len_i;
      }
   }

   // Pass 2: allocate the scratch buffer and the result, then reverse.
   String8buf buf(longest);
   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, n));

   for (R_len_t i = str_cont.vectorize_init();
         i != str_cont.vectorize_end();
         i = str_cont.vectorize_next(i))
   {
      if (str_cont.isNA(i)) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      const char* src = str_cont.get(i).c_str();
      R_len_t src_n = str_cont.get(i).length();

      R_len_t rd = src_n;   // read offset, walks from the end down to 0
      R_len_t wr = 0;       // write offset into the scratch buffer
      UBool failed = FALSE;

      while (!failed && rd > 0) {
         UChar32 cp;
         U8_PREV(src, 0, rd, cp); // step one code point backwards
         if (cp < 0) {
            throw StriException(MSG__INVALID_UTF8);
         }
         U8_APPEND((uint8_t*)buf.data(), wr, src_n, cp, failed);
      }

      if (failed)
         throw StriException(MSG__INTERNAL_ERROR);

      SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), src_n, CE_UTF8));
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
Ejemplo n.º 2
0
/*
 * Checks U8_APPEND_UNSAFE and U8_APPEND against fixed expected UTF-8 output.
 *
 * The same list of input values is pushed through both macros:
 *  - U8_APPEND_UNSAFE only receives the values in 0..0x10ffff (surrogates
 *    included) and its output is compared with expectUnsafe;
 *  - U8_APPEND receives every value; isError is expected to be set exactly
 *    for negative values, values above 0x10ffff and surrogates, and the
 *    bytes produced are compared with expectSafe.
 * Mismatches are reported through log_err().
 */
static void TestAppend() {
    /* inputs: valid code points, surrogates, and out-of-range/negative values */
    static const UChar32 codePoints[]={
        0x61, 0xdf, 0x901, 0x3040,
        0xac00, 0xd800, 0xdbff, 0xdcde,
        0xdffd, 0xe000, 0xffff, 0x10000,
        0x12345, 0xe0021, 0x10ffff, 0x110000,
        0x234567, 0x7fffffff, -1, -1000,
        0, 0x400
    };
    /* one row per row of codePoints[]; the surrogates are encoded here */
    static const uint8_t expectUnsafe[]={
        0x61,  0xc3, 0x9f,  0xe0, 0xa4, 0x81,  0xe3, 0x81, 0x80,
        0xea, 0xb0, 0x80,  0xed, 0xa0, 0x80,  0xed, 0xaf, 0xbf,  0xed, 0xb3, 0x9e,
        0xed, 0xbf, 0xbd,  0xee, 0x80, 0x80,  0xef, 0xbf, 0xbf,  0xf0, 0x90, 0x80, 0x80,
        0xf0, 0x92, 0x8d, 0x85,  0xf3, 0xa0, 0x80, 0xa1,  0xf4, 0x8f, 0xbf, 0xbf,  /* not 0x110000 */
        /* none from this line */
        0,  0xd0, 0x80
    }, expectSafe[]={
        0x61,  0xc3, 0x9f,  0xe0, 0xa4, 0x81,  0xe3, 0x81, 0x80,
        0xea, 0xb0, 0x80,  /* no surrogates */
        /* no surrogates */  0xee, 0x80, 0x80,  0xef, 0xbf, 0xbf,  0xf0, 0x90, 0x80, 0x80,
        0xf0, 0x92, 0x8d, 0x85,  0xf3, 0xa0, 0x80, 0xa1,  0xf4, 0x8f, 0xbf, 0xbf,  /* not 0x110000 */
        /* none from this line */
        0,  0xd0, 0x80
    };

    uint8_t buffer[100];
    UChar32 c;
    int32_t i, length;
    UBool isError, expectIsError, wrongIsError;

    /* part 1: U8_APPEND_UNSAFE, fed only with values in 0..0x10ffff */
    length=0;
    for(i=0; i<LENGTHOF(codePoints); ++i) {
        c=codePoints[i];
        if(c<0 || 0x10ffff<c) {
            continue; /* skip non-code points for U8_APPEND_UNSAFE */
        }

        U8_APPEND_UNSAFE(buffer, length, c);
    }
    if(length!=LENGTHOF(expectUnsafe) || 0!=memcmp(buffer, expectUnsafe, length)) {
        log_err("U8_APPEND_UNSAFE did not generate the expected output\n");
    }

    /* part 2: U8_APPEND, fed with every value; isError is reset before each call */
    length=0;
    wrongIsError=FALSE;
    for(i=0; i<LENGTHOF(codePoints); ++i) {
        c=codePoints[i];
        expectIsError= c<0 || 0x10ffff<c || U_IS_SURROGATE(c);
        isError=FALSE;

        U8_APPEND(buffer, length, LENGTHOF(buffer), c, isError);
        /* remember any disagreement; reported once after the loop */
        wrongIsError|= isError!=expectIsError;
    }
    if(wrongIsError) {
        log_err("U8_APPEND did not set isError correctly\n");
    }
    if(length!=LENGTHOF(expectSafe) || 0!=memcmp(buffer, expectSafe, length)) {
        log_err("U8_APPEND did not generate the expected output\n");
    }
}
Ejemplo n.º 3
0
/*
 * Append a full case mapping result to a UTF-8 destination,
 * see UCASE_MAX_STRING_LENGTH.
 *
 * result encodes what to append:
 *   result<0                            -> the single code point ~result
 *   0<=result<=UCASE_MAX_STRING_LENGTH  -> result UChars taken from s
 *   otherwise                           -> the single code point result
 *
 * While destIndex<destCapacity the output is written into dest; otherwise
 * only the length is counted (preflight). Either way the return value is
 * destIndex advanced by the UTF-8 length of the appended result, so it may
 * end up beyond destCapacity.
 */
static U_INLINE int32_t
appendResult(uint8_t *dest, int32_t destIndex, int32_t destCapacity,
             int32_t result, const UChar *s) {
    UChar32 c;
    int32_t length;

    /* decode the result */
    if(result>=0 && result<=UCASE_MAX_STRING_LENGTH) {
        /* string of that many UChars */
        c=U_SENTINEL;
        length=result;
    } else {
        /* single code point: bit-inverted when negative, literal otherwise */
        c= result<0 ? ~result : result;
        length=-1;
    }

    if(length<0) {
        /* code point */
        if(destIndex<destCapacity) {
            UBool isError=FALSE;
            U8_APPEND(dest, destIndex, destCapacity, c, isError);
            if(isError) {
                /* overflow, nothing written */
                destIndex+=U8_LENGTH(c);
            }
        } else {
            /* preflight */
            destIndex+=U8_LENGTH(c);
        }
    } else {
        /* string */
        int32_t destLength;
        UErrorCode errorCode=U_ZERO_ERROR;

        if(destIndex<destCapacity) {
            u_strToUTF8(
                (char *)(dest+destIndex), destCapacity-destIndex, &destLength,
                s, length,
                &errorCode);
            /* we might have an overflow, but we know the actual length */
        } else {
            /* preflight */
            u_strToUTF8(
                NULL, 0, &destLength,
                s, length,
                &errorCode);
        }
        destIndex+=destLength;
    }
    return destIndex;
}
/** Convert from UTF-32
 *
 * Every element of the input list is encoded, code point by code point,
 * into one UTF-8 string. A missing element yields NA. An element holding
 * a code point that U8_APPEND rejects, or the code point 0, triggers
 * a warning and yields NA as well.
 *
 * @param vec integer vector or list with integer vectors
 * @return character vector
 *
 * @version 0.1-?? (Marek Gagolewski)
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-25)
 *          StriException friendly;
 *          use StriContainerListInt
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_enc_fromutf32(SEXP vec)
{
   PROTECT(vec = stri_prepare_arg_list_integer(vec, "vec"));

   STRI__ERROR_HANDLER_BEGIN(1)
   StriContainerListInt vec_cont(vec);
   R_len_t vec_n = vec_cont.get_n();

   // one scratch buffer, sized for the longest non-missing element:
   // at most U8_MAX_LENGTH bytes per code point, plus one spare byte
   R_len_t longest = 0;
   for (R_len_t i = 0; i < vec_n; ++i) {
      if (vec_cont.isNA(i))
         continue;
      if (vec_cont.get(i).size() > longest)
         longest = vec_cont.get(i).size();
   }
   R_len_t bufsize = U8_MAX_LENGTH*longest+1;
   String8buf buf(bufsize);
   char* bufdata = buf.data();

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, vec_n));

   for (R_len_t i = 0; i < vec_n; ++i) {
      if (vec_cont.isNA(i)) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      const int* codes   = vec_cont.get(i).data();
      R_len_t    codes_n = vec_cont.get(i).size();

      UChar32 c = (UChar32)0;   // last code point seen, reported in the warning
      R_len_t nbytes = 0;
      UBool err = FALSE;
      for (R_len_t k = 0; k < codes_n && !err; ++k) {
         c = codes[k];
         U8_APPEND((uint8_t*)bufdata, nbytes, bufsize, c, err);

         // Rf_mkCharLenCE detects embedded nuls, but stops execution completely
         if (c == 0) err = TRUE;
      }

      if (!err) {
         SET_STRING_ELT(ret, i, Rf_mkCharLenCE(bufdata, nbytes, CE_UTF8));
      }
      else {
         Rf_warning(MSG__INVALID_CODE_POINT, (int)c);
         SET_STRING_ELT(ret, i, NA_STRING);
      }
   }

   STRI__UNPROTECT_ALL;
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
Ejemplo n.º 5
0
std::size_t copy_uchar_as_utf8_(char32_t const c, std::uint8_t (&buf)[4u])
{
  std::int32_t i = 0;
  constexpr std::int32_t capacity = sizeof(buf) / sizeof(buf[0u]);
  static_assert(capacity == 4u, "");
  UBool is_error = FALSE;
  U8_APPEND(buf, i, capacity, c, is_error);
  if (is_error != FALSE) {
    CRADLE_THROW_EXCEPTION((std::invalid_argument("")));
  }
  return i;
}
Ejemplo n.º 6
0
/** Convert from UTF-32 [single string, internal]
 *
 * Encodes ndata code points as UTF-8 into buf. The first code point that
 * cannot be stored - because it is rejected by U8_APPEND or because buf
 * has no room left for it - generates a warning and makes the function
 * return -1; the contents of buf are then unspecified.
 *
 * @param data  UTF-32 codes
 * @param ndata number of codes
 * @param buf [out] output buffer
 * @param bufsize buffer size
 * @return number of bytes written, or -1 on failure
 *
 * @version 0.1 (Marek Gagolewski)
 */
R_len_t stri__enc_fromutf32(int* data, R_len_t ndata, char* buf, R_len_t bufsize)
{
   R_len_t i = 0;
   UBool err = FALSE;
   for (R_len_t k = 0; k < ndata; ++k) {
      UChar32 c = data[k];

      // U8_APPEND requires i < capacity on entry and stores a single-byte
      // code point without looking at the capacity at all, so a full buffer
      // has to be caught here; it is reported like any other append failure.
      if (i < bufsize) {
         U8_APPEND((uint8_t*)buf, i, bufsize, c, err);
      }
      else {
         err = TRUE;
      }

      if (err) {
         Rf_warning(MSG__INVALID_CODE_POINT, (int)c);
         return -1;
      }
   }
   return i;
}
Ejemplo n.º 7
0
Archivo: utf.cpp Proyecto: MGKhKhD/meta
void utf8_append_codepoint(std::string& dest, UChar32 codepoint)
{
    std::array<uint8_t, U8_MAX_LENGTH> buf;
    int32_t len = 0;
    UBool err = FALSE;
    // ICU has some conversions within this macro, which we can't control
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wconversion"
    U8_APPEND(&buf[0], len, U8_MAX_LENGTH, codepoint, err);
#pragma GCC diagnostic pop
    if (err)
        throw std::runtime_error{"failed to add codepoint to string"};
    dest.append(reinterpret_cast<char*>(&buf[0]),
                static_cast<std::size_t>(len));
}
/**
 * SQL function taking (text, locale): asks android::GetPhonebookIndex for the
 * phonebook index of the text in the given locale and returns it as UTF-8 text.
 *
 * The result is SQL NULL when the argument count is not 2, when the text is
 * NULL or empty, when the locale is NULL, when GetPhonebookIndex flags an
 * error or yields nothing, or when the index cannot be encoded into the
 * output buffer.
 */
static void get_phonebook_index(
    sqlite3_context * context, int argc, sqlite3_value ** argv)
{
    if (argc != 2) {
      sqlite3_result_null(context);
      return;
    }

    char const * src = (char const *)sqlite3_value_text(argv[0]);
    char const * locale = (char const *)sqlite3_value_text(argv[1]);
    if (locale == NULL || src == NULL || src[0] == 0) {
      sqlite3_result_null(context);
      return;
    }

    UCharIterator iter;
    uiter_setUTF8(&iter, src, -1);

    // NOTE(review): sizeof(index) is a byte count, not a UChar count -
    // confirm which unit GetPhonebookIndex expects for its capacity argument.
    UChar index[SMALL_BUFFER_SIZE];
    UBool isError = FALSE;
    uint32_t len = android::GetPhonebookIndex(&iter, locale, index, sizeof(index), &isError);
    if (isError) {
      sqlite3_result_null(context);
      return;
    }

    // Re-encode as UTF-8, one UTF-16 code unit at a time.
    // NOTE(review): presumably the index never contains surrogate pairs;
    // a lone surrogate passed to U8_APPEND would be reported as an error.
    uint8_t out[SMALL_BUFFER_SIZE];
    uint32_t outlen = 0;
    uint32_t i = 0;
    while (i < len) {
      U8_APPEND(out, outlen, sizeof(out), index[i], isError);
      if (isError) {
        sqlite3_result_null(context);
        return;
      }
      ++i;
    }

    if (outlen != 0) {
      sqlite3_result_text(context, (const char*)out, outlen, SQLITE_TRANSIENT);
    } else {
      sqlite3_result_null(context);
    }
}
/** Convert character vector to UTF-8
 *
 * @param str character vector
 * @param is_unknown_8bit single logical value;
 * if TRUE, then in case of ENC_NATIVE or ENC_LATIN1, UTF-8
 * REPLACEMENT CHARACTERs (U+FFFD) are
 * put for codes > 127
 * @param validate single logical value (or NA);
 * FALSE skips the validation pass; NA replaces every string that holds
 * an invalid byte sequence with NA (with a warning); TRUE rewrites such
 * a string, putting U+FFFD in place of each invalid sequence
 * (with a warning per replacement)
 *
 * @return character vector
 *
 * @version 0.1-XX (Marek Gagolewski)
 *
 * @version 0.1-XX (Marek Gagolewski, 2013-06-16)
 *                  make StriException-friendly
 *
 * @version 0.2-1  (Marek Gagolewski, 2014-03-26)
 *                 Use one String8buf;
 *                 is_unknown_8bit_logical and UTF-8 tries now to remove BOMs
 *
 * @version 0.2-1  (Marek Gagolewski, 2014-03-30)
 *                 added validate arg
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_enc_toutf8(SEXP str, SEXP is_unknown_8bit, SEXP validate)
{
   // two objects are PROTECTed here, hence STRI__ERROR_HANDLER_BEGIN(2) below
   PROTECT(validate = stri_prepare_arg_logical_1(validate, "validate"));
   bool is_unknown_8bit_logical =
      stri__prepare_arg_logical_1_notNA(is_unknown_8bit, "is_unknown_8bit");
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   R_len_t n = LENGTH(str);

   STRI__ERROR_HANDLER_BEGIN(2)
   SEXP ret;
   if (!is_unknown_8bit_logical) {
      // Trivial - everything we need is in StriContainerUTF8 :)
      // which removes BOMs silently
      StriContainerUTF8 str_cont(str, n);
      STRI__PROTECT(ret = str_cont.toR());
   }
   else {
      // get buf size: the longest string that is neither NA nor
      // marked as ASCII/UTF-8, as only those get re-encoded below
      R_len_t bufsize = 0;
      for (R_len_t i=0; i<n; ++i) {
         SEXP curs = STRING_ELT(str, i);
         if (curs == NA_STRING || IS_ASCII(curs) || IS_UTF8(curs))
            continue;

         R_len_t ni = LENGTH(curs);
         if (ni > bufsize) bufsize = ni;
      }
      String8buf buf(bufsize*3); // either 1 byte < 127 or U+FFFD == 3 bytes UTF-8
      char* bufdata = buf.data();

      STRI__PROTECT(ret = Rf_allocVector(STRSXP, n));
      for (R_len_t i=0; i<n; ++i) {
         SEXP curs = STRING_ELT(str, i);
         if (curs == NA_STRING) {
            SET_STRING_ELT(ret, i, NA_STRING);
            continue;
         }

         // strings already marked as ASCII or UTF-8 are passed through,
         // minus a leading byte order mark, if there is one
         if (IS_ASCII(curs) || IS_UTF8(curs)) {
            R_len_t curs_n = LENGTH(curs);
            const char* curs_s = CHAR(curs);
            if (curs_n >= 3 &&
               (uint8_t)(curs_s[0]) == UTF8_BOM_BYTE1 &&
               (uint8_t)(curs_s[1]) == UTF8_BOM_BYTE2 &&
               (uint8_t)(curs_s[2]) == UTF8_BOM_BYTE3) {
               // has BOM - get rid of it
               SET_STRING_ELT(ret, i, Rf_mkCharLenCE(curs_s+3, curs_n-3, CE_UTF8));
            }
            else
               SET_STRING_ELT(ret, i, curs);

            continue;
         }

         // otherwise, we have an 8-bit encoding:
         // bytes accepted by U8_IS_SINGLE are copied as they are,
         // every other byte becomes the 3-byte replacement sequence
         R_len_t curn = LENGTH(curs);
         const char* curs_tab = CHAR(curs);
         R_len_t k = 0;
         for (R_len_t j=0; j<curn; ++j) {
            if (U8_IS_SINGLE(curs_tab[j]))
               bufdata[k++] = curs_tab[j];
            else { // 0xEF 0xBF 0xBD
               bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE1;
               bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE2;
               bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE3;
            }
         }
         SET_STRING_ELT(ret, i, Rf_mkCharLenCE(bufdata, k, CE_UTF8));
      }

   }

   // validate utf8 byte stream (operates on the strings stored in ret)
   if (LOGICAL(validate)[0] != FALSE) { // NA or TRUE
      R_len_t ret_n = LENGTH(ret);
      for (R_len_t i=0; i<ret_n; ++i) {
         SEXP curs = STRING_ELT(ret, i);
         if (curs == NA_STRING) continue;

         // scan until the end or until the first sequence
         // for which U8_NEXT yields a negative value
         const char* s = CHAR(curs);
         R_len_t sn = LENGTH(curs);
         R_len_t j = 0;
         UChar32 c = 0;
         while (c >= 0 && j < sn) {
            U8_NEXT(s, j, sn, c);
         }

         if (c >= 0) continue; // valid, nothing to do

         if (LOGICAL(validate)[0] == NA_LOGICAL) {
            // validate=NA: give up on this string
            Rf_warning(MSG__INVALID_CODE_POINT_REPLNA);
            SET_STRING_ELT(ret, i, NA_STRING);
         }
         else {
            // validate=TRUE: re-encode the string from its beginning,
            // substituting U+FFFD for each invalid sequence
            int bufsize = sn*3; // maximum: 1 byte -> U+FFFD (3 bytes)
            String8buf buf(bufsize); // maximum: 1 byte -> U+FFFD (3 bytes)
            char* bufdata = buf.data();

            j = 0;
            R_len_t k = 0;
            UBool err = FALSE;
            while (!err && j < sn) {
               U8_NEXT(s, j, sn, c);
               if (c >= 0) {
                  U8_APPEND((uint8_t*)bufdata, k, bufsize, c, err);
               } else {
                  // one warning is issued per replaced sequence
                  Rf_warning(MSG__INVALID_CODE_POINT_FIXING);
                  bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE1;
                  bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE2;
                  bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE3;
               }
            }

            if (err) throw StriException(MSG__INTERNAL_ERROR);
            SET_STRING_ELT(ret, i, Rf_mkCharLenCE(bufdata, k, CE_UTF8));
         }
      }
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
Ejemplo n.º 10
0
/** Generate random permutations of code points in each string
 *
 * Each non-missing string is decoded into code points, the code points
 * are permuted with a Fisher-Yates shuffle driven by unif_rand(), and the
 * outcome is encoded back to UTF-8. A string containing a malformed UTF-8
 * sequence triggers a warning and yields NA.
 *
 * @param str character vector
 * @return character vector
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-04)
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 1.2.5 (Marek Gagolewski, 2019-07-23)
 *    #319: Fixed overflow in `stri_rand_shuffle()`.
 */
SEXP stri_rand_shuffle(SEXP str)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   R_len_t n = LENGTH(str);

   GetRNGstate();
   STRI__ERROR_HANDLER_BEGIN(1)
   StriContainerUTF8 str_cont(str, n);

   // both scratch buffers are sized by the longest non-missing string (in bytes)
   R_len_t longest = 0;
   for (R_len_t i = 0; i < n; ++i) {
      if (!str_cont.isNA(i)) {
         R_len_t len_i = str_cont.get(i).length();
         if (longest < len_i) longest = len_i;
      }
   }
   std::vector<UChar32> codepoints(longest); // never more code points than bytes
   String8buf outbuf(longest);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, n));

   for (R_len_t i = 0; i < n; ++i) {

      if (str_cont.isNA(i)) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      // decode the string; stop at the first malformed sequence
      const char* src = str_cont.get(i).c_str();
      R_len_t src_n = str_cont.get(i).length();
      UChar32 cp = (UChar32)0;
      R_len_t pos = 0;
      R_len_t count = 0;
      while (pos < src_n && cp >= 0) {
         U8_NEXT(src, pos, src_n, cp);
         codepoints[count++] = (int)cp;
      }

      if (cp < 0) {
         Rf_warning(MSG__INVALID_UTF8);
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      // Fisher-Yates shuffle of codepoints[0..count-1]
      for (R_len_t lo = 0; lo < count-1; ++lo) {
         // random position from lo to count-1
         R_len_t pick = (R_len_t)floor(unif_rand()*(double)(count-lo)+(double)lo);
         UChar32 held = codepoints[pick];
         codepoints[pick] = codepoints[lo];
         codepoints[lo] = held;
      }

      // encode the permuted code points
      char* out = outbuf.data();
      R_len_t out_n = 0;
      UBool err = FALSE;
      for (R_len_t idx = 0; idx < count && !err; ++idx) {
         cp = codepoints[idx];
         U8_APPEND((uint8_t*)out, out_n, longest, cp, err);
      }

      if (err) throw StriException(MSG__INTERNAL_ERROR);

      SET_STRING_ELT(ret, i, Rf_mkCharLenCE(out, out_n, CE_UTF8));
   }

   PutRNGstate();
   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END({
      PutRNGstate();
   })
}
Ejemplo n.º 11
0
/** Generate random strings
 *
 * Produces n strings; the i-th one consists of length[i] code points, each
 * drawn with unif_rand() from the character class pattern[i] (both arguments
 * are recycled). A missing length or pattern yields NA; a negative n or
 * length is treated as 0. An empty `length` or `pattern` is an error.
 *
 * @param n single integer
 * @param length integer vector
 * @param pattern character vector
 * @return character vector
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-04)
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-04-05)
 *          Use StriContainerCharClass which now contains UnicodeSets;
 *          vectorized also over pattern
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_rand_strings(SEXP n, SEXP length, SEXP pattern)
{
   int n_val = stri__prepare_arg_integer_1_notNA(n, "n");
   PROTECT(length    = stri_prepare_arg_integer(length, "length"));
   PROTECT(pattern   = stri_prepare_arg_string(pattern, "pattern"));

   /* n_val is known not to be NA here; negative counts mean "none" */
   n_val = (n_val < 0) ? 0 : n_val;

   R_len_t length_len = LENGTH(length);
   if (length_len <= 0) {
      UNPROTECT(2);
      Rf_error(MSG__ARG_EXPECTED_NOT_EMPTY, "length");
   }
   else if (length_len > n_val || n_val % length_len != 0) {
      Rf_warning(MSG__WARN_RECYCLING_RULE2);
   }

   R_len_t pattern_len = LENGTH(pattern);
   if (pattern_len <= 0) {
      UNPROTECT(2);
      Rf_error(MSG__ARG_EXPECTED_NOT_EMPTY, "pattern");
   }
   else if (pattern_len > n_val || n_val % pattern_len != 0) {
      Rf_warning(MSG__WARN_RECYCLING_RULE2);
   }

   GetRNGstate();
   STRI__ERROR_HANDLER_BEGIN(2)

   StriContainerCharClass pattern_cont(pattern, max(n_val, pattern_len));
   StriContainerInteger   length_cont(length, max(n_val, length_len));

   // scratch buffer: the largest requested length,
   // times 4 (1 UChar32 -> max. 4 UTF-8 bytes)
   int*    length_tab = INTEGER(length);
   R_len_t longest = 0;
   for (R_len_t i = 0; i < length_len; ++i) {
      int len_i = length_tab[i];
      if (len_i == NA_INTEGER)
         continue;
      if (len_i > longest)
         longest = len_i;
   }
   R_len_t bufsize = longest * 4;
   String8buf buf(bufsize);
   char* bufdata = buf.data();

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, n_val));

   for (R_len_t i = 0; i < n_val; ++i) {
      if (length_cont.isNA(i) || pattern_cont.isNA(i)) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      int wanted = length_cont.get(i);
      if (wanted < 0) wanted = 0;

      const UnicodeSet& uset = pattern_cont.get(i);
      int32_t uset_size = uset.size();

      // draw `wanted` code points and encode them one after another
      R_len_t nbytes = 0;
      UBool err = FALSE;
      for (R_len_t k = 0; k < wanted; ++k) {
         int32_t idx = (int32_t)floor(unif_rand()*(double)uset_size); /* 0..uset_size-1 */
         UChar32 c = uset.charAt(idx);
         if (c < 0) throw StriException(MSG__INTERNAL_ERROR);

         U8_APPEND((uint8_t*)bufdata, nbytes, bufsize, c, err);
         if (err) throw StriException(MSG__INTERNAL_ERROR);
      }
      SET_STRING_ELT(ret, i, Rf_mkCharLenCE(bufdata, nbytes, CE_UTF8));
   }

   PutRNGstate();
   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END({
      PutRNGstate();
   })
}