Exemplo n.º 1
0
/* keep this in sync with utf16tst.c's TestNulTerminated() */
static void TestNulTerminated() {
    static const uint8_t input[]={
        /*  0 */  0x61,
        /*  1 */  0xf0, 0x90, 0x90, 0x81,
        /*  5 */  0xc0, 0x80,
        /*  7 */  0xdf, 0x80,
        /*  9 */  0xc2,
        /* 10 */  0x62,
        /* 11 */  0xfd, 0xbe,
        /* 13 */  0xe0, 0xa0, 0x80,
        /* 16 */  0xe2, 0x82, 0xac,
        /* 19 */  0xf0, 0x90, 0x90,
        /* 22 */  0x00
        /* 23 */
    };
    static const UChar32 result[]={
        0x61,
        0x10401,
        U_SENTINEL,
        0x7c0,
        U_SENTINEL,
        0x62,
        U_SENTINEL,
        0x800,
        0x20ac,
        U_SENTINEL,
        0
    };

    UChar32 c, c2, expected;
    int32_t i0, i=0, j, k, expectedIndex;
    int32_t cpIndex=0;
    do {
        i0=i;
        U8_NEXT(input, i, -1, c);
        expected=result[cpIndex];
        if(c!=expected) {
            log_err("U8_NEXT(from %d)=U+%04x != U+%04x\n", i0, c, expected);
        }
        j=i0;
        U8_NEXT_OR_FFFD(input, j, -1, c);
        if(expected<0) { expected=0xfffd; }
        if(c!=expected) {
            log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x\n", i0, c, expected);
        }
        if(j!=i) {
            log_err("U8_NEXT_OR_FFFD() moved to index %d but U8_NEXT() moved to %d\n", j, i);
        }
        j=i0;
        U8_FWD_1(input, j, -1);
        if(j!=i) {
            log_err("U8_FWD_1() moved to index %d but U8_NEXT() moved to %d\n", j, i);
        }
        ++cpIndex;
        /*
         * Move by this many code points from the start.
         * U8_FWD_N() stops at the end of the string, that is, at the NUL if necessary.
         */
        expectedIndex= (c==0) ? i-1 : i;
        k=0;
        U8_FWD_N(input, k, -1, cpIndex);
        if(k!=expectedIndex) {
            log_err("U8_FWD_N(code points from 0) moved to index %d but expected %d\n", k, expectedIndex);
        }
    } while(c!=0);

    i=0;
    do {
        j=i0=i;
        U8_NEXT(input, i, -1, c);
        do {
            U8_GET(input, 0, j, -1, c2);
            if(c2!=c) {
                log_err("U8_NEXT(from %d)=U+%04x != U+%04x=U8_GET(at %d)\n", i0, c, c2, j);
            }
            U8_GET_OR_FFFD(input, 0, j, -1, c2);
            expected= (c>=0) ? c : 0xfffd;
            if(c2!=expected) {
                log_err("U8_NEXT_OR_FFFD(from %d)=U+%04x != U+%04x=U8_GET_OR_FFFD(at %d)\n", i0, expected, c2, j);
            }
            /* U8_SET_CP_LIMIT moves from a non-lead byte to the limit of the code point */
            k=j+1;
            U8_SET_CP_LIMIT(input, 0, k, -1);
            if(k!=i) {
                log_err("U8_NEXT() moved to %d but U8_SET_CP_LIMIT(%d) moved to %d\n", i, j+1, k);
            }
        } while(++j<i);
    } while(c!=0);
}
Exemplo n.º 2
0
static void TestGetChar()
{
    static const uint8_t input[]={
    /*  code unit,*/
        0x61,
        0x7f,
        0xe4,
        0xba, 
        0x8c,
        0xF0, 
        0x90, 
        0x90, 
        0x81,
        0xc0,
        0x65,
        0x31,
        0x9a,
        0xc9
    };
    static const UChar32 result[]={
     /*codepoint-unsafe,  codepoint-safe(not strict)  codepoint-safe(strict)*/
        0x61,             0x61,                       0x61, 
        0x7f,             0x7f,                       0x7f, 
        0x4e8c,           0x4e8c,                     0x4e8c,
        0x4e8c,           0x4e8c,                     0x4e8c ,
        0x4e8c,           0x4e8c,                     0x4e8c,
        0x10401,          0x10401,                    0x10401 ,
        0x10401,          0x10401,                    0x10401 ,
        0x10401,          0x10401,                    0x10401 ,
        0x10401,          0x10401,                    0x10401,
        0x25,             UTF8_ERROR_VALUE_1,         UTF8_ERROR_VALUE_1,
        0x65,             0x65,                       0x65,  
        0x31,             0x31,                       0x31,  
        0x31,             UTF8_ERROR_VALUE_1,         UTF8_ERROR_VALUE_1,
        0x240,            UTF8_ERROR_VALUE_1,         UTF8_ERROR_VALUE_1
    };
    uint16_t i=0;
    UChar32 c;
    uint32_t offset=0;

    for(offset=0; offset<sizeof(input); offset++) {
        if (offset < sizeof(input) - 1) {
            UTF8_GET_CHAR_UNSAFE(input, offset, c);
            if(c != result[i]){
                log_err("ERROR: UTF8_GET_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
           
            }

            U8_GET_UNSAFE(input, offset, c);
            if(c != result[i]){
                log_err("ERROR: U8_GET_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
           
            }
        }

        U8_GET(input, 0, offset, sizeof(input), c);
        if(UTF_IS_ERROR(result[i+1]) ? c >= 0 : c != result[i+1]){
            log_err("ERROR: UTF8_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
        }

        UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, FALSE);
        if(c != result[i+1]){
            log_err("ERROR: UTF8_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
        }

        UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, TRUE);
        if(c != result[i+2]){
            log_err("ERROR: UTF8_GET_CHAR_SAFE(strict) failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+2], c);
        }
         
         i=(uint16_t)(i+3);
    }
}
Exemplo n.º 3
0
/** Word wrap text
 *
 * @param str character vector
 * @param width single integer
 * @param cost_exponent single double
 * @param indent single integer
 * @param exdent single integer
 * @param prefix single string
 * @param initial single string
 * @param locale locale identifier or NULL for default locale
 * @param use_length single logical value
 *
 * @return list
 *
 * @version 0.1-?? (Bartek Tartanus)
 *
 * @version 0.2-2 (Marek Gagolewski, 2014-04-27)
 *          single function for wrap_greedy and wrap_dynamic
 *          (dispatch inside);
 *          use BreakIterator
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-06)
 *    new args: indent, exdent, prefix, initial
 *
 * @version 0.5-1 (Marek Gagolewski, 2014-12-19)
 *    #133 allow width <= 0
 *
 * @version 0.5-1 (Marek Gagolewski, 2015-02-28)
 *    don't trim so many white spaces at the end of each word (normalize arg does that)
 *    #139: allow a "whitespace" break iterator
 *
 * @version 0.5-1 (Marek Gagolewski, 2015-04-23)
 *    `use_length` arg added
 *
 *
 * @version 0.5-1 (Marek Gagolewski, 2015-06-09)
 *    BIGSKIP: no more CHARSXP on out on "" input
 */
SEXP stri_wrap(SEXP str, SEXP width, SEXP cost_exponent,
   SEXP indent, SEXP exdent, SEXP prefix, SEXP initial, SEXP whitespace_only,
   SEXP use_length, SEXP locale)
{
   bool use_length_val      = stri__prepare_arg_logical_1_notNA(use_length, "use_length");
   double exponent_val      = stri__prepare_arg_double_1_notNA(cost_exponent, "cost_exponent");
   bool whitespace_only_val = stri__prepare_arg_logical_1_notNA(whitespace_only, "whitespace_only");

   int width_val = stri__prepare_arg_integer_1_notNA(width, "width");
   if (width_val <= 0) width_val = 0;

   int indent_val = stri__prepare_arg_integer_1_notNA(indent, "indent");
   if (indent_val < 0) Rf_error(MSG__EXPECTED_POSITIVE, "indent");

   int exdent_val = stri__prepare_arg_integer_1_notNA(exdent, "exdent");
   if (exdent_val < 0) Rf_error(MSG__EXPECTED_POSITIVE, "exdent");


   const char* qloc = stri__prepare_arg_locale(locale, "locale", true); /* this is R_alloc'ed */
   Locale loc = Locale::createFromName(qloc);
   PROTECT(str     = stri_prepare_arg_string(str, "str"));
   PROTECT(prefix  = stri_prepare_arg_string_1(prefix, "prefix"));
   PROTECT(initial = stri_prepare_arg_string_1(initial, "initial"));

   BreakIterator* briter = NULL;
   UText* str_text = NULL;

   STRI__ERROR_HANDLER_BEGIN(3)
   UErrorCode status = U_ZERO_ERROR;
   briter = BreakIterator::createLineInstance(loc, status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

   R_len_t str_length = LENGTH(str);
   StriContainerUTF8_indexable str_cont(str, str_length);
   StriContainerUTF8 prefix_cont(prefix, 1);
   StriContainerUTF8 initial_cont(initial, 1);


   // prepare indent/exdent/prefix/initial stuff:
   // 1st line, 1st para (i==0, u==0): initial+indent
   // nth line, 1st para (i==0, u> 0): prefix +exdent
   // 1st line, nth para (i> 0, u==0): prefix +indent
   // nth line, nth para (i> 0, u> 0): prefix +exdent
   StriWrapLineStart ii(initial_cont.get(0), indent_val);
   StriWrapLineStart pi(prefix_cont.get(0), indent_val);
   StriWrapLineStart pe(prefix_cont.get(0), exdent_val);


   status = U_ZERO_ERROR;
   //Unicode Newline Guidelines - Unicode Technical Report #13
   UnicodeSet uset_linebreaks(UnicodeString::fromUTF8("[\\u000A-\\u000D\\u0085\\u2028\\u2029]"), status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
   uset_linebreaks.freeze();

   status = U_ZERO_ERROR;
   UnicodeSet uset_whitespaces(UnicodeString::fromUTF8("\\p{White_space}"), status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
   uset_whitespaces.freeze();

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(VECSXP, str_length));
   for (R_len_t i = 0; i < str_length; ++i)
   {
      if (str_cont.isNA(i) || prefix_cont.isNA(0) || initial_cont.isNA(0)) {
         SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1));
         continue;
      }

      status = U_ZERO_ERROR;
      const char* str_cur_s = str_cont.get(i).c_str();
      R_len_t str_cur_n = str_cont.get(i).length();
      str_text = utext_openUTF8(str_text, str_cur_s, str_cont.get(i).length(), &status);
      STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

      status = U_ZERO_ERROR;
      briter->setText(str_text, status);
      STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

      // all right, first let's generate a list of places at which we may do line breaks
      deque< R_len_t > occurrences_list; // this could be an R_len_t queue
      R_len_t match = briter->first();
      while (match != BreakIterator::DONE) {

         if (!whitespace_only_val)
            occurrences_list.push_back(match);
         else {
            if (match > 0 && match < str_cur_n) {
               UChar32 c;
               U8_GET((const uint8_t*)str_cur_s, 0, match-1, str_cur_n, c);
               if (uset_whitespaces.contains(c))
                  occurrences_list.push_back(match);
            }
            else
               occurrences_list.push_back(match);
         }

         match = briter->next();
      }

      R_len_t noccurrences = (R_len_t)occurrences_list.size(); // number of boundaries
      if (noccurrences <= 1) { // no match (1 boundary == 0)
         SET_VECTOR_ELT(ret, i, Rf_ScalarString(str_cont.toR(i)));
         continue;
      }

      // the number of "words" is:
      R_len_t nwords = noccurrences - 1;

      // convert occurrences_list to a vector
      // in order to obtain end positions (in a string) of each "words",
      // noting that occurrences_list.at(0) == 0
#ifndef NDEBUG
      if (occurrences_list.at(0) != 0)
         throw StriException("NDEBUG: stri_wrap: (occurrences_list.at(0) != 0)");
#endif

      std::vector<R_len_t> end_pos_orig(nwords);
      deque<R_len_t>::iterator iter = ++(occurrences_list.begin());
      for (R_len_t j = 0; iter != occurrences_list.end(); ++iter, ++j) {
         end_pos_orig[j] = (*iter); // this is a UTF-8 index
      }


      // now:
      // we'll get the total widths/number of code points in each "word"
      std::vector<R_len_t> widths_orig(nwords);
      // we'll get the total widths/number of code points without trailing whitespaces
      std::vector<R_len_t> widths_trim(nwords);
      // we'll get the end positions without trailing whitespaces
      std::vector<R_len_t> end_pos_trim(nwords);
      // detect line endings (fail on a match)

      UChar32 c = 0;
      R_len_t j = 0;
      R_len_t cur_block = 0;
      R_len_t cur_width_orig = 0;
      R_len_t cur_width_trim = 0;
      R_len_t cur_count_orig = 0;
      R_len_t cur_count_trim = 0;
      R_len_t cur_end_pos_trim = 0;
      while (j < str_cur_n) {
         R_len_t jlast = j;
         U8_NEXT(str_cur_s, j, str_cur_n, c);
         if (c < 0) // invalid utf-8 sequence
            throw StriException(MSG__INVALID_UTF8);

         if (uset_linebreaks.contains(c))
            throw StriException(MSG__NEWLINE_FOUND);

         cur_width_orig += stri__width_char(c);
         ++cur_count_orig;
         if (uset_whitespaces.contains(c)) {
// OLD: trim all white spaces from the end:
//            ++cur_count_trim;
//           [we have the normalize arg for that]

// NEW: trim just one white space at the end:
            cur_width_trim = stri__width_char(c);
            cur_count_trim = 1;
            cur_end_pos_trim = jlast;
         }
         else {
            cur_width_trim = 0;
            cur_count_trim = 0;
            cur_end_pos_trim = j;
         }

         if (j >= str_cur_n || end_pos_orig[cur_block] <= j) {
            // we'll start a new block in a moment
            if (use_length_val) {
               widths_orig[cur_block] = cur_count_orig;
               widths_trim[cur_block] = cur_count_orig-cur_count_trim;
            }
            else {
               widths_orig[cur_block] = cur_width_orig;
               widths_trim[cur_block] = cur_width_orig-cur_width_trim;
            }
            end_pos_trim[cur_block] = cur_end_pos_trim;
            cur_block++;
            cur_width_orig = 0;
            cur_width_trim = 0;
            cur_count_orig = 0;
            cur_count_trim = 0;
            cur_end_pos_trim = j;
         }
      }

      // do wrap
      std::deque<R_len_t> wrap_after; // wrap line after which word in {0..nwords-1}?
      if (exponent_val <= 0.0) {
         stri__wrap_greedy(wrap_after, nwords, width_val,
            widths_orig, widths_trim,
               (use_length_val)?((i==0)?ii.count:pi.count):((i==0)?ii.width:pi.width),
               (use_length_val)?pe.count:pe.width);
      }
      else {
         stri__wrap_dynamic(wrap_after, nwords, width_val, exponent_val,
            widths_orig, widths_trim,
               (use_length_val)?((i==0)?ii.count:pi.count):((i==0)?ii.width:pi.width),
               (use_length_val)?pe.count:pe.width);
      }

      // wrap_after.size() line breaks => wrap_after.size()+1 lines
      R_len_t nlines = (R_len_t)wrap_after.size()+1;
      R_len_t last_pos = 0;
      SEXP ans;
      STRI__PROTECT(ans = Rf_allocVector(STRSXP, nlines));
      deque<R_len_t>::iterator iter_wrap = wrap_after.begin();
      for (R_len_t u = 0; iter_wrap != wrap_after.end(); ++iter_wrap, ++u) {
         R_len_t wrap_after_cur = *iter_wrap;
         R_len_t cur_pos = end_pos_trim[wrap_after_cur];

         std::string cs;
         if (i == 0 && u == 0)     cs = ii.str;
         else if (i > 0 && u == 0) cs = pi.str;
         else                      cs = pe.str;
         cs.append(str_cur_s+last_pos, cur_pos-last_pos);
         SET_STRING_ELT(ans, u, Rf_mkCharLenCE(cs.c_str(), cs.size(), CE_UTF8));

         last_pos = end_pos_orig[wrap_after_cur];
      }

      // last line goes here:
      std::string cs;
      if (i == 0 && nlines-1 == 0)     cs = ii.str;
      else if (i > 0 && nlines-1 == 0) cs = pi.str;
      else                             cs = pe.str;
      cs.append(str_cur_s+last_pos, end_pos_trim[nwords-1]-last_pos);
      SET_STRING_ELT(ans, nlines-1, Rf_mkCharLenCE(cs.c_str(), cs.size(), CE_UTF8));

      SET_VECTOR_ELT(ret, i, ans);
      STRI__UNPROTECT(1);
   }

   if (briter) { delete briter; briter = NULL; }
   if (str_text) { utext_close(str_text); str_text = NULL; }
   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END({
      if (briter) { delete briter; briter = NULL; }
      if (str_text) { utext_close(str_text); str_text = NULL; }
   })
}