Esempio n. 1
0
/** internal function - replace multiple substrings in a single string
 * can raise Rf_error
 *
 *  @version 1.3.2 (Marek Gagolewski, 2019-02-23)
 *
 * @version 1.4.3 (Marek Gagolewski, 2019-03-12)
 *    #346: na_omit for `value`
 */
SEXP stri__sub_replacement_all_single(SEXP curs,
    SEXP from, SEXP to, SEXP length, bool omit_na_1, SEXP value)
{
    // curs is a CHARSXP in UTF-8

    PROTECT(value = stri_enc_toutf8(value,
        Rf_ScalarLogical(FALSE), Rf_ScalarLogical(FALSE)));
    R_len_t value_len     = LENGTH(value);

    R_len_t from_len      = 0; // see below
    R_len_t to_len        = 0; // see below
    R_len_t length_len    = 0; // see below
    int* from_tab         = 0; // see below
    int* to_tab           = 0; // see below
    int* length_tab       = 0; // see below

    R_len_t sub_protected = 1+ /* how many objects to PROTECT on ret? */
        stri__sub_prepare_from_to_length(from, to, length,
        from_len, to_len, length_len, from_tab, to_tab, length_tab);

    R_len_t vectorize_len = stri__recycling_rule(true, 2, // does not care about value_len
      from_len, (to_len>length_len)?to_len:length_len);

    if (vectorize_len <= 0) { // "nothing" is being replaced -> return the input as-is
        UNPROTECT(sub_protected);
        return curs;
    }
    if (value_len <= 0) { // things are supposed to be replaced with "nothing"...
        UNPROTECT(sub_protected);
        Rf_warning(MSG__REPLACEMENT_ZERO);
        return NA_STRING;
    }

    if (vectorize_len % value_len != 0)
        Rf_warning(MSG__WARN_RECYCLING_RULE2);


    const char* curs_s = CHAR(curs); // already in UTF-8
    R_len_t curs_n = LENGTH(curs);

    // first check for NAs....
    if (!omit_na_1) {
       for (R_len_t i=0; i<vectorize_len; ++i) {
           R_len_t cur_from     = from_tab[i % from_len];
           R_len_t cur_to       = (to_tab)?to_tab[i % to_len]:length_tab[i % length_len];
           if (cur_from == NA_INTEGER || cur_to == NA_INTEGER) {
               UNPROTECT(sub_protected);
               if (omit_na_1) return curs;
               else return NA_STRING;
           }
       }

       for (R_len_t i=0; i<vectorize_len; ++i) {
           if (STRING_ELT(value, i%value_len) == NA_STRING) {
               UNPROTECT(sub_protected);
               return NA_STRING;
           }
       }
    }

    // get the number of code points in curs, if required (for negative indexes)
    R_len_t curs_m = -1;
    if (IS_ASCII(curs)) curs_m = curs_n;
    else { // is UTF-8
        curs_m = 0;    // code points count
        R_len_t j = 0; // byte pos
        while (j < curs_n) {
            U8_FWD_1_UNSAFE(curs_s, j);
            ++curs_m;
        }
    }

    STRI__ERROR_HANDLER_BEGIN(sub_protected)
    std::vector<char> buf; // convenience >> speed

    R_len_t buf_size;
    R_len_t last_pos = 0;
    R_len_t byte_pos = 0, byte_pos_last;
    for (R_len_t i=0; i<vectorize_len; ++i) {
        R_len_t cur_from     = from_tab[i % from_len];
        R_len_t cur_to       = (to_tab)?to_tab[i % to_len]:length_tab[i % length_len];

        if (cur_from == NA_INTEGER || cur_to == NA_INTEGER || STRING_ELT(value, i%value_len) == NA_STRING) {
           continue;
        }

        if (cur_from < 0) cur_from = curs_m+cur_from+1;
        if (cur_from <= 0) cur_from = 1;
        cur_from--; // 1-based -> 0-based index
        if (cur_from >= curs_m) cur_from = curs_m;

        // cur_from is in [0, curs_m]

        if (length_tab) {
            if (cur_to < 0) cur_to = 0;
            cur_to = cur_from+cur_to;
        }
        else {
            if (cur_to < 0)  cur_to = curs_m+cur_to+1;
            if (cur_to < cur_from) cur_to = cur_from; // insertion
        }
        if (cur_to >= curs_m) cur_to = curs_m;

        // the chunk to replace is at code points [cur_from, cur_to)

        // Rprintf("orig [%d,%d) repl [%d,%d)\n", last_pos, cur_from, cur_from, cur_to);

        if (last_pos > cur_from)
            throw StriException(MSG__OVERLAPPING_OR_UNSORTED_INDEXES);

        // first, copy [last_pos, cur_from)
        byte_pos_last = byte_pos;
        while (last_pos < cur_from) {
            U8_FWD_1_UNSAFE(curs_s, byte_pos);
            ++last_pos;
        }
        buf_size = buf.size();
        buf.resize(buf_size+byte_pos-byte_pos_last);
        memcpy(buf.data()+buf_size, curs_s+byte_pos_last, byte_pos-byte_pos_last);

        // then, copy the corresponding replacement string
        SEXP value_cur = STRING_ELT(value, i%value_len);
        const char* value_s = CHAR(value_cur);
        R_len_t value_n = LENGTH(value_cur);
        buf_size = buf.size();
        buf.resize(buf_size+value_n);
        memcpy(buf.data()+buf_size, value_s, value_n);



        // lastly, update last_pos
        // ---> last_pos = cur_to;
        while (last_pos < cur_to) {
            U8_FWD_1_UNSAFE(curs_s, byte_pos);
            ++last_pos;
        }
      }

      // finally, copy [last_pos, curs_m)
    // Rprintf("orig [%d,%d)\n", last_pos, curs_m);
    buf_size = buf.size();
    buf.resize(buf_size+curs_n-byte_pos);
    memcpy(buf.data()+buf_size, curs_s+byte_pos, curs_n-byte_pos);

    SEXP ret;
    STRI__PROTECT(ret = Rf_mkCharLenCE(buf.data(), buf.size(), CE_UTF8));
    STRI__UNPROTECT_ALL
    return ret;
    STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
Esempio n. 2
0
/*
** Compare two UTF-8 strings for equality where the first string is
** a "LIKE" expression. Return true (1) if they are the same and 
** false (0) if they are different.
*/
static int icuLikeCompare(
  const uint8_t *zPattern,   /* LIKE pattern */
  const uint8_t *zString,    /* The UTF-8 string to compare against */
  const UChar32 uEsc         /* The escape character */
){
  static const int MATCH_ONE = (UChar32)'_';
  static const int MATCH_ALL = (UChar32)'%';

  int iPattern = 0;       /* Current byte index in zPattern */
  int iString = 0;        /* Current byte index in zString */

  int prevEscape = 0;     /* True if the previous character was uEsc */

  while( zPattern[iPattern]!=0 ){

    /* Read (and consume) the next character from the input pattern. */
    UChar32 uPattern;
    U8_NEXT_UNSAFE(zPattern, iPattern, uPattern);
    assert(uPattern!=0);

    /* There are now 4 possibilities:
    **
    **     1. uPattern is an unescaped match-all character "%",
    **     2. uPattern is an unescaped match-one character "_",
    **     3. uPattern is an unescaped escape character, or
    **     4. uPattern is to be handled as an ordinary character
    */
    if( !prevEscape && uPattern==MATCH_ALL ){
      /* Case 1. */
      uint8_t c;

      /* Skip any MATCH_ALL or MATCH_ONE characters that follow a
      ** MATCH_ALL. For each MATCH_ONE, skip one character in the 
      ** test string.
      */
      while( (c=zPattern[iPattern]) == MATCH_ALL || c == MATCH_ONE ){
        if( c==MATCH_ONE ){
          if( zString[iString]==0 ) return 0;
          U8_FWD_1_UNSAFE(zString, iString);
        }
        iPattern++;
      }

      if( zPattern[iPattern]==0 ) return 1;

      while( zString[iString] ){
        if( icuLikeCompare(&zPattern[iPattern], &zString[iString], uEsc) ){
          return 1;
        }
        U8_FWD_1_UNSAFE(zString, iString);
      }
      return 0;

    }else if( !prevEscape && uPattern==MATCH_ONE ){
      /* Case 2. */
      if( zString[iString]==0 ) return 0;
      U8_FWD_1_UNSAFE(zString, iString);

    }else if( !prevEscape && uPattern==uEsc){
      /* Case 3. */
      prevEscape = 1;

    }else{
      /* Case 4. */
      UChar32 uString;
      U8_NEXT_UNSAFE(zString, iString, uString);
      uString = u_foldCase(uString, U_FOLD_CASE_DEFAULT);
      uPattern = u_foldCase(uPattern, U_FOLD_CASE_DEFAULT);
      if( uString!=uPattern ){
        return 0;
      }
      prevEscape = 0;
    }
  }

  return zString[iString]==0;
}
Esempio n. 3
0
static void TestFwdBack(){ 
    static const uint8_t input[]={0x61, 0xF0, 0x90, 0x90, 0x81, 0xff, 0x62, 0xc0, 0x80, 0x7f, 0x8f, 0xc0, 0x63, 0x81, 0x90, 0x90, 0xF0, 0x00};
    static const uint16_t fwd_unsafe[] ={1, 5, 6, 7,  9, 10, 11, 13, 14, 15, 16,  20, };
    static const uint16_t fwd_safe[]   ={1, 5, 6, 7, 9, 10, 11,  12, 13, 14, 15, 16, 17, 18};
    static const uint16_t back_unsafe[]={17, 16, 12, 11, 9, 7, 6, 5, 1, 0};
    static const uint16_t back_safe[]  ={17, 16, 15, 14, 13, 12, 11, 10, 9, 7, 6, 5, 1, 0};

    static const uint16_t Nvalue[]= {0, 1, 2, 3, 1, 2, 1, 5};
    static const uint16_t fwd_N_unsafe[] ={0, 1, 6, 10, 11, 14, 15};
    static const uint16_t fwd_N_safe[]   ={0, 1, 6, 10, 11, 13, 14, 18}; /*safe macro keeps it at the end of the string */
    static const uint16_t back_N_unsafe[]={18, 17, 12, 7, 6, 1, 0};
    static const uint16_t back_N_safe[]  ={18, 17, 15, 12, 11, 9, 7, 0};   


    uint32_t offunsafe=0, offsafe=0;

    uint32_t i=0;
    while(offunsafe < sizeof(input)){
        UTF8_FWD_1_UNSAFE(input, offunsafe);
        if(offunsafe != fwd_unsafe[i]){
            log_err("ERROR: Forward_unsafe offset expected:%d, Got:%d\n", fwd_unsafe[i], offunsafe);
        }
        i++;
    }

    i=0;
    while(offunsafe < sizeof(input)){
        U8_FWD_1_UNSAFE(input, offunsafe);
        if(offunsafe != fwd_unsafe[i]){
            log_err("ERROR: U8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", fwd_unsafe[i], offunsafe);
        }
        i++;
    }

    i=0;
    while(offsafe < sizeof(input)){
        UTF8_FWD_1_SAFE(input, offsafe, sizeof(input));
        if(offsafe != fwd_safe[i]){
            log_err("ERROR: Forward_safe offset expected:%d, Got:%d\n", fwd_safe[i], offsafe);
        }
        i++;
    }

    i=0;
    while(offsafe < sizeof(input)){
        U8_FWD_1(input, offsafe, sizeof(input));
        if(offsafe != fwd_safe[i]){
            log_err("ERROR: U8_FWD_1 offset expected:%d, Got:%d\n", fwd_safe[i], offsafe);
        }
        i++;
    }

    offunsafe=sizeof(input);
    i=0;
    while(offunsafe > 0){
        UTF8_BACK_1_UNSAFE(input, offunsafe);
        if(offunsafe != back_unsafe[i]){
            log_err("ERROR: Backward_unsafe offset expected:%d, Got:%d\n", back_unsafe[i], offunsafe);
        }
        i++;
    }

    offunsafe=sizeof(input);
    i=0;
    while(offunsafe > 0){
        U8_BACK_1_UNSAFE(input, offunsafe);
        if(offunsafe != back_unsafe[i]){
            log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", back_unsafe[i], offunsafe);
        }
        i++;
    }

    i=0;
    offsafe=sizeof(input);
    while(offsafe > 0){
        UTF8_BACK_1_SAFE(input, 0,  offsafe);
        if(offsafe != back_safe[i]){
            log_err("ERROR: Backward_safe offset expected:%d, Got:%d\n", back_unsafe[i], offsafe);
        }
        i++;
    }

    i=0;
    offsafe=sizeof(input);
    while(offsafe > 0){
        U8_BACK_1(input, 0,  offsafe);
        if(offsafe != back_safe[i]){
            log_err("ERROR: U8_BACK_1 offset expected:%d, Got:%d\n", back_unsafe[i], offsafe);
        }
        i++;
    }

    offunsafe=0;
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){  
        UTF8_FWD_N_UNSAFE(input, offunsafe, Nvalue[i]);
        if(offunsafe != fwd_N_unsafe[i]){
            log_err("ERROR: Forward_N_unsafe offset=%d expected:%d, Got:%d\n", i, fwd_N_unsafe[i], offunsafe);
        }
    }

    offunsafe=0;
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){  
        U8_FWD_N_UNSAFE(input, offunsafe, Nvalue[i]);
        if(offunsafe != fwd_N_unsafe[i]){
            log_err("ERROR: U8_FWD_N_UNSAFE offset=%d expected:%d, Got:%d\n", i, fwd_N_unsafe[i], offunsafe);
        }
    }

    offsafe=0;
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
        UTF8_FWD_N_SAFE(input, offsafe, sizeof(input), Nvalue[i]);
        if(offsafe != fwd_N_safe[i]){
            log_err("ERROR: Forward_N_safe offset=%d expected:%d, Got:%d\n", i, fwd_N_safe[i], offsafe);
        }
    
    }

    offsafe=0;
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
        U8_FWD_N(input, offsafe, sizeof(input), Nvalue[i]);
        if(offsafe != fwd_N_safe[i]){
            log_err("ERROR: U8_FWD_N offset=%d expected:%d, Got:%d\n", i, fwd_N_safe[i], offsafe);
        }
    
    }

    offunsafe=sizeof(input);
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){
        UTF8_BACK_N_UNSAFE(input, offunsafe, Nvalue[i]);
        if(offunsafe != back_N_unsafe[i]){
            log_err("ERROR: backward_N_unsafe offset=%d expected:%d, Got:%d\n", i, back_N_unsafe[i], offunsafe);
        }
    }

    offunsafe=sizeof(input);
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){
        U8_BACK_N_UNSAFE(input, offunsafe, Nvalue[i]);
        if(offunsafe != back_N_unsafe[i]){
            log_err("ERROR: U8_BACK_N_UNSAFE offset=%d expected:%d, Got:%d\n", i, back_N_unsafe[i], offunsafe);
        }
    }

    offsafe=sizeof(input);
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
        UTF8_BACK_N_SAFE(input, 0, offsafe, Nvalue[i]);
        if(offsafe != back_N_safe[i]){
            log_err("ERROR: backward_N_safe offset=%d expected:%d, Got:%ld\n", i, back_N_safe[i], offsafe);
        }
    }

    offsafe=sizeof(input);
    for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
        U8_BACK_N(input, 0, offsafe, Nvalue[i]);
        if(offsafe != back_N_safe[i]){
            log_err("ERROR: U8_BACK_N offset=%d expected:%d, Got:%ld\n", i, back_N_safe[i], offsafe);
        }
    }
}
Esempio n. 4
0
static void TestFwdBackUnsafe() {
    /*
     * Use a (mostly) well-formed UTF-8 string and test at code point boundaries.
     * The behavior of _UNSAFE macros for ill-formed strings is undefined.
     */
    static const uint8_t input[]={
        0x61,
        0xf0, 0x90, 0x90, 0x81,
        0xc0, 0x80,  /* non-shortest form */
        0xe2, 0x82, 0xac,
        0xc2, 0xa1,
        0xf4, 0x8f, 0xbf, 0xbf,
        0x00
    };
    static const int8_t boundaries[]={ 0, 1, 5, 7, 10, 12, 16, 17 };

    int32_t offset;
    int32_t i;
    for(i=1, offset=0; offset<LENGTHOF(input); ++i) {
        UTF8_FWD_1_UNSAFE(input, offset);
        if(offset != boundaries[i]){
            log_err("ERROR: UTF8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[i], offset);
        }
    }
    for(i=1, offset=0; offset<LENGTHOF(input); ++i) {
        U8_FWD_1_UNSAFE(input, offset);
        if(offset != boundaries[i]){
            log_err("ERROR: U8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[i], offset);
        }
    }

    for(i=LENGTHOF(boundaries)-2, offset=LENGTHOF(input); offset>0; --i) {
        UTF8_BACK_1_UNSAFE(input, offset);
        if(offset != boundaries[i]){
            log_err("ERROR: UTF8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[i], offset);
        }
    }
    for(i=LENGTHOF(boundaries)-2, offset=LENGTHOF(input); offset>0; --i) {
        U8_BACK_1_UNSAFE(input, offset);
        if(offset != boundaries[i]){
            log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", boundaries[i], offset);
        }
    }

    for(i=0; i<LENGTHOF(boundaries); ++i) {
        offset=0;
        UTF8_FWD_N_UNSAFE(input, offset, i);
        if(offset != boundaries[i]) {
            log_err("ERROR: UTF8_FWD_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[i], offset);
        }
    }
    for(i=0; i<LENGTHOF(boundaries); ++i) {
        offset=0;
        U8_FWD_N_UNSAFE(input, offset, i);
        if(offset != boundaries[i]) {
            log_err("ERROR: U8_FWD_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[i], offset);
        }
    }

    for(i=0; i<LENGTHOF(boundaries); ++i) {
        int32_t j=LENGTHOF(boundaries)-1-i;
        offset=LENGTHOF(input);
        UTF8_BACK_N_UNSAFE(input, offset, i);
        if(offset != boundaries[j]) {
            log_err("ERROR: UTF8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[j], offset);
        }
    }
    for(i=0; i<LENGTHOF(boundaries); ++i) {
        int32_t j=LENGTHOF(boundaries)-1-i;
        offset=LENGTHOF(input);
        U8_BACK_N_UNSAFE(input, offset, i);
        if(offset != boundaries[j]) {
            log_err("ERROR: U8_BACK_N_UNSAFE offset expected:%d, Got:%d\n", boundaries[j], offset);
        }
    }
}