Пример #1
0
/*
 * Runs the UTF8_IS_* and U8_IS_* classification macros (SINGLE/LEAD/TRAIL)
 * over a fixed table of code units and reports, through log_err(), every
 * entry that is not classified as exactly the kind its table position expects.
 */
static void TestCodeUnitValues()
{
    /* Table layout: [0..3] single bytes, [4..7] lead bytes, [8..11] trail bytes. */
    static const uint8_t codeunit[]={0x00, 0x65, 0x7e, 0x7f, 0xc0, 0xc4, 0xf0, 0xfd, 0x80, 0x81, 0xbc, 0xbe,};

    int16_t idx;
    for(idx=0; idx<sizeof(codeunit)/sizeof(codeunit[0]); idx++){
        const uint8_t unit=codeunit[idx];
        /* Results of the UTF8_* macros; these are also what the diagnostics print. */
        const int oldSingle=UTF8_IS_SINGLE(unit);
        const int oldLead=UTF8_IS_LEAD(unit);
        const int oldTrail=UTF8_IS_TRAIL(unit);
        /* Results of the U8_* macros; checked, but not printed. */
        const int newSingle=U8_IS_SINGLE(unit);
        const int newLead=U8_IS_LEAD(unit);
        const int newTrail=U8_IS_TRAIL(unit);

        log_verbose("Testing code unit value of %x\n", unit);

        if(idx<4){
            /* must be a single byte, and nothing else, for both macro families */
            if(!(oldSingle && !oldLead && !oldTrail && newSingle && !newLead && !newTrail)){
                log_err("ERROR: 0x%02x is a single byte but results in single: %c lead: %c trail: %c\n",
                    unit, oldSingle ? 'y' : 'n', oldLead ? 'y' : 'n', oldTrail ? 'y' : 'n');
            }
        } else if(idx<8){
            /* must be a lead byte, and nothing else, for both macro families */
            if(!(oldLead && !oldSingle && !oldTrail && newLead && !newSingle && !newTrail)){
                log_err("ERROR: 0x%02x is a lead byte but results in single: %c lead: %c trail: %c\n",
                    unit, oldSingle ? 'y' : 'n', oldLead ? 'y' : 'n', oldTrail ? 'y' : 'n');
            }
        } else if(idx<12){
            /* must be a trail byte, and nothing else, for both macro families */
            if(!(oldTrail && !oldSingle && !oldLead && newTrail && !newSingle && !newLead)){
                log_err("ERROR: 0x%02x is a trail byte but results in single: %c lead: %c trail: %c\n",
                    unit, oldSingle ? 'y' : 'n', oldLead ? 'y' : 'n', oldTrail ? 'y' : 'n');
            }
        }
    }
}
Пример #2
0
/** Convert character vector to ASCII
 *
 * All charcodes > 127 are replaced with subst chars (0x1A).
 * In a string marked as UTF-8, every ill-formed byte sequence is
 * replaced with a subst char as well. In a string in any other (8-bit)
 * encoding, every byte that is not a single-byte code unit is replaced.
 * NA strings and strings marked as ASCII are passed through unchanged.
 *
 * @param str character vector
 * @return character vector
 *
 * @version 0.1 (Marek Gagolewski)
 * @version 0.2 (Marek Gagolewski, 2013-06-16) make StriException-friendly
 */
SEXP stri_enc_toascii(SEXP str)
{
   // The prepared argument may be a newly allocated object; keep it PROTECTed,
   // as Rf_allocVector and Rf_mkCharLenCE below allocate memory
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   R_len_t n = LENGTH(str);

   STRI__ERROR_HANDLER_BEGIN
   SEXP ret;
   PROTECT(ret = Rf_allocVector(STRSXP, n));
   for (R_len_t i=0; i<n; ++i) {
      SEXP curs = STRING_ELT(str, i);
      if (curs == NA_STRING) {
         SET_STRING_ELT(ret, i, NA_STRING);
      }
      else if (IS_ASCII(curs)) {
         // nothing to convert
         SET_STRING_ELT(ret, i, curs);
      }
      else if (IS_UTF8(curs)) {
         R_len_t curn = LENGTH(curs);
         const char* curs_tab = CHAR(curs);
         // TODO: buffer reuse....
         String8 buf(curn+1); // this may be 4 times too much
         R_len_t k = 0;
         UChar32 c;
         for (R_len_t j=0; j<curn; ) {
            U8_NEXT(curs_tab, j, curn, c); // advances j; c < 0 on an ill-formed sequence
            // a negative c must not reach the (char) cast below:
            // it would emit a non-ASCII byte
            if (c < 0 || c > ASCII_MAXCHARCODE)
               buf.data()[k++] = ASCII_SUBSTITUTE;
            else
               buf.data()[k++] = (char)c;
         }
         SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), k, CE_UTF8)); // will be marked as ASCII anyway by mkCharLenCE
      }
      else { // some 8-bit encoding
         R_len_t curn = LENGTH(curs);
         const char* curs_tab = CHAR(curs);
         // TODO: buffer reuse....
         String8 buf(curn+1);
         R_len_t k = 0;
         for (R_len_t j=0; j<curn; ++j) {
            if (U8_IS_SINGLE(curs_tab[j]))
               buf.data()[k++] = curs_tab[j];
            else {
               buf.data()[k++] = (char)ASCII_SUBSTITUTE; // subst char in ascii
            }
         }
         SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), k, CE_UTF8)); // will be marked as ASCII anyway by mkCharLenCE
      }
   }
   UNPROTECT(2); // ret and str
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
Пример #3
0
/** Convert character vector to UTF-8
 *
 * @param str character vector
 * @param is_unknown_8bit single logical value;
 * if TRUE, then in case of ENC_NATIVE or ENC_LATIN1, UTF-8
 * REPLACEMENT CHARACTERs (U+FFFD) are
 * put for codes > 127;
 * if FALSE, the conversion is delegated to StriContainerUTF8
 * @return character vector
 *
 * @version 0.1 (Marek Gagolewski)
 * @version 0.2 (Marek Gagolewski, 2013-06-16) make StriException-friendly
 */
SEXP stri_enc_toutf8(SEXP str, SEXP is_unknown_8bit)
{
   // The prepared argument may be a newly allocated object; keep it PROTECTed,
   // as the calls below (argument preparation, Rf_allocVector,
   // Rf_mkCharLenCE, container conversion) allocate memory
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   R_len_t n = LENGTH(str);
   bool is_unknown_8bit_logical = stri__prepare_arg_logical_1_notNA(is_unknown_8bit, "is_unknown_8bit");

   STRI__ERROR_HANDLER_BEGIN
   if (is_unknown_8bit_logical) {
      SEXP ret;
      PROTECT(ret = Rf_allocVector(STRSXP, n));
      for (R_len_t i=0; i<n; ++i) {
         SEXP curs = STRING_ELT(str, i);
         if (curs == NA_STRING) {
            SET_STRING_ELT(ret, i, NA_STRING);
         }
         else if (IS_ASCII(curs) || IS_UTF8(curs)) {
            // nothing to convert
            SET_STRING_ELT(ret, i, curs);
         }
         else { // some 8-bit encoding
            R_len_t curn = LENGTH(curs);
            const char* curs_tab = CHAR(curs);
            // TODO: buffer reuse....
            String8 buf(curn*3+1); // one byte -> either one byte or FFFD, which is 3 bytes in UTF-8
            R_len_t k = 0;
            for (R_len_t j=0; j<curn; ++j) {
               if (U8_IS_SINGLE(curs_tab[j]))
                  buf.data()[k++] = curs_tab[j];
               else { // 0xEF 0xBF 0xBD
                  buf.data()[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE1;
                  buf.data()[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE2;
                  buf.data()[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE3;
               }
            }
            SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), k, CE_UTF8));
         }
      }
      UNPROTECT(2); // ret and str
      return ret;
   }
   else {
      // Trivial - everything we need is in StriContainerUTF8 :)
      StriContainerUTF8 str_cont(str, n);
      SEXP ret = str_cont.toR();
      UNPROTECT(1); // str
      return ret;
   }
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/** Convert character vector to ASCII
 *
 * Each charcode > 127 is replaced with a subst char (0x1A).
 * An ill-formed sequence in a string marked as UTF-8 is replaced
 * with a subst char too, and a warning is issued.
 * NA strings and strings marked as ASCII are copied as-is.
 *
 * @param str character vector
 * @return character vector
 *
 * @version 0.1-?? (Marek Gagolewski)
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-30)
 *          use single common buf;
 *          warn on invalid utf8 byte stream
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_enc_toascii(SEXP str)
{
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   R_len_t n = LENGTH(str);

   STRI__ERROR_HANDLER_BEGIN(1)

   // a single scratch buffer serves all the strings:
   // size it after the longest non-NA input
   R_len_t maxlen = 0;
   for (R_len_t i=0; i<n; ++i) {
      SEXP elem = STRING_ELT(str, i);
      if (elem != NA_STRING) {
         R_len_t len = LENGTH(elem);
         if (len > maxlen) maxlen = len;
      }
   }
   String8buf buf(maxlen); // the output never has more bytes than the input
   char* out = buf.data();

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, n));
   for (R_len_t i=0; i<n; ++i) {
      SEXP elem = STRING_ELT(str, i);
      if (elem == NA_STRING || IS_ASCII(elem)) {
         // already fine, copy as-is
         SET_STRING_ELT(ret, i, elem);
         continue;
      }

      R_len_t nbytes = LENGTH(elem);
      const char* bytes = CHAR(elem);
      R_len_t outlen = 0; // number of bytes written to out

      if (IS_UTF8(elem)) {
         // decode code point by code point; U8_NEXT advances pos
         for (R_len_t pos = 0; pos < nbytes; ) {
            UChar32 cp;
            U8_NEXT(bytes, pos, nbytes, cp);
            if (cp < 0) {
               // ill-formed sequence
               Rf_warning(MSG__INVALID_CODE_POINT_FIXING);
               out[outlen++] = ASCII_SUBSTITUTE;
            }
            else if (cp > ASCII_MAXCHARCODE) {
               out[outlen++] = ASCII_SUBSTITUTE;
            }
            else {
               out[outlen++] = (char)cp;
            }
         }
      }
      else {
         // some 8-bit encoding: process byte by byte
         for (R_len_t pos = 0; pos < nbytes; ++pos) {
            if (U8_IS_SINGLE(bytes[pos]))
               out[outlen++] = bytes[pos];
            else
               out[outlen++] = (char)ASCII_SUBSTITUTE; // subst char in ascii
         }
      }

      // the string will be marked as ASCII anyway by mkCharLenCE
      SET_STRING_ELT(ret, i, Rf_mkCharLenCE(out, outlen, CE_UTF8));
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
/** Convert character vector to UTF-8
 *
 * The conversion is followed by an optional validation pass over
 * the resulting byte streams, see the validate argument.
 *
 * @param str character vector
 * @param is_unknown_8bit single logical value;
 * if TRUE, then in case of ENC_NATIVE or ENC_LATIN1, UTF-8
 * REPLACEMENT CHARACTERs (U+FFFD) are
 * put for codes > 127
 * @param validate single logical value (or NA);
 * if FALSE, the result is not validated;
 * if TRUE, each ill-formed byte sequence is replaced with U+FFFD
 * and a warning is issued;
 * if NA, each string with an ill-formed byte sequence is replaced
 * with NA and a warning is issued
 *
 * @return character vector
 *
 * @version 0.1-XX (Marek Gagolewski)
 *
 * @version 0.1-XX (Marek Gagolewski, 2013-06-16)
 *                  make StriException-friendly
 *
 * @version 0.2-1  (Marek Gagolewski, 2014-03-26)
 *                 Use one String8buf;
 *                 is_unknown_8bit_logical and UTF-8 tries now to remove BOMs
 *
 * @version 0.2-1  (Marek Gagolewski, 2014-03-30)
 *                 added validate arg
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_enc_toutf8(SEXP str, SEXP is_unknown_8bit, SEXP validate)
{
   PROTECT(validate = stri_prepare_arg_logical_1(validate, "validate"));
   bool is_unknown_8bit_logical =
      stri__prepare_arg_logical_1_notNA(is_unknown_8bit, "is_unknown_8bit");
   PROTECT(str = stri_prepare_arg_string(str, "str"));
   R_len_t n = LENGTH(str);

   // NOTE(review): the argument presumably is the number of objects
   // PROTECTed so far (validate and str) - confirm with the macro definition
   STRI__ERROR_HANDLER_BEGIN(2)
   SEXP ret;
   if (!is_unknown_8bit_logical) {
      // Trivial - everything we need is in StriContainerUTF8 :)
      // which removes BOMs silently
      StriContainerUTF8 str_cont(str, n);
      STRI__PROTECT(ret = str_cont.toR());
   }
   else {
      // get buf size: only the strings in an 8-bit encoding are rewritten
      // via the buffer, so NA, ASCII, and UTF-8 strings are not counted
      R_len_t bufsize = 0;
      for (R_len_t i=0; i<n; ++i) {
         SEXP curs = STRING_ELT(str, i);
         if (curs == NA_STRING || IS_ASCII(curs) || IS_UTF8(curs))
            continue;

         R_len_t ni = LENGTH(curs);
         if (ni > bufsize) bufsize = ni;
      }
      String8buf buf(bufsize*3); // either 1 byte < 127 or U+FFFD == 3 bytes UTF-8
      char* bufdata = buf.data();

      STRI__PROTECT(ret = Rf_allocVector(STRSXP, n));
      for (R_len_t i=0; i<n; ++i) {
         SEXP curs = STRING_ELT(str, i);
         if (curs == NA_STRING) {
            SET_STRING_ELT(ret, i, NA_STRING);
            continue;
         }

         if (IS_ASCII(curs) || IS_UTF8(curs)) {
            // already ASCII/UTF-8: copy as-is, unless the string
            // starts with the 3-byte UTF-8 BOM
            R_len_t curs_n = LENGTH(curs);
            const char* curs_s = CHAR(curs);
            if (curs_n >= 3 &&
               (uint8_t)(curs_s[0]) == UTF8_BOM_BYTE1 &&
               (uint8_t)(curs_s[1]) == UTF8_BOM_BYTE2 &&
               (uint8_t)(curs_s[2]) == UTF8_BOM_BYTE3) {
               // has BOM - get rid of it
               SET_STRING_ELT(ret, i, Rf_mkCharLenCE(curs_s+3, curs_n-3, CE_UTF8));
            }
            else
               SET_STRING_ELT(ret, i, curs);

            continue;
         }

         // otherwise, we have an 8-bit encoding:
         // single-byte code units are copied, every other byte becomes U+FFFD
         R_len_t curn = LENGTH(curs);
         const char* curs_tab = CHAR(curs);
         R_len_t k = 0; // number of bytes written to bufdata
         for (R_len_t j=0; j<curn; ++j) {
            if (U8_IS_SINGLE(curs_tab[j]))
               bufdata[k++] = curs_tab[j];
            else { // 0xEF 0xBF 0xBD
               bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE1;
               bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE2;
               bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE3;
            }
         }
         SET_STRING_ELT(ret, i, Rf_mkCharLenCE(bufdata, k, CE_UTF8));
      }

   }

   // validate utf8 byte stream
   if (LOGICAL(validate)[0] != FALSE) { // NA or TRUE
      R_len_t ret_n = LENGTH(ret);
      for (R_len_t i=0; i<ret_n; ++i) {
         SEXP curs = STRING_ELT(ret, i);
         if (curs == NA_STRING) continue;

         // first pass: scan until the end of the string
         // or the first ill-formed sequence (c < 0)
         const char* s = CHAR(curs);
         R_len_t sn = LENGTH(curs);
         R_len_t j = 0;
         UChar32 c = 0;
         while (c >= 0 && j < sn) {
            U8_NEXT(s, j, sn, c);
         }

         if (c >= 0) continue; // valid, nothing to do

         if (LOGICAL(validate)[0] == NA_LOGICAL) {
            // validate is NA: the whole string is replaced with NA
            Rf_warning(MSG__INVALID_CODE_POINT_REPLNA);
            SET_STRING_ELT(ret, i, NA_STRING);
         }
         else {
            // validate is TRUE: second pass, re-encode the string,
            // putting U+FFFD in place of each ill-formed sequence
            int bufsize = sn*3; // maximum: 1 byte -> U+FFFD (3 bytes)
            String8buf buf(bufsize); // maximum: 1 byte -> U+FFFD (3 bytes)
            char* bufdata = buf.data();

            j = 0;
            R_len_t k = 0; // number of bytes written to bufdata
            UBool err = FALSE; // set by U8_APPEND on failure
            while (!err && j < sn) {
               U8_NEXT(s, j, sn, c);
               if (c >= 0) {
                  U8_APPEND((uint8_t*)bufdata, k, bufsize, c, err);
               } else {
                  // one warning per ill-formed sequence
                  Rf_warning(MSG__INVALID_CODE_POINT_FIXING);
                  bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE1;
                  bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE2;
                  bufdata[k++] = (char)UCHAR_REPLACEMENT_UTF8_BYTE3;
               }
            }

            if (err) throw StriException(MSG__INTERNAL_ERROR);
            SET_STRING_ELT(ret, i, Rf_mkCharLenCE(bufdata, k, CE_UTF8));
         }
      }
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}