Ejemplo n.º 1
0
/** Check R encoding marking *for testing only*
 *  This function should not be exported
 *
 *  @param s character vector
 *
 *  Results are printed on STDERR
 *
 * @version 0.1 (Marek Gagolewski)
 */
SEXP stri_test_Rmark(SEXP s)
{
#ifndef NDEBUG
   s = stri_prepare_arg_string(s, "str");
   int ns = LENGTH(s);
   for (int i=0; i < ns; ++i) {
      fprintf(stdout, "!NDEBUG: Element #%d:\n", i);
      SEXP curs = STRING_ELT(s, i);
      if (curs == NA_STRING){
         fprintf(stdout, "!NDEBUG: \tNA\n");
         continue;
      }
      //const char* string = CHAR(curs);
      fprintf(stdout, "!NDEBUG: \tMARK_ASCII = %d\n", (IS_ASCII(curs) > 0));
      fprintf(stdout, "!NDEBUG: \tMARK_UTF8  = %d\n", (IS_UTF8(curs) > 0));
      fprintf(stdout, "!NDEBUG: \tMARK_LATIN1= %d\n", (IS_LATIN1(curs) > 0));
      fprintf(stdout, "!NDEBUG: \tMARK_BYTES = %d\n", (IS_BYTES(curs) > 0));
      fprintf(stdout, "!NDEBUG: \n");
   }
   return R_NilValue;
#else
   Rf_error("This function is enabled only if NDEBUG is undef.");
   return s;  // s here avoids compiler warning
#endif
}
Ejemplo n.º 2
0
/* This may return a R_alloc-ed result, so the caller has to manage the
   R_alloc stack */
const char *translateChar0(SEXP x)
{
    if(TYPEOF(x) != CHARSXP)
	error(_("'%s' must be called on a CHARSXP"), "translateChar0");
    if(IS_BYTES(x)) return CHAR(x);
    return translateChar(x);
}
Ejemplo n.º 3
0
wchar_t *filenameToWchar_wcc(const SEXP fn, const Rboolean expand){
	static wchar_t filename[BSIZE + 1];
	void *obj;
	const char *from = "", *inbuf;
	char *outbuf;
	size_t inb, outb, res;

	if(!strlen(CHAR(fn))){
		wcscpy(filename, L"");
		return filename;
	}

	if(IS_LATIN1(fn)) from = "latin1";
	if(IS_UTF8(fn)) from = "UTF-8";
	if(IS_BYTES(fn)) REprintf("encoding of a filename cannot be 'bytes'");

	obj = Riconv_open("UCS-2LE", from);
	if(obj == (void *)(-1))
		REprintf("unsupported conversion from '%s' in shellexec_wcc.c",
			  from);

	if(expand) inbuf = R_ExpandFileName(CHAR(fn)); else inbuf = CHAR(fn);

	inb = strlen(inbuf)+1; outb = 2*BSIZE;
	outbuf = (char *) filename;
	res = Riconv(obj, &inbuf , &inb, &outbuf, &outb);
	Riconv_close(obj);
	if(inb > 0) REprintf("file name conversion problem -- name too long?");
	if(res == -1) REprintf("file name conversion problem");

	return filename;
} /* End of filenameToWchar_wcc(). */
Ejemplo n.º 4
0
cetype_t getCharCE(SEXP x)
{
    if(TYPEOF(x) != CHARSXP)
	error(_("'%s' must be called on a CHARSXP"), "getCharCE");
    if(IS_UTF8(x)) return CE_UTF8;
    else if(IS_LATIN1(x)) return CE_LATIN1;
    else if(IS_BYTES(x)) return CE_BYTES;
    else return CE_NATIVE;
}
Ejemplo n.º 5
0
/* This may return a R_alloc-ed result, so the caller has to manage the
   R_alloc stack */
const char *translateCharUTF8(SEXP x)
{
    void *obj;
    const char *inbuf, *ans = CHAR(x);
    char *outbuf, *p;
    size_t inb, outb, res;
    R_StringBuffer cbuff = {NULL, 0, MAXELTSIZE};

    if(TYPEOF(x) != CHARSXP)
	error(_("'%s' must be called on a CHARSXP"), "translateCharUTF8");
    if(x == NA_STRING) return ans;
    if(IS_UTF8(x)) return ans;
    if(IS_ASCII(x)) return ans;
    if(IS_BYTES(x))
	error(_("translating strings with \"bytes\" encoding is not allowed"));

    obj = Riconv_open("UTF-8", IS_LATIN1(x) ? "latin1" : "");
    if(obj == (void *)(-1)) 
#ifdef Win32
	error(_("unsupported conversion from '%s' in codepage %d"),
	      "latin1", localeCP);
#else
       error(_("unsupported conversion from '%s' to '%s'"), "latin1", "UTF-8");
#endif
    R_AllocStringBuffer(0, &cbuff);
top_of_loop:
    inbuf = ans; inb = strlen(inbuf);
    outbuf = cbuff.data; outb = cbuff.bufsize - 1;
    /* First initialize output */
    Riconv (obj, NULL, NULL, &outbuf, &outb);
next_char:
    /* Then convert input  */
    res = Riconv(obj, &inbuf , &inb, &outbuf, &outb);
    if(res == -1 && errno == E2BIG) {
	R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
	goto top_of_loop;
    } else if(res == -1 && (errno == EILSEQ || errno == EINVAL)) {
	if(outb < 5) {
	    R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
	    goto top_of_loop;
	}
	snprintf(outbuf, 5, "<%02x>", (unsigned char)*inbuf);
	outbuf += 4; outb -= 4;
	inbuf++; inb--;
	goto next_char;
    }
    *outbuf = '\0';
    Riconv_close(obj);
    res = strlen(cbuff.data) + 1;
    p = R_alloc(res, 1);
    memcpy(p, cbuff.data, res);
    R_FreeStringBuffer(&cbuff);
    return p;
}
/** Get Declared Encodings of Each String
 *
 * @param str a character vector or an object coercible to
 * @return a character vector
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-25)
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_enc_mark(SEXP str) {
   PROTECT(str = stri_prepare_arg_string(str, "str"));    // prepare string argument

   STRI__ERROR_HANDLER_BEGIN(1)
   R_len_t str_len = LENGTH(str);

   // some of them will not be used in this call, but we're lazy
   SEXP mark_ascii, mark_latin1, mark_utf8, mark_native, mark_bytes;
   STRI__PROTECT(mark_ascii  = Rf_mkChar("ASCII"));
   STRI__PROTECT(mark_latin1 = Rf_mkChar("latin1"));
   STRI__PROTECT(mark_utf8   = Rf_mkChar("UTF-8"));
   STRI__PROTECT(mark_native = Rf_mkChar("native"));
   STRI__PROTECT(mark_bytes  = Rf_mkChar("bytes"));

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, str_len));

   for (R_len_t i=0; i<str_len; ++i) {
      SEXP curs = STRING_ELT(str, i);
      if (curs == NA_STRING) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      if (IS_ASCII(curs))
         SET_STRING_ELT(ret, i, mark_ascii);
      else if (IS_UTF8(curs))
         SET_STRING_ELT(ret, i, mark_utf8);
      else if (IS_BYTES(curs))
         SET_STRING_ELT(ret, i, mark_bytes);
      else if (IS_LATIN1(curs))
         SET_STRING_ELT(ret, i, mark_latin1);
      else
         SET_STRING_ELT(ret, i, mark_native);
   }

   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END(;/* nothing special to be done on error */)
}
Ejemplo n.º 7
0
/**
 * Count the number of characters in a string
 *
 * Note that ICU permits only strings of length < 2^31.
 * @param s R character vector
 * @return integer vector
 *
 * @version 0.1-?? (Marcin Bujarski)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          Multiple input encoding support
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-27)
 *          using StriUcnv;
 *          warn on invalid utf-8 sequences
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 */
SEXP stri_length(SEXP str)
{
    PROTECT(str = stri_prepare_arg_string(str, "str"));

    STRI__ERROR_HANDLER_BEGIN(1)

    R_len_t str_n = LENGTH(str);
    SEXP ret;
    STRI__PROTECT(ret = Rf_allocVector(INTSXP, str_n));
    int* retint = INTEGER(ret);

    StriUcnv ucnvNative(NULL);

    for (R_len_t k = 0; k < str_n; k++) {
        SEXP curs = STRING_ELT(str, k);
        if (curs == NA_STRING) {
            retint[k] = NA_INTEGER;
            continue;
        }

        R_len_t curs_n = LENGTH(curs);  // O(1) - stored by R
        if (IS_ASCII(curs) || IS_LATIN1(curs)) {
            retint[k] = curs_n;
        }
        else if (IS_BYTES(curs)) {
            throw StriException(MSG__BYTESENC);
        }
        else if (IS_UTF8(curs) || ucnvNative.isUTF8()) { // utf8 or native-utf8
            UChar32 c = 0;
            const char* curs_s = CHAR(curs);
            R_len_t j = 0;
            R_len_t i = 0;
            while (c >= 0 && j < curs_n) {
                U8_NEXT(curs_s, j, curs_n, c); // faster that U8_FWD_1 & gives bad UChar32s
                i++;
            }

            if (c < 0) { // invalid utf-8 sequence
                Rf_warning(MSG__INVALID_UTF8);
                retint[k] = NA_INTEGER;
            }
            else
                retint[k] = i;
        }
        else if (ucnvNative.is8bit()) { // native-8bit
            retint[k] = curs_n;
        }
        else { // native encoding, not 8 bit

            UConverter* uconv = ucnvNative.getConverter();

            // native encoding which is neither 8-bit, nor UTF-8 (e.g. 'Big5')
            // this is weird, but we'll face it
            UErrorCode status = U_ZERO_ERROR;
            const char* source = CHAR(curs);
            const char* sourceLimit = source + curs_n;
            R_len_t j;
            for (j = 0; source != sourceLimit; j++) {
                /*ignore_retval=*/ucnv_getNextUChar(uconv, &source, sourceLimit, &status);
                STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
            }
            retint[k] = j; // all right, we got it!
        }
    }

    STRI__UNPROTECT_ALL
    return ret;

    STRI__ERROR_HANDLER_END({ /* no special action on error */ })
}
Ejemplo n.º 8
0
int R_nchar(SEXP string, nchar_type type_,
	    Rboolean allowNA, Rboolean keepNA, const char* msg_name)
{
    if (string == NA_STRING)
	return keepNA ? NA_INTEGER : 2;
    // else :
    switch(type_) {
    case Bytes:
	return LENGTH(string);
	break;
    case Chars:
	if (IS_UTF8(string)) {
	    const char *p = CHAR(string);
	    if (!utf8Valid(p)) {
		if (!allowNA)
		    error(_("invalid multibyte string, %s"), msg_name);
		return NA_INTEGER;
	    } else {
		int nc = 0;
		for( ; *p; p += utf8clen(*p)) nc++;
		return nc;
	    }
	} else if (IS_BYTES(string)) {
	    if (!allowNA) /* could do chars 0 */
		error(_("number of characters is not computable in \"bytes\" encoding, %s"),
		      msg_name);
	    return NA_INTEGER;
	} else if (mbcslocale) {
	    int nc = (int) mbstowcs(NULL, translateChar(string), 0);
	    if (!allowNA && nc < 0)
		error(_("invalid multibyte string, %s"), msg_name);
	    return (nc >= 0 ? nc : NA_INTEGER);
	} else
	    return ((int) strlen(translateChar(string)));
	break;
    case Width:
	if (IS_UTF8(string)) {
	    const char *p = CHAR(string);
	    if (!utf8Valid(p)) {
		if (!allowNA)
		    error(_("invalid multibyte string, %s"), msg_name);
		return NA_INTEGER;
	    } else {
		wchar_t wc1;
		int nc = 0;
		for( ; *p; p += utf8clen(*p)) {
		    utf8toucs(&wc1, p);
		    nc += Ri18n_wcwidth(wc1);
		}
		return nc;
	    }
	} else if (IS_BYTES(string)) {
	    if (!allowNA) /* could do width 0 */
		error(_("width is not computable for %s in \"bytes\" encoding"),
		      msg_name);
	    return NA_INTEGER;
	} else if (mbcslocale) {
	    const char *xi = translateChar(string);
	    int nc = (int) mbstowcs(NULL, xi, 0);
	    if (nc >= 0) {
		const void *vmax = vmaxget();
		wchar_t *wc = (wchar_t *)
		    R_AllocStringBuffer((nc+1)*sizeof(wchar_t), &cbuff);
		mbstowcs(wc, xi, nc + 1);
		int nci18n = Ri18n_wcswidth(wc, 2147483647);
		vmaxset(vmax);
		return (nci18n < 1) ? nc : nci18n;
	    } else if (allowNA)
		error(_("invalid multibyte string, %s"), msg_name);
	    else
		return NA_INTEGER;
	} else
	    return (int) strlen(translateChar(string));

    } // switch
    return NA_INTEGER; // -Wall
} // R_nchar()
Ejemplo n.º 9
0
SEXP attribute_hidden do_nchar(SEXP call, SEXP op, SEXP args, SEXP env)
{
    SEXP d, s, x, stype;
    int i, len, allowNA;
    size_t ntype;
    int nc;
    const char *type;
    const char *xi;
    wchar_t *wc;
    const void *vmax;

    checkArity(op, args);
    if (isFactor(CAR(args)))
	error(_("'%s' requires a character vector"), "nchar()");
    PROTECT(x = coerceVector(CAR(args), STRSXP));
    if (!isString(x))
	error(_("'%s' requires a character vector"), "nchar()");
    len = LENGTH(x);
    stype = CADR(args);
    if (!isString(stype) || LENGTH(stype) != 1)
	error(_("invalid '%s' argument"), "type");
    type = CHAR(STRING_ELT(stype, 0)); /* always ASCII */
    ntype = strlen(type);
    if (ntype == 0) error(_("invalid '%s' argument"), "type");
    allowNA = asLogical(CADDR(args));
    if (allowNA == NA_LOGICAL) allowNA = 0;

    PROTECT(s = allocVector(INTSXP, len));
    vmax = vmaxget();
    for (i = 0; i < len; i++) {
	SEXP sxi = STRING_ELT(x, i);
	if (sxi == NA_STRING) {
	    INTEGER(s)[i] = 2;
	    continue;
	}
	if (strncmp(type, "bytes", ntype) == 0) {
	    INTEGER(s)[i] = LENGTH(sxi);
	} else if (strncmp(type, "chars", ntype) == 0) {
	    if (IS_UTF8(sxi)) { /* assume this is valid */
		const char *p = CHAR(sxi);
		nc = 0;
		for( ; *p; p += utf8clen(*p)) nc++;
		INTEGER(s)[i] = nc;
	    } else if (IS_BYTES(sxi)) {
		if (!allowNA) /* could do chars 0 */
		    error(_("number of characters is not computable for element %d in \"bytes\" encoding"), i+1);
		INTEGER(s)[i] = NA_INTEGER;
	    } else if (mbcslocale) {
		nc = mbstowcs(NULL, translateChar(sxi), 0);
		if (!allowNA && nc < 0)
		    error(_("invalid multibyte string %d"), i+1);
		INTEGER(s)[i] = nc >= 0 ? nc : NA_INTEGER;
	    } else
		INTEGER(s)[i] = strlen(translateChar(sxi));
	} else if (strncmp(type, "width", ntype) == 0) {
	    if (IS_UTF8(sxi)) { /* assume this is valid */
		const char *p = CHAR(sxi);
		wchar_t wc1;
		nc = 0;
		for( ; *p; p += utf8clen(*p)) {
		    utf8toucs(&wc1, p);
		    nc += Ri18n_wcwidth(wc1);
		}
		INTEGER(s)[i] = nc;
	    } else if (IS_BYTES(sxi)) {
		if (!allowNA) /* could do width 0 */
		    error(_("width is not computable for element %d in \"bytes\" encoding"), i+1);
		INTEGER(s)[i] = NA_INTEGER;
	    } else if (mbcslocale) {
		xi = translateChar(sxi);
		nc = mbstowcs(NULL, xi, 0);
		if (nc >= 0) {
		    wc = (wchar_t *) R_AllocStringBuffer((nc+1)*sizeof(wchar_t), &cbuff);

		    mbstowcs(wc, xi, nc + 1);
		    INTEGER(s)[i] = Ri18n_wcswidth(wc, 2147483647);
		    if (INTEGER(s)[i] < 1) INTEGER(s)[i] = nc;
		} else if (allowNA)
		    error(_("invalid multibyte string %d"), i+1);
		else
		    INTEGER(s)[i] = NA_INTEGER;
	    } else
		INTEGER(s)[i] = strlen(translateChar(sxi));
	} else
	    error(_("invalid '%s' argument"), "type");
	vmaxset(vmax);
    }
    R_FreeStringBufferL(&cbuff);
    if ((d = getAttrib(x, R_NamesSymbol)) != R_NilValue)
	setAttrib(s, R_NamesSymbol, d);
    if ((d = getAttrib(x, R_DimSymbol)) != R_NilValue)
	setAttrib(s, R_DimSymbol, d);
    if ((d = getAttrib(x, R_DimNamesSymbol)) != R_NilValue)
	setAttrib(s, R_DimNamesSymbol, d);
    UNPROTECT(2);
    return s;
}
Ejemplo n.º 10
0
/**
 * Construct String Container from R character vector
 * @param rstr R character vector
 * @param nrecycle extend length [vectorization]
 * @param shallowrecycle will \code{this->str} be ever modified?
 */
StriContainerUTF16::StriContainerUTF16(SEXP rstr, R_len_t _nrecycle, bool _shallowrecycle)
{
   this->str = NULL;
#ifndef NDEBUG
   if (!isString(rstr))
      throw StriException("DEBUG: !isString in StriContainerUTF16::StriContainerUTF16(SEXP rstr)");
#endif
   R_len_t nrstr = LENGTH(rstr);
   this->init_Base(nrstr, _nrecycle, _shallowrecycle); // calling LENGTH(rstr) fails on constructor call


   if (this->n > 0) {
      this->str = new UnicodeString*[this->n];
      for (R_len_t i=0; i<this->n; ++i)
         this->str[i] = NULL; // in case it fails during conversion (this is NA)

      UConverter* ucnvASCII = NULL;
//      UConverter* ucnvUTF8 = NULL;
      UConverter* ucnvLatin1 = NULL;
      UConverter* ucnvNative = NULL;

      for (R_len_t i=0; i<nrstr; ++i) {
         SEXP curs = STRING_ELT(rstr, i);
         if (curs == NA_STRING) {
            continue; // keep NA
         }
         else {
            if (IS_ASCII(curs)) {
               if (!ucnvASCII) ucnvASCII = stri__ucnv_open("ASCII");
               UErrorCode status = U_ZERO_ERROR;
               this->str[i] = new UnicodeString(CHAR(curs), LENGTH(curs),
                  ucnvASCII, status);
               if (U_FAILURE(status))
                  throw StriException(status);

               // Performance improvement attempt #1:
               // this->str[i] = new UnicodeString(UnicodeString::fromUTF8(CHAR(curs))); // slower than the above

               // Performance improvement attempt #2:
               // Create UChar buf with LENGTH(curs) items, fill it with (CHAR(curs)[i], 0x00), i=1,...
               // This wasn't faster tham the ucnvASCII approach.
            }
            else if (IS_UTF8(curs)) {
               // the above ASCII-approach (but with ucnvUTF8) is slower for UTF-8
               this->str[i] = new UnicodeString(UnicodeString::fromUTF8(CHAR(curs)));
            }
            else if (IS_LATIN1(curs)) {
               if (!ucnvLatin1) ucnvLatin1 = stri__ucnv_open("ISO-8859-1");
               UErrorCode status = U_ZERO_ERROR;
               this->str[i] = new UnicodeString(CHAR(curs), LENGTH(curs),
                  ucnvLatin1, status);
               if (U_FAILURE(status))
                  throw StriException(status);
            }
            else if (IS_BYTES(curs))
               throw StriException(MSG__BYTESENC);
            else {
//             Any encoding - detection needed
//             Assume it's Native; this assumes the user working in an 8-bit environment
//             would convert strings to UTF-8 manually if needed - I think is's
//             a more reasonable approach (Native --> input via keyboard)
               if (!ucnvNative) ucnvNative = stri__ucnv_open((char*)NULL);
               UErrorCode status = U_ZERO_ERROR;
               this->str[i] = new UnicodeString(CHAR(curs), LENGTH(curs),
                  ucnvNative, status);
               if (U_FAILURE(status))
                  throw StriException(status);
            }
         }
      }

      if (ucnvASCII) ucnv_close(ucnvASCII);
//      if (ucnvUTF8)  ucnv_close(ucnvUTF8);
      if (ucnvLatin1) ucnv_close(ucnvLatin1);
      if (ucnvNative) ucnv_close(ucnvNative);

      if (!_shallowrecycle) {
         for (R_len_t i=nrstr; i<this->n; ++i) {
            if (this->str[i%nrstr] == NULL)
               this->str[i] = NULL;
            else
               this->str[i] = new UnicodeString(*this->str[i%nrstr]);
         }
      }
   }
}
Ejemplo n.º 11
0
const char *EncodeString(SEXP s, int w, int quote, Rprt_adj justify)
{
    int b, b0, i, j, cnt;
    const char *p; char *q, buf[11];
    cetype_t ienc = CE_NATIVE;

    /* We have to do something like this as the result is returned, and
       passed on by EncodeElement -- so no way could be end user be
       responsible for freeing it.  However, this is not thread-safe. */

    static R_StringBuffer gBuffer = {NULL, 0, BUFSIZE};
    R_StringBuffer *buffer = &gBuffer;

    if (s == NA_STRING) {
	p = quote ? CHAR(R_print.na_string) : CHAR(R_print.na_string_noquote);
	cnt = i = (int)(quote ? strlen(CHAR(R_print.na_string)) :
			strlen(CHAR(R_print.na_string_noquote)));
	quote = 0;
    } else {
#ifdef Win32
	if(WinUTF8out) {
	    ienc = getCharCE(s);
	    if(ienc == CE_UTF8) {
		p = CHAR(s);
		i = Rstrlen(s, quote);
		cnt = LENGTH(s);
	    } else {
		p = translateChar0(s);
		if(p == CHAR(s)) {
		    i = Rstrlen(s, quote);
		    cnt = LENGTH(s);
		} else {
		    cnt = strlen(p);
		    i = Rstrwid(p, cnt, CE_NATIVE, quote);
		}
		ienc = CE_NATIVE;
	    }
	} else
#endif
	{
	    if(IS_BYTES(s)) {
		p = CHAR(s);
		cnt = (int) strlen(p);
		const char *q;
		char *pp = R_alloc(4*cnt+1, 1), *qq = pp, buf[5];
		for (q = p; *q; q++) {
		    unsigned char k = (unsigned char) *q;
		    if (k >= 0x20 && k < 0x80) {
			*qq++ = *q;
			if (quote && *q == '"') cnt++;
		    } else {
			snprintf(buf, 5, "\\x%02x", k);
			for(j = 0; j < 4; j++) *qq++ = buf[j];
			cnt += 3;
		    }
		}
		*qq = '\0';
		p = pp;
		i = cnt;
	    } else {
		p = translateChar(s);
		if(p == CHAR(s)) {
		    i = Rstrlen(s, quote);
		    cnt = LENGTH(s);
		} else {
		    cnt = (int) strlen(p);
		    i = Rstrwid(p, cnt, CE_NATIVE, quote);
		}
	    }
	}
    }

    /* We need enough space for the encoded string, including escapes.
       Octal encoding turns one byte into four.
       \u encoding can turn a multibyte into six or ten,
       but it turns 2/3 into 6, and 4 (and perhaps 5/6) into 10.
       Let's be wasteful here (the worst case appears to be an MBCS with
       one byte for an upper-plane Unicode point output as ten bytes,
       but I doubt that such an MBCS exists: two bytes is plausible).

       +2 allows for quotes, +6 for UTF_8 escapes.
     */
    q = R_AllocStringBuffer(imax2(5*cnt+8, w), buffer);
    b = w - i - (quote ? 2 : 0); /* total amount of padding */
    if(justify == Rprt_adj_none) b = 0;
    if(b > 0 && justify != Rprt_adj_left) {
	b0 = (justify == Rprt_adj_centre) ? b/2 : b;
	for(i = 0 ; i < b0 ; i++) *q++ = ' ';
	b -= b0;
    }
    if(quote) *q++ = (char) quote;
    if(mbcslocale || ienc == CE_UTF8) {
	int j, res;
	mbstate_t mb_st;
	wchar_t wc;
	unsigned int k; /* not wint_t as it might be signed */
#ifndef __STDC_ISO_10646__
	Rboolean Unicode_warning = FALSE;
#endif
	if(ienc != CE_UTF8)  mbs_init(&mb_st);
#ifdef Win32
	else if(WinUTF8out) { memcpy(q, UTF8in, 3); q += 3; }
#endif
	for (i = 0; i < cnt; i++) {
	    res = (int)((ienc == CE_UTF8) ? utf8toucs(&wc, p):
			mbrtowc(&wc, p, MB_CUR_MAX, NULL));
	    if(res >= 0) { /* res = 0 is a terminator */
		k = wc;
		/* To be portable, treat \0 explicitly */
		if(res == 0) {k = 0; wc = L'\0';}
		if(0x20 <= k && k < 0x7f && iswprint(wc)) {
		    switch(wc) {
		    case L'\\': *q++ = '\\'; *q++ = '\\'; p++;
			break;
		    case L'\'':
		    case L'"':
			if(quote == *p)  *q++ = '\\'; *q++ = *p++;
			break;
		    default:
			for(j = 0; j < res; j++) *q++ = *p++;
			break;
		    }
		} else if (k < 0x80) {
		    /* ANSI Escapes */
		    switch(wc) {
		    case L'\a': *q++ = '\\'; *q++ = 'a'; break;
		    case L'\b': *q++ = '\\'; *q++ = 'b'; break;
		    case L'\f': *q++ = '\\'; *q++ = 'f'; break;
		    case L'\n': *q++ = '\\'; *q++ = 'n'; break;
		    case L'\r': *q++ = '\\'; *q++ = 'r'; break;
		    case L'\t': *q++ = '\\'; *q++ = 't'; break;
		    case L'\v': *q++ = '\\'; *q++ = 'v'; break;
		    case L'\0': *q++ = '\\'; *q++ = '0'; break;

		    default:
			/* print in octal */
			snprintf(buf, 5, "\\%03o", k);
			for(j = 0; j < 4; j++) *q++ = buf[j];
			break;
		    }
		    p++;
		} else {
		    if(iswprint(wc)) {
			/* The problem here is that wc may be
			   printable according to the Unicode tables,
			   but it may not be printable on the output
			   device concerned. */
			for(j = 0; j < res; j++) *q++ = *p++;
		    } else {
#ifndef Win32
# ifndef __STDC_ISO_10646__
			Unicode_warning = TRUE;
# endif
			if(k > 0xffff)
			    snprintf(buf, 11, "\\U%08x", k);
			else
#endif
			    snprintf(buf, 11, "\\u%04x", k);
			j = (int) strlen(buf);
			memcpy(q, buf, j);
			q += j;
			p += res;
		    }
		    i += (res - 1);
		}

	    } else { /* invalid char */
		snprintf(q, 5, "\\x%02x", *((unsigned char *)p));
		q += 4; p++;
	    }
	}
#ifndef __STDC_ISO_10646__
	if(Unicode_warning)
	    warning(_("it is not known that wchar_t is Unicode on this platform"));
#endif

    } else
	for (i = 0; i < cnt; i++) {

	    /* ASCII */
	    if((unsigned char) *p < 0x80) {
		if(*p != '\t' && isprint((int)*p)) { /* Windows has \t as printable */
		    switch(*p) {
		    case '\\': *q++ = '\\'; *q++ = '\\'; break;
		    case '\'':
		    case '"':
			if(quote == *p)  *q++ = '\\'; *q++ = *p; break;
		    default: *q++ = *p; break;
		    }
		} else switch(*p) {
			/* ANSI Escapes */
		    case '\a': *q++ = '\\'; *q++ = 'a'; break;
		    case '\b': *q++ = '\\'; *q++ = 'b'; break;
		    case '\f': *q++ = '\\'; *q++ = 'f'; break;
		    case '\n': *q++ = '\\'; *q++ = 'n'; break;
		    case '\r': *q++ = '\\'; *q++ = 'r'; break;
		    case '\t': *q++ = '\\'; *q++ = 't'; break;
		    case '\v': *q++ = '\\'; *q++ = 'v'; break;
		    case '\0': *q++ = '\\'; *q++ = '0'; break;

		    default:
			/* print in octal */
			snprintf(buf, 5, "\\%03o", (unsigned char) *p);
			for(j = 0; j < 4; j++) *q++ = buf[j];
			break;
		    }
		p++;
	    } else {  /* 8 bit char */
#ifdef Win32 /* It seems Windows does not know what is printable! */
		*q++ = *p++;
#else
		if(!isprint((int)*p & 0xff)) {
		    /* print in octal */
		    snprintf(buf, 5, "\\%03o", (unsigned char) *p);
		    for(j = 0; j < 4; j++) *q++ = buf[j];
		    p++;
		} else *q++ = *p++;
#endif
	    }
	}

#ifdef Win32
    if(WinUTF8out && ienc == CE_UTF8)  { memcpy(q, UTF8out, 3); q += 3; }
#endif
    if(quote) *q++ = (char) quote;
    if(b > 0 && justify != Rprt_adj_right) {
	for(i = 0 ; i < b ; i++) *q++ = ' ';
    }
    *q = '\0';
    return buffer->data;
}
Ejemplo n.º 12
0
SEXP installTrChar(SEXP x)
{
    void * obj;
    const char *inbuf, *ans = CHAR(x);
    char *outbuf;
    size_t inb, outb, res;
    cetype_t ienc = getCharCE(x);
    R_StringBuffer cbuff = {NULL, 0, MAXELTSIZE};

    if(TYPEOF(x) != CHARSXP)
	error(_("'%s' must be called on a CHARSXP"), "installTrChar");
    if(x == NA_STRING || !(ENC_KNOWN(x))) return install(ans);
    if(IS_BYTES(x))
	error(_("translating strings with \"bytes\" encoding is not allowed"));
    if(utf8locale && IS_UTF8(x)) return install(ans);
    if(latin1locale && IS_LATIN1(x)) return install(ans);
    if(IS_ASCII(x)) return install(ans);

    if(IS_LATIN1(x)) {
	if(!latin1_obj) {
	    obj = Riconv_open("", "latin1");
	    /* should never happen */
	    if(obj == (void *)(-1))
#ifdef Win32
		error(_("unsupported conversion from '%s' in codepage %d"),
		      "latin1", localeCP);
#else
	        error(_("unsupported conversion from '%s' to '%s'"),
		      "latin1", "");
#endif
	    latin1_obj = obj;
	}
	obj = latin1_obj;
    } else {
	if(!utf8_obj) {
	    obj = Riconv_open("", "UTF-8");
	    /* should never happen */
	    if(obj == (void *)(-1)) 
#ifdef Win32
		error(_("unsupported conversion from '%s' in codepage %d"),
		      "latin1", localeCP);
#else
	        error(_("unsupported conversion from '%s' to '%s'"),
		      "latin1", "");
#endif
	    utf8_obj = obj;
	}
	obj = utf8_obj;
    }

    R_AllocStringBuffer(0, &cbuff);
top_of_loop:
    inbuf = ans; inb = strlen(inbuf);
    outbuf = cbuff.data; outb = cbuff.bufsize - 1;
    /* First initialize output */
    Riconv (obj, NULL, NULL, &outbuf, &outb);
next_char:
    /* Then convert input  */
    res = Riconv(obj, &inbuf , &inb, &outbuf, &outb);
    if(res == -1 && errno == E2BIG) {
	R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
	goto top_of_loop;
    } else if(res == -1 && (errno == EILSEQ || errno == EINVAL)) {
	if(outb < 13) {
	    R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
	    goto top_of_loop;
	}
	if (ienc == CE_UTF8) {
	    /* if starting in UTF-8, use \uxxxx */
	    /* This must be the first byte */
	    size_t clen;
	    wchar_t wc;
	    clen = utf8toucs(&wc, inbuf);
	    if(clen > 0 && inb >= clen) {
		inbuf += clen; inb -= clen;
# ifndef Win32
		if((unsigned int) wc < 65536) {
# endif
		    snprintf(outbuf, 9, "<U+%04X>", (unsigned int) wc);
		    outbuf += 8; outb -= 8;
# ifndef Win32
		} else {
		    snprintf(outbuf, 13, "<U+%08X>", (unsigned int) wc);
		    outbuf += 12; outb -= 12;
		}
# endif
	    } else {
		snprintf(outbuf, 5, "<%02x>", (unsigned char)*inbuf);
		outbuf += 4; outb -= 4;
		inbuf++; inb--;
	    }
	} else {
	    snprintf(outbuf, 5, "<%02x>", (unsigned char)*inbuf);
	    outbuf += 4; outb -= 4;
	    inbuf++; inb--;
	}
	goto next_char;
    }
    *outbuf = '\0';
    SEXP Sans = install(cbuff.data);
    R_FreeStringBuffer(&cbuff);
    return Sans;
}
Ejemplo n.º 13
0
/* This may return a R_alloc-ed result, so the caller has to manage the
   R_alloc stack */
attribute_hidden /* but not hidden on Windows, where it was used in tcltk.c */
const wchar_t *wtransChar(SEXP x)
{
    void * obj;
    const char *inbuf, *ans = CHAR(x);
    char *outbuf;
    wchar_t *p;
    size_t inb, outb, res, top;
    Rboolean knownEnc = FALSE;
    R_StringBuffer cbuff = {NULL, 0, MAXELTSIZE};

    if(TYPEOF(x) != CHARSXP)
	error(_("'%s' must be called on a CHARSXP"), "wtransChar");

    if(IS_BYTES(x))
	error(_("translating strings with \"bytes\" encoding is not allowed"));

    if(IS_LATIN1(x)) {
	if(!latin1_wobj) {
	    obj = Riconv_open(TO_WCHAR, "latin1");
	    if(obj == (void *)(-1))
		error(_("unsupported conversion from '%s' to '%s'"),
		      "latin1", TO_WCHAR);
	    latin1_wobj = obj;
	} else
	    obj = latin1_wobj;
	knownEnc = TRUE;
    } else if(IS_UTF8(x)) {
	if(!utf8_wobj) {
	    obj = Riconv_open(TO_WCHAR, "UTF-8");
	    if(obj == (void *)(-1)) 
		error(_("unsupported conversion from '%s' to '%s'"),
		      "latin1", TO_WCHAR);
	    utf8_wobj = obj;
	} else
	    obj = utf8_wobj;
	knownEnc = TRUE;
    } else {
	obj = Riconv_open(TO_WCHAR, "");
	if(obj == (void *)(-1))
#ifdef Win32
	    error(_("unsupported conversion to '%s' from codepage %d"),
		  TO_WCHAR, localeCP);
#else
	    error(_("unsupported conversion from '%s' to '%s'"), "", TO_WCHAR);
#endif
    }

    R_AllocStringBuffer(0, &cbuff);
top_of_loop:
    inbuf = ans; inb = strlen(inbuf);
    outbuf = cbuff.data; top = outb = cbuff.bufsize - 1;
    /* First initialize output */
    Riconv (obj, NULL, NULL, &outbuf, &outb);
next_char:
    /* Then convert input  */
    res = Riconv(obj, &inbuf , &inb, &outbuf, &outb);
    if(res == -1 && errno == E2BIG) {
	R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
	goto top_of_loop;
    } else if(res == -1 && (errno == EILSEQ || errno == EINVAL)) {
	if(outb < 5) {
	    R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
	    goto top_of_loop;
	}
	snprintf(outbuf, 5, "<%02x>", (unsigned char)*inbuf);
	outbuf += 4; outb -= 4;
	inbuf++; inb--;
	goto next_char;
	/* if(!knownEnc) Riconv_close(obj);
	   error(_("invalid input in wtransChar")); */
    }
    if(!knownEnc) Riconv_close(obj);
    res = (top - outb);
    /* terminator is 2 or 4 null bytes */
    p = (wchar_t *) R_alloc(res+4, 1);
    memset(p, 0, res+4);
    memcpy(p, cbuff.data, res);
    R_FreeStringBuffer(&cbuff);
    return p;
}
Ejemplo n.º 14
0
/* Note that NA_STRING is not handled separately here.  This is
   deliberate -- see ?paste -- and implicitly coerces it to "NA"
*/
SEXP attribute_hidden do_paste(SEXP call, SEXP op, SEXP args, SEXP env)
{
    SEXP ans, collapse, sep, x;
    int sepw, u_sepw, ienc;
    R_xlen_t i, j, k, maxlen, nx, pwidth;
    const char *s, *cbuf, *csep=NULL, *u_csep=NULL;
    char *buf;
    Rboolean allKnown, anyKnown, use_UTF8, use_Bytes,
	sepASCII = TRUE, sepUTF8 = FALSE, sepBytes = FALSE, sepKnown = FALSE,
	use_sep = (PRIMVAL(op) == 0);
    const void *vmax;

    checkArity(op, args);

    /* We use formatting and so we must initialize printing. */

    PrintDefaults();

    /* Check the arguments */

    x = CAR(args);
    if (!isVectorList(x))
	error(_("invalid first argument"));
    nx = xlength(x);

    if(use_sep) { /* paste(..., sep, .) */
	sep = CADR(args);
	if (!isString(sep) || LENGTH(sep) <= 0 || STRING_ELT(sep, 0) == NA_STRING)
	    error(_("invalid separator"));
	sep = STRING_ELT(sep, 0);
	csep = translateChar(sep);
	u_sepw = sepw = (int) strlen(csep); // will be short
	sepASCII = strIsASCII(csep);
	sepKnown = ENC_KNOWN(sep) > 0;
	sepUTF8 = IS_UTF8(sep);
	sepBytes = IS_BYTES(sep);
	collapse = CADDR(args);
    } else { /* paste0(..., .) */
	u_sepw = sepw = 0; sep = R_NilValue;/* -Wall */
	collapse = CADR(args);
    }
    if (!isNull(collapse))
	if(!isString(collapse) || LENGTH(collapse) <= 0 ||
	   STRING_ELT(collapse, 0) == NA_STRING)
	    error(_("invalid '%s' argument"), "collapse");
    if(nx == 0)
	return (!isNull(collapse)) ? mkString("") : allocVector(STRSXP, 0);


    /* Maximum argument length, coerce if needed */

    maxlen = 0;
    for (j = 0; j < nx; j++) {
	if (!isString(VECTOR_ELT(x, j))) {
	    /* formerly in R code: moved to C for speed */
	    SEXP call, xj = VECTOR_ELT(x, j);
	    if(OBJECT(xj)) { /* method dispatch */
		PROTECT(call = lang2(install("as.character"), xj));
		SET_VECTOR_ELT(x, j, eval(call, env));
		UNPROTECT(1);
	    } else if (isSymbol(xj))
		SET_VECTOR_ELT(x, j, ScalarString(PRINTNAME(xj)));
	    else
		SET_VECTOR_ELT(x, j, coerceVector(xj, STRSXP));

	    if (!isString(VECTOR_ELT(x, j)))
		error(_("non-string argument to internal 'paste'"));
	}
	if(xlength(VECTOR_ELT(x, j)) > maxlen)
	    maxlen = xlength(VECTOR_ELT(x, j));
    }
    if(maxlen == 0)
	return (!isNull(collapse)) ? mkString("") : allocVector(STRSXP, 0);

    PROTECT(ans = allocVector(STRSXP, maxlen));

    for (i = 0; i < maxlen; i++) {
	/* Strategy for marking the encoding: if all inputs (including
	 * the separator) are ASCII, so is the output and we don't
	 * need to mark.  Otherwise if all non-ASCII inputs are of
	 * declared encoding, we should mark.
	 * Need to be careful only to include separator if it is used.
	 */
	anyKnown = FALSE; allKnown = TRUE; use_UTF8 = FALSE; use_Bytes = FALSE;
	if(nx > 1) {
	    allKnown = sepKnown || sepASCII;
	    anyKnown = sepKnown;
	    use_UTF8 = sepUTF8;
	    use_Bytes = sepBytes;
	}

	pwidth = 0;
	for (j = 0; j < nx; j++) {
	    k = xlength(VECTOR_ELT(x, j));
	    if (k > 0) {
		SEXP cs = STRING_ELT(VECTOR_ELT(x, j), i % k);
		if(IS_UTF8(cs)) use_UTF8 = TRUE;
		if(IS_BYTES(cs)) use_Bytes = TRUE;
	    }
	}
	if (use_Bytes) use_UTF8 = FALSE;
	vmax = vmaxget();
	for (j = 0; j < nx; j++) {
	    k = xlength(VECTOR_ELT(x, j));
	    if (k > 0) {
		if(use_Bytes)
		    pwidth += strlen(CHAR(STRING_ELT(VECTOR_ELT(x, j), i % k)));
		else if(use_UTF8)
		    pwidth += strlen(translateCharUTF8(STRING_ELT(VECTOR_ELT(x, j), i % k)));
		else
		    pwidth += strlen(translateChar(STRING_ELT(VECTOR_ELT(x, j), i % k)));
		vmaxset(vmax);
	    }
	}
	if(use_sep) {
	    if (use_UTF8 && !u_csep) {
		u_csep = translateCharUTF8(sep);
		u_sepw = (int) strlen(u_csep); // will be short
	    }
	    pwidth += (nx - 1) * (use_UTF8 ? u_sepw : sepw);
	}
	if (pwidth > INT_MAX)
	    error(_("result would exceed 2^31-1 bytes"));
	cbuf = buf = R_AllocStringBuffer(pwidth, &cbuff);
	vmax = vmaxget();
	for (j = 0; j < nx; j++) {
	    k = xlength(VECTOR_ELT(x, j));
	    if (k > 0) {
		SEXP cs = STRING_ELT(VECTOR_ELT(x, j), i % k);
		if (use_UTF8) {
		    s = translateCharUTF8(cs);
		    strcpy(buf, s);
		    buf += strlen(s);
		} else {
		    s = use_Bytes ? CHAR(cs) : translateChar(cs);
		    strcpy(buf, s);
		    buf += strlen(s);
		    allKnown = allKnown && (strIsASCII(s) || (ENC_KNOWN(cs)> 0));
		    anyKnown = anyKnown || (ENC_KNOWN(cs)> 0);
		}
	    }
	    if (sepw != 0 && j != nx - 1) {
		if (use_UTF8) {
		    strcpy(buf, u_csep);
		    buf += u_sepw;
		} else {
		    strcpy(buf, csep);
		    buf += sepw;
		}
	    }
	    vmax = vmaxget();
	}
	ienc = 0;
	if(use_UTF8) ienc = CE_UTF8;
	else if(use_Bytes) ienc = CE_BYTES;
	else if(anyKnown && allKnown) {
	    if(known_to_be_latin1) ienc = CE_LATIN1;
	    if(known_to_be_utf8) ienc = CE_UTF8;
	}
	SET_STRING_ELT(ans, i, mkCharCE(cbuf, ienc));
    }

    /* Now collapse, if required. */

    if(collapse != R_NilValue && (nx = XLENGTH(ans)) > 0) {
	sep = STRING_ELT(collapse, 0);
	use_UTF8 = IS_UTF8(sep);
	use_Bytes = IS_BYTES(sep);
	for (i = 0; i < nx; i++) {
	    if(IS_UTF8(STRING_ELT(ans, i))) use_UTF8 = TRUE;
	    if(IS_BYTES(STRING_ELT(ans, i))) use_Bytes = TRUE;
	}
	if(use_Bytes) {
	    csep = CHAR(sep);
	    use_UTF8 = FALSE;
	} else if(use_UTF8)
	    csep = translateCharUTF8(sep);
	else
	    csep = translateChar(sep);
	sepw = (int) strlen(csep);
	anyKnown = ENC_KNOWN(sep) > 0;
	allKnown = anyKnown || strIsASCII(csep);
	pwidth = 0;
	vmax = vmaxget();
	for (i = 0; i < nx; i++)
	    if(use_UTF8) {
		pwidth += strlen(translateCharUTF8(STRING_ELT(ans, i)));
		vmaxset(vmax);
	    } else /* already translated */
		pwidth += strlen(CHAR(STRING_ELT(ans, i)));
	pwidth += (nx - 1) * sepw;
	if (pwidth > INT_MAX)
	    error(_("result would exceed 2^31-1 bytes"));
	cbuf = buf = R_AllocStringBuffer(pwidth, &cbuff);
	vmax = vmaxget();
	for (i = 0; i < nx; i++) {
	    if(i > 0) {
		strcpy(buf, csep);
		buf += sepw;
	    }
	    if(use_UTF8)
		s = translateCharUTF8(STRING_ELT(ans, i));
	    else /* already translated */
		s = CHAR(STRING_ELT(ans, i));
	    strcpy(buf, s);
	    while (*buf)
		buf++;
	    allKnown = allKnown &&
		(strIsASCII(s) || (ENC_KNOWN(STRING_ELT(ans, i)) > 0));
	    anyKnown = anyKnown || (ENC_KNOWN(STRING_ELT(ans, i)) > 0);
	    if(use_UTF8) vmaxset(vmax);
	}
	UNPROTECT(1);
	ienc = CE_NATIVE;
	if(use_UTF8) ienc = CE_UTF8;
	else if(use_Bytes) ienc = CE_BYTES;
	else if(anyKnown && allKnown) {
	    if(known_to_be_latin1) ienc = CE_LATIN1;
	    if(known_to_be_utf8) ienc = CE_UTF8;
	}
	PROTECT(ans = allocVector(STRSXP, 1));
	SET_STRING_ELT(ans, 0, mkCharCE(cbuf, ienc));
    }
    R_FreeStringBufferL(&cbuff);
    UNPROTECT(1);
    return ans;
}
Ejemplo n.º 15
0
/* format.default(x, trim, digits, nsmall, width, justify, na.encode,
		  scientific) */
SEXP attribute_hidden do_format(SEXP call, SEXP op, SEXP args, SEXP env)
{
    SEXP l, x, y, swd;
    int il, digits, trim = 0, nsmall = 0, wd = 0, adj = -1, na, sci = 0;
    int w, d, e;
    int wi, di, ei, scikeep;
    const char *strp;
    R_xlen_t i, n;

    checkArity(op, args);
    PrintDefaults();
    scikeep = R_print.scipen;

    if (isEnvironment(x = CAR(args))) {
	return mkString(EncodeEnvironment(x));
    }
    else if (!isVector(x))
	error(_("first argument must be atomic"));
    args = CDR(args);

    trim = asLogical(CAR(args));
    if (trim == NA_INTEGER)
	error(_("invalid '%s' argument"), "trim");
    args = CDR(args);

    if (!isNull(CAR(args))) {
	digits = asInteger(CAR(args));
	if (digits == NA_INTEGER || digits < R_MIN_DIGITS_OPT
	    || digits > R_MAX_DIGITS_OPT)
	    error(_("invalid '%s' argument"), "digits");
	R_print.digits = digits;
    }
    args = CDR(args);

    nsmall = asInteger(CAR(args));
    if (nsmall == NA_INTEGER || nsmall < 0 || nsmall > 20)
	error(_("invalid '%s' argument"), "nsmall");
    args = CDR(args);

    if (isNull(swd = CAR(args))) wd = 0; else wd = asInteger(swd);
    if(wd == NA_INTEGER)
	error(_("invalid '%s' argument"), "width");
    args = CDR(args);

    adj = asInteger(CAR(args));
    if(adj == NA_INTEGER || adj < 0 || adj > 3)
	error(_("invalid '%s' argument"), "justify");
    args = CDR(args);

    na = asLogical(CAR(args));
    if(na == NA_LOGICAL)
	error(_("invalid '%s' argument"), "na.encode");
    args = CDR(args);
    if(LENGTH(CAR(args)) != 1)
	error(_("invalid '%s' argument"), "scientific");
    if(isLogical(CAR(args))) {
	int tmp = LOGICAL(CAR(args))[0];
	if(tmp == NA_LOGICAL) sci = NA_INTEGER;
	else sci = tmp > 0 ?-100 : 100;
    } else if (isNumeric(CAR(args))) {
	sci = asInteger(CAR(args));
    } else
	error(_("invalid '%s' argument"), "scientific");
    if(sci != NA_INTEGER) R_print.scipen = sci;

    if ((n = XLENGTH(x)) <= 0) {
	PROTECT(y = allocVector(STRSXP, 0));
    } else {
	switch (TYPEOF(x)) {

	case LGLSXP:
	    PROTECT(y = allocVector(STRSXP, n));
	    if (trim) w = 0; else formatLogical(LOGICAL(x), n, &w);
	    w = imax2(w, wd);
	    for (i = 0; i < n; i++) {
		strp = EncodeLogical(LOGICAL(x)[i], w);
		SET_STRING_ELT(y, i, mkChar(strp));
	    }
	    break;

	case INTSXP:
	    PROTECT(y = allocVector(STRSXP, n));
	    if (trim) w = 0;
	    else formatInteger(INTEGER(x), n, &w);
	    w = imax2(w, wd);
	    for (i = 0; i < n; i++) {
		strp = EncodeInteger(INTEGER(x)[i], w);
		SET_STRING_ELT(y, i, mkChar(strp));
	    }
	    break;

	case REALSXP:
	    formatReal(REAL(x), n, &w, &d, &e, nsmall);
	    if (trim) w = 0;
	    w = imax2(w, wd);
	    PROTECT(y = allocVector(STRSXP, n));
	    for (i = 0; i < n; i++) {
		strp = EncodeReal0(REAL(x)[i], w, d, e, OutDec);
		SET_STRING_ELT(y, i, mkChar(strp));
	    }
	    break;

	case CPLXSXP:
	    formatComplex(COMPLEX(x), n, &w, &d, &e, &wi, &di, &ei, nsmall);
	    if (trim) wi = w = 0;
	    w = imax2(w, wd); wi = imax2(wi, wd);
	    PROTECT(y = allocVector(STRSXP, n));
	    for (i = 0; i < n; i++) {
		strp = EncodeComplex(COMPLEX(x)[i], w, d, e, wi, di, ei, OutDec);
		SET_STRING_ELT(y, i, mkChar(strp));
	    }
	    break;

	case STRSXP:
	{
	    /* this has to be different from formatString/EncodeString as
	       we don't actually want to encode here */
	    const char *s;
	    char *q;
	    int b, b0, cnt = 0, j;
	    SEXP s0, xx;

	    /* This is clumsy, but it saves rewriting and re-testing
	       this complex code */
	    PROTECT(xx = duplicate(x));
	    for (i = 0; i < n; i++) {
		SEXP tmp =  STRING_ELT(xx, i);
		if(IS_BYTES(tmp)) {
		    const char *p = CHAR(tmp), *q;
		    char *pp = R_alloc(4*strlen(p)+1, 1), *qq = pp, buf[5];
		    for (q = p; *q; q++) {
			unsigned char k = (unsigned char) *q;
			if (k >= 0x20 && k < 0x80) {
			    *qq++ = *q;
			} else {
			    snprintf(buf, 5, "\\x%02x", k);
			    for(int j = 0; j < 4; j++) *qq++ = buf[j];
			}
		    }
		    *qq = '\0';
		    s = pp;
		} else s = translateChar(tmp);
		if(s != CHAR(tmp)) SET_STRING_ELT(xx, i, mkChar(s));
	    }

	    w = wd;
	    if (adj != Rprt_adj_none) {
		for (i = 0; i < n; i++)
		    if (STRING_ELT(xx, i) != NA_STRING)
			w = imax2(w, Rstrlen(STRING_ELT(xx, i), 0));
		    else if (na) w = imax2(w, R_print.na_width);
	    } else w = 0;
	    /* now calculate the buffer size needed, in bytes */
	    for (i = 0; i < n; i++)
		if (STRING_ELT(xx, i) != NA_STRING) {
		    il = Rstrlen(STRING_ELT(xx, i), 0);
		    cnt = imax2(cnt, LENGTH(STRING_ELT(xx, i)) + imax2(0, w-il));
		} else if (na) cnt  = imax2(cnt, R_print.na_width + imax2(0, w-R_print.na_width));
	    R_CheckStack2(cnt+1);
	    char buff[cnt+1];
	    PROTECT(y = allocVector(STRSXP, n));
	    for (i = 0; i < n; i++) {
		if(!na && STRING_ELT(xx, i) == NA_STRING) {
		    SET_STRING_ELT(y, i, NA_STRING);
		} else {
		    q = buff;
		    if(STRING_ELT(xx, i) == NA_STRING) s0 = R_print.na_string;
		    else s0 = STRING_ELT(xx, i) ;
		    s = CHAR(s0);
		    il = Rstrlen(s0, 0);
		    b = w - il;
		    if(b > 0 && adj != Rprt_adj_left) {
			b0 = (adj == Rprt_adj_centre) ? b/2 : b;
			for(j = 0 ; j < b0 ; j++) *q++ = ' ';
			b -= b0;
		    }
		    for(j = 0; j < LENGTH(s0); j++) *q++ = *s++;
		    if(b > 0 && adj != Rprt_adj_right)
			for(j = 0 ; j < b ; j++) *q++ = ' ';
		    *q = '\0';
		    SET_STRING_ELT(y, i, mkChar(buff));
		}
	    }
	}
	UNPROTECT(2); /* xx , y */
	PROTECT(y);
	break;
	default:
	    error(_("Impossible mode ( x )")); y = R_NilValue;/* -Wall */
	}
    }
    if((l = getAttrib(x, R_DimSymbol)) != R_NilValue) {
	setAttrib(y, R_DimSymbol, l);
	if((l = getAttrib(x, R_DimNamesSymbol)) != R_NilValue)
	    setAttrib(y, R_DimNamesSymbol, l);
    } else if((l = getAttrib(x, R_NamesSymbol)) != R_NilValue)
	setAttrib(y, R_NamesSymbol, l);

    /* In case something else forgets to set PrintDefaults(), PR#14477 */
    R_print.scipen = scikeep;

    UNPROTECT(1); /* y */
    return y;
}
Ejemplo n.º 16
0
/**
 * Count the number of characters in a string
 *
 * Note that ICU permits only strings of length < 2^31.
 * @param s R character vector
 * @return integer vector
 * @version 0.1 (Marcin Bujarski)
 * @version 0.2 (Marek Gagolewski) Multiple input encoding support
 * @version 0.3 (Marek Gagolewski, 2013-06-16) make StriException-friendly
 */
SEXP stri_length(SEXP str)
{
   str = stri_prepare_arg_string(str, "str");
   R_len_t ns = LENGTH(str);
   SEXP ret;

   UConverter* uconv = NULL;
   bool uconv_8bit = false;
   bool uconv_utf8 = false;

   STRI__ERROR_HANDLER_BEGIN

/* Note: ICU50 permits only int-size strings in U8_NEXT and U8_FWD_1 */
#define STRI_LENGTH_CALCULATE_UTF8  \
   const char* qc = CHAR(q);        \
   R_len_t j = 0;                   \
   for (R_len_t i = 0; i < nq; j++) \
      U8_FWD_1(qc, i, nq);          \
   retint[k] = j;

   PROTECT(ret = Rf_allocVector(INTSXP, ns));
   int* retint = INTEGER(ret);
   for (R_len_t k = 0; k < ns; k++) {
      SEXP q = STRING_ELT(str, k);
      if (q == NA_STRING)
         retint[k] = NA_INTEGER;
      else {
         R_len_t nq = LENGTH(q);  // O(1) - stored by R

         // We trust (is that a wise assumption?)
         // R encoding marks; However, it there is no mark,
         // the string may have any encoding (ascii, latin1, utf8, native)
         if (IS_ASCII(q) || IS_LATIN1(q))
            retint[k] = nq;
         else if (IS_BYTES(q))
            throw StriException(MSG__BYTESENC);
         else if (IS_UTF8(q)) {
            STRI_LENGTH_CALCULATE_UTF8
         }
         else { // Any encoding - detection needed
//            UTF-8 strings can be fairly reliably recognized as such by a
//            simple algorithm, i.e., the probability that a string of
//            characters in any other encoding appears as valid UTF-8 is low,
//            diminishing with increasing string length.
//            We have two possibilities here:
//            1. Auto detect encoding: Is this ASCII or UTF-8? If not => use Native
//                This won't work correctly in some cases.
//                e.g. (c4,85) represents ("Polish a with ogonek") in UTF-8
//                and ("A umlaut", "Ellipsis") in WINDOWS-1250
//            2. Assume it's Native; this assumes the user working in an 8-bit environment
//                would convert strings to UTF-8 manually if needed - I think is's
//                a more reasonable approach (Native --> input via keyboard)

            if (!uconv) { // open ucnv on demand
               uconv = stri__ucnv_open((const char*)NULL); // native decoder
               if (!uconv) {
                  retint[k] = NA_INTEGER;
                  continue;
               }
               uconv_8bit = ((int)ucnv_getMaxCharSize(uconv) == 1);
               if (!uconv_8bit) {
                  UErrorCode err = U_ZERO_ERROR;
                  const char* name = ucnv_getName(uconv, &err);
                  if (U_FAILURE(err))
                     throw StriException("could not query default converter");
                  uconv_utf8 = !strncmp("UTF-8", name, 5);
               }
            }

            if (uconv_8bit) {
               retint[k] = nq; // it's an 8-bit encoding :-)
            }
            else if (uconv_utf8) { // it's UTF-8
               STRI_LENGTH_CALCULATE_UTF8
            }
            else {  // native encoding which is neither 8-bit, nor UTF-8 (e.g. 'Big5')
               UErrorCode err = U_ZERO_ERROR;
               const char* source = CHAR(q);
               const char* sourceLimit = source + nq;
               R_len_t j;
               for (j = 0; source != sourceLimit; j++) {
                  if (U_FAILURE(err)) break; // error from previous iteration
                  // iterate through each native-encoded character:
                  ucnv_getNextUChar(uconv, &source, sourceLimit, &err);
               }
               if (U_FAILURE(err)) {  // error from last iteration
                  Rf_warning("error determining length for native, neither 8-bit- nor UTF-8-encoded string.");
                  retint[k] = NA_INTEGER;
               }
               else
                  retint[k] = j; // all right, we got it!
            }
         }
      }