Exemple #1
0
/*
 * Copy characters sa..so (1-based, inclusive) of str into buf and
 * NUL-terminate buf.
 *
 * How a "character" is delimited depends on ienc:
 *   - CE_UTF8:            characters are stepped over with utf8clen();
 *   - CE_LATIN1/CE_BYTES: one byte per character;
 *   - anything else:      multibyte stepping via Mbrtowc() when running in
 *                         a multibyte locale on a non-ASCII string,
 *                         otherwise one byte per character.
 *
 * The UTF-8 and multibyte paths stop at the end of str; the byte-wise path
 * does not check, so the caller must pass a range that lies inside str.
 * buf must be large enough for the copied bytes plus the terminator.
 */
static void substr(char *buf, const char *str, int ienc, int sa, int so)
{
    int pos, k, nbytes;

    if (ienc == CE_UTF8) {
        const char *stop = str + strlen(str);

        /* pos counts characters already consumed (0-based) */
        for (pos = 0; pos < so && str < stop; pos++) {
            nbytes = utf8clen(*str);
            if (pos < sa - 1) {
                /* still before the requested range: skip this character */
                str += nbytes;
            } else {
                for (k = 0; k < nbytes; k++)
                    *buf++ = *str++;
            }
        }
    } else if (ienc != CE_LATIN1 && ienc != CE_BYTES &&
               mbcslocale && !strIsASCII(str)) {
        const char *stop = str + strlen(str);
        mbstate_t state;

        mbs_init(&state);

        /* skip the sa-1 leading characters */
        pos = 1;
        while (pos < sa) {
            str += Mbrtowc(NULL, str, MB_CUR_MAX, &state);
            pos++;
        }

        /* copy characters sa..so, one multibyte sequence at a time */
        for (pos = sa; pos <= so && str < stop; pos++) {
            nbytes = (int) Mbrtowc(NULL, str, MB_CUR_MAX, &state);
            for (k = 0; k < nbytes; k++)
                *buf++ = *str++;
        }
    } else {
        /* single-byte characters: positions are byte offsets */
        str += (sa - 1);
        pos = sa;
        while (pos <= so) {
            *buf++ = *str++;
            pos++;
        }
    }

    *buf = '\0';
}
Exemple #2
0
/*
 * Decode the multibyte character whose first byte is c into *wc.
 *
 * NOTE: the whole implementation below is compiled out with `#if 0`.
 * As built, this function ignores both arguments and returns 0.
 *
 * The disabled implementation, kept for reference, works as follows:
 *   - bytes below 0x80 are treated as single-byte characters (returns 1);
 *   - in a UTF-8 locale the sequence length comes from utf8clen(c) and the
 *     remaining bytes are fetched with xxgetc();
 *   - otherwise bytes are fetched one at a time until mbtoucs() reports a
 *     complete character or MB_CUR_MAX bytes have been tried;
 *   - every byte after the first is pushed back with xxungetc() before
 *     returning, and the return value is the sequence length in bytes.
 * EOF in mid-sequence or an invalid sequence is reported through the
 * PROBLEM ... ERROR macros.
 */
static int mbcs_get_next2(int c, ucs_t *wc)
{
#if 0
    int i, res, clen = 1; char s[9];

    s[0] = c;
    /* This assumes (probably OK) that all MBCS embed ASCII as single-byte
       lead bytes, including control chars */
    if((unsigned int) c < 0x80) {
	*wc = (wchar_t) c;
	return 1;
    }
    if(utf8locale) {
	/* UTF-8: the lead byte determines the sequence length */
	clen = utf8clen(c);
	for(i = 1; i < clen; i++) {
	    s[i] = xxgetc();
	    if(s[i] == R_EOF) {
		PROBLEM "EOF whilst reading MBCS char"
		    ERROR;
	    }
	}
	s[clen] ='\0'; /* x86 Solaris requires this */
	res = mbtoucs(wc, s, clen);
	if(res == -1) {
	    PROBLEM "invalid multibyte character in parser"
		ERROR;
	}
    } else {
	/* This is not necessarily correct for stateful MBCS */
	/* Grow the candidate sequence byte by byte until it converts */
	while(clen <= MB_CUR_MAX) {
	    res = mbtoucs(wc, s, clen);
	    if(res >= 0) break;
	    if(res == -1) {
		PROBLEM "invalid multibyte character in parser"
		    ERROR;
	    }
	    /* so res == -2 */
	    c = xxgetc();
	    if(c == R_EOF) {
		PROBLEM "EOF whilst reading MBCS char"
		    ERROR;
	    }
	    s[clen++] = c;
	} /* we've tried enough, so must be complete or invalid by now */
    }
    /* Push back all bytes except the first, last byte first */
    for(i = clen - 1; i > 0; i--) xxungetc(s[i]);
    return clen;
#else
    /* Active build: stub */
    return(0);
#endif
 }
Exemple #3
0
/*
 * Replace characters sa..so (1-based, inclusive) of buf, in place, by the
 * leading characters of str.
 *
 * At most so-sa+1 characters are taken from str, fewer if str is shorter.
 * The way characters are delimited depends on ienc:
 *   - CE_UTF8:            stepped with utf8clen();
 *   - CE_LATIN1/CE_BYTES: one byte per character;
 *   - anything else:      stepped with Mbrtowc() in a multibyte locale,
 *                         otherwise one byte per character.
 *
 * In the variable-width paths the replacement may occupy a different
 * number of bytes than the characters it replaces; the tail of buf
 * (including its terminator) is shifted with memmove() to make the
 * replacement fit exactly.  NOTE(review): nothing here checks the capacity
 * of buf, so the caller presumably allocates enough room for growth.
 *
 * The length of str is computed once up front: the original re-evaluated
 * strlen(str) in the loop condition on every iteration.  str is not
 * modified while the loops run, so the value is loop-invariant.
 */
static void
substrset(char *buf, const char *const str, cetype_t ienc, int sa, int so)
{
    int i, in = 0, out = 0;
    const size_t slen = strlen(str);

    if (ienc == CE_UTF8) {
        /* advance buf to the first character to be replaced */
        for (i = 1; i < sa; i++) buf += utf8clen(*buf);

        /* in  = bytes to take from str, out = bytes to drop from buf */
        for (i = sa; i <= so && (size_t) in < slen; i++) {
            in += utf8clen(str[in]);
            out += utf8clen(buf[out]);
            if (!str[in]) break;
        }
        /* open or close the gap so that exactly `in` bytes fit */
        if (in != out) memmove(buf + in, buf + out, strlen(buf + out) + 1);
        memcpy(buf, str, in);
    } else if (ienc == CE_LATIN1 || ienc == CE_BYTES || !mbcslocale) {
        /* single-byte characters: overwrite in place, never resizing */
        in = (int) slen;
        out = so - sa + 1;
        memcpy(buf + sa - 1, str, (in < out) ? in : out);
    } else {
        /* This cannot work for stateful encodings */
        for (i = 1; i < sa; i++) buf += Mbrtowc(NULL, buf, MB_CUR_MAX, NULL);

        /* now work out how many bytes to replace by how many */
        for (i = sa; i <= so && (size_t) in < slen; i++) {
            in += (int) Mbrtowc(NULL, str + in, MB_CUR_MAX, NULL);
            out += (int) Mbrtowc(NULL, buf + out, MB_CUR_MAX, NULL);
            if (!str[in]) break;
        }
        if (in != out) memmove(buf + in, buf + out, strlen(buf + out) + 1);
        memcpy(buf, str, in);
    }
}
Exemple #4
0
/*
 * Convert an X11 keysym to a NUL-terminated UTF-8 string.
 *
 * buf   receives the encoded bytes followed by '\0'; it must have room for
 *       the longest sequence written here (6 bytes) plus the terminator.
 * ksym  the keysym to convert.
 *
 * Returns true when buf was filled in, false when the keysym falls outside
 * every known range or would need a 7-byte sequence.
 *
 * Keysyms of the form 0x01xxxxxx carry the code point in the low 24 bits;
 * 0x01..0xff map to themselves; every other supported range is looked up
 * in the matching keysym_to_unicode_<first>_<last> table, indexed from the
 * first keysym of that table.
 */
static bool
keysym_to_utf8(char *buf, const xcb_keysym_t ksym)
{
    unsigned int ksym_conv;
    int count;

    /* Unicode keysym */
    if((ksym & 0xff000000) == 0x01000000)
        ksym_conv = ksym & 0x00ffffff;
    else if(ksym > 0 && ksym < 0x100)
        ksym_conv = ksym;
    else if(ksym > 0x1a0 && ksym < 0x200)
        ksym_conv = keysym_to_unicode_1a1_1ff[ksym - 0x1a1];
    else if(ksym > 0x2a0 && ksym < 0x2ff)
        ksym_conv = keysym_to_unicode_2a1_2fe[ksym - 0x2a1];
    else if(ksym > 0x3a1 && ksym < 0x3ff)
        ksym_conv = keysym_to_unicode_3a2_3fe[ksym - 0x3a2];
    else if(ksym > 0x4a0 && ksym < 0x4e0)
        ksym_conv = keysym_to_unicode_4a1_4df[ksym - 0x4a1];
    /* Lower bound must be 0x58f: the table starts at 0x590, so admitting
     * 0x58a..0x58f (as "> 0x589" did) produced an underflowing index and an
     * out-of-bounds read. */
    else if(ksym > 0x58f && ksym < 0x5ff)
        ksym_conv = keysym_to_unicode_590_5fe[ksym - 0x590];
    else if(ksym > 0x67f && ksym < 0x700)
        ksym_conv = keysym_to_unicode_680_6ff[ksym - 0x680];
    else if(ksym > 0x7a0 && ksym < 0x7fa)
        ksym_conv = keysym_to_unicode_7a1_7f9[ksym - 0x7a1];
    else if(ksym > 0x8a3 && ksym < 0x8ff)
        ksym_conv = keysym_to_unicode_8a4_8fe[ksym - 0x8a4];
    else if(ksym > 0x9de && ksym < 0x9f9)
        ksym_conv = keysym_to_unicode_9df_9f8[ksym - 0x9df];
    else if(ksym > 0xaa0 && ksym < 0xaff)
        ksym_conv = keysym_to_unicode_aa1_afe[ksym - 0xaa1];
    else if(ksym > 0xcde && ksym < 0xcfb)
        ksym_conv = keysym_to_unicode_cdf_cfa[ksym - 0xcdf];
    else if(ksym > 0xda0 && ksym < 0xdfa)
        ksym_conv = keysym_to_unicode_da1_df9[ksym - 0xda1];
    else if(ksym > 0xe9f && ksym < 0xf00)
        ksym_conv = keysym_to_unicode_ea0_eff[ksym - 0xea0];
    else if(ksym > 0x12a0 && ksym < 0x12ff)
        ksym_conv = keysym_to_unicode_12a1_12fe[ksym - 0x12a1];
    else if(ksym > 0x13bb && ksym < 0x13bf)
        ksym_conv = keysym_to_unicode_13bc_13be[ksym - 0x13bc];
    else if(ksym > 0x14a0 && ksym < 0x1500)
        ksym_conv = keysym_to_unicode_14a1_14ff[ksym - 0x14a1];
    else if(ksym > 0x15cf && ksym < 0x15f7)
        ksym_conv = keysym_to_unicode_15d0_15f6[ksym - 0x15d0];
    else if(ksym > 0x169f && ksym < 0x16f7)
        ksym_conv = keysym_to_unicode_16a0_16f6[ksym - 0x16a0];
    else if(ksym > 0x1e9e && ksym < 0x1f00)
        ksym_conv = keysym_to_unicode_1e9f_1eff[ksym - 0x1e9f];
    else if(ksym > 0x209f && ksym < 0x20ad)
        ksym_conv = keysym_to_unicode_20a0_20ac[ksym - 0x20a0];
    else
        return false;

    /* Emit the sequence back to front: each continuation byte takes the
     * low 6 bits (10xxxxxx), then the lead byte gets the remaining bits
     * OR-ed with the length marker.  The cases fall through on purpose. */
    count = utf8clen(ksym_conv);
    switch(count)
    {
      case 7: return false;
      case 6: buf[5] = (ksym_conv | 0x80) & 0xbf; ksym_conv >>= 6;
              /* fallthrough */
      case 5: buf[4] = (ksym_conv | 0x80) & 0xbf; ksym_conv >>= 6;
              /* fallthrough */
      case 4: buf[3] = (ksym_conv | 0x80) & 0xbf; ksym_conv >>= 6;
              /* fallthrough */
      case 3: buf[2] = (ksym_conv | 0x80) & 0xbf; ksym_conv >>= 6;
              /* fallthrough */
      case 2: buf[1] = (ksym_conv | 0x80) & 0xbf; ksym_conv >>= 6;
              /* fallthrough */
      case 1: buf[0] = (ksym_conv | __utf8_mark[count]);
              break;
      default:
              /* any other length: nothing is encoded, as before */
              break;
    }
    buf[count] = '\0';
    return true;
}
Exemple #5
0
/*
 * Size of a single CHARSXP, measured in bytes, characters or display
 * columns.
 *
 * string    the element to measure.
 * type_     Bytes, Chars or Width.
 * allowNA   when true, strings whose size cannot be computed (invalid
 *           multibyte data, or "bytes" encoding for Chars/Width) yield
 *           NA_INTEGER; when false they raise an error.
 * keepNA    result for NA_STRING: NA_INTEGER when true, 2 when false.
 * msg_name  text identifying the element, inserted into error messages.
 *
 * Returns the size, or NA_INTEGER as described above (also for an
 * unrecognised type_).
 *
 * Fix: the Width / multibyte-locale branch tested `allowNA` the wrong way
 * round, raising an error exactly when NA had been allowed and returning
 * NA when it had not.  It now matches every other branch.
 */
int R_nchar(SEXP string, nchar_type type_,
	    Rboolean allowNA, Rboolean keepNA, const char* msg_name)
{
    if (string == NA_STRING)
	return keepNA ? NA_INTEGER : 2;

    switch(type_) {
    case Bytes:
	return LENGTH(string);

    case Chars:
	if (IS_UTF8(string)) {
	    const char *p = CHAR(string);
	    if (!utf8Valid(p)) {
		if (!allowNA)
		    error(_("invalid multibyte string, %s"), msg_name);
		return NA_INTEGER;
	    } else {
		/* one step per character, using the lead byte's length */
		int nc = 0;
		for( ; *p; p += utf8clen(*p)) nc++;
		return nc;
	    }
	} else if (IS_BYTES(string)) {
	    if (!allowNA) /* could do chars 0 */
		error(_("number of characters is not computable in \"bytes\" encoding, %s"),
		      msg_name);
	    return NA_INTEGER;
	} else if (mbcslocale) {
	    /* mbstowcs(NULL, ...) only counts; negative means invalid input */
	    int nc = (int) mbstowcs(NULL, translateChar(string), 0);
	    if (!allowNA && nc < 0)
		error(_("invalid multibyte string, %s"), msg_name);
	    return (nc >= 0 ? nc : NA_INTEGER);
	} else
	    return ((int) strlen(translateChar(string)));

    case Width:
	if (IS_UTF8(string)) {
	    const char *p = CHAR(string);
	    if (!utf8Valid(p)) {
		if (!allowNA)
		    error(_("invalid multibyte string, %s"), msg_name);
		return NA_INTEGER;
	    } else {
		/* sum the display width of each decoded character */
		wchar_t wc1;
		int nc = 0;
		for( ; *p; p += utf8clen(*p)) {
		    utf8toucs(&wc1, p);
		    nc += Ri18n_wcwidth(wc1);
		}
		return nc;
	    }
	} else if (IS_BYTES(string)) {
	    if (!allowNA) /* could do width 0 */
		error(_("width is not computable for %s in \"bytes\" encoding"),
		      msg_name);
	    return NA_INTEGER;
	} else if (mbcslocale) {
	    const char *xi = translateChar(string);
	    int nc = (int) mbstowcs(NULL, xi, 0);
	    if (nc >= 0) {
		const void *vmax = vmaxget();
		wchar_t *wc = (wchar_t *)
		    R_AllocStringBuffer((nc+1)*sizeof(wchar_t), &cbuff);
		mbstowcs(wc, xi, nc + 1);
		int nci18n = Ri18n_wcswidth(wc, 2147483647);
		vmaxset(vmax);
		/* fall back to the character count if no width is reported */
		return (nci18n < 1) ? nc : nci18n;
	    } else if (!allowNA)
		error(_("invalid multibyte string, %s"), msg_name);
	    else
		return NA_INTEGER;
	} else
	    return (int) strlen(translateChar(string));

    } // switch
    return NA_INTEGER; // -Wall
} // R_nchar()
Exemple #6
0
/*
 * Implementation of nchar(x, type, allowNA).
 *
 * args[0]  x, coerced to a character vector (factors are rejected);
 * args[1]  type: a single non-empty string, matched as a prefix against
 *          "bytes", "chars" and "width", in that order;
 * args[2]  allowNA: logical; NA is treated as FALSE.  When TRUE, elements
 *          whose size cannot be computed give NA_integer_; when FALSE they
 *          raise an error.
 *
 * Returns an integer vector of the same length as x, carrying over its
 * names, dim and dimnames.  NA_character_ elements always yield 2.
 *
 * Fix: in the "width" / multibyte-locale branch the `allowNA` test was
 * inverted (error when NA was allowed, NA when it was not); it now agrees
 * with the "chars" branch.
 */
SEXP attribute_hidden do_nchar(SEXP call, SEXP op, SEXP args, SEXP env)
{
    SEXP d, s, x, stype;
    int i, len, allowNA;
    size_t ntype;
    int nc;
    const char *type;
    const char *xi;
    wchar_t *wc;
    const void *vmax;

    checkArity(op, args);
    if (isFactor(CAR(args)))
	error(_("'%s' requires a character vector"), "nchar()");
    PROTECT(x = coerceVector(CAR(args), STRSXP));
    if (!isString(x))
	error(_("'%s' requires a character vector"), "nchar()");
    len = LENGTH(x);
    stype = CADR(args);
    if (!isString(stype) || LENGTH(stype) != 1)
	error(_("invalid '%s' argument"), "type");
    type = CHAR(STRING_ELT(stype, 0)); /* always ASCII */
    ntype = strlen(type);
    if (ntype == 0) error(_("invalid '%s' argument"), "type");
    allowNA = asLogical(CADDR(args));
    if (allowNA == NA_LOGICAL) allowNA = 0;

    PROTECT(s = allocVector(INTSXP, len));
    vmax = vmaxget();
    for (i = 0; i < len; i++) {
	SEXP sxi = STRING_ELT(x, i);
	if (sxi == NA_STRING) {
	    INTEGER(s)[i] = 2;
	    continue;
	}
	/* comparing only ntype bytes allows abbreviated type names */
	if (strncmp(type, "bytes", ntype) == 0) {
	    INTEGER(s)[i] = LENGTH(sxi);
	} else if (strncmp(type, "chars", ntype) == 0) {
	    if (IS_UTF8(sxi)) { /* assume this is valid */
		const char *p = CHAR(sxi);
		nc = 0;
		for( ; *p; p += utf8clen(*p)) nc++;
		INTEGER(s)[i] = nc;
	    } else if (IS_BYTES(sxi)) {
		if (!allowNA) /* could do chars 0 */
		    error(_("number of characters is not computable for element %d in \"bytes\" encoding"), i+1);
		INTEGER(s)[i] = NA_INTEGER;
	    } else if (mbcslocale) {
		/* count only; a negative result means invalid input */
		nc = (int) mbstowcs(NULL, translateChar(sxi), 0);
		if (!allowNA && nc < 0)
		    error(_("invalid multibyte string %d"), i+1);
		INTEGER(s)[i] = nc >= 0 ? nc : NA_INTEGER;
	    } else
		INTEGER(s)[i] = (int) strlen(translateChar(sxi));
	} else if (strncmp(type, "width", ntype) == 0) {
	    if (IS_UTF8(sxi)) { /* assume this is valid */
		const char *p = CHAR(sxi);
		wchar_t wc1;
		nc = 0;
		for( ; *p; p += utf8clen(*p)) {
		    utf8toucs(&wc1, p);
		    nc += Ri18n_wcwidth(wc1);
		}
		INTEGER(s)[i] = nc;
	    } else if (IS_BYTES(sxi)) {
		if (!allowNA) /* could do width 0 */
		    error(_("width is not computable for element %d in \"bytes\" encoding"), i+1);
		INTEGER(s)[i] = NA_INTEGER;
	    } else if (mbcslocale) {
		xi = translateChar(sxi);
		nc = (int) mbstowcs(NULL, xi, 0);
		if (nc >= 0) {
		    wc = (wchar_t *) R_AllocStringBuffer((nc+1)*sizeof(wchar_t), &cbuff);

		    mbstowcs(wc, xi, nc + 1);
		    INTEGER(s)[i] = Ri18n_wcswidth(wc, 2147483647);
		    /* fall back to the character count if no width is reported */
		    if (INTEGER(s)[i] < 1) INTEGER(s)[i] = nc;
		} else if (!allowNA)
		    error(_("invalid multibyte string %d"), i+1);
		else
		    INTEGER(s)[i] = NA_INTEGER;
	    } else
		INTEGER(s)[i] = (int) strlen(translateChar(sxi));
	} else
	    error(_("invalid '%s' argument"), "type");
	/* release anything translateChar() allocated for this element */
	vmaxset(vmax);
    }
    R_FreeStringBufferL(&cbuff);
    if ((d = getAttrib(x, R_NamesSymbol)) != R_NilValue)
	setAttrib(s, R_NamesSymbol, d);
    if ((d = getAttrib(x, R_DimSymbol)) != R_NilValue)
	setAttrib(s, R_DimSymbol, d);
    if ((d = getAttrib(x, R_DimNamesSymbol)) != R_NilValue)
	setAttrib(s, R_DimNamesSymbol, d);
    UNPROTECT(2);
    return s;
}