/*
 * Copy the characters of str at 1-based positions sa..so (inclusive) into
 * buf and NUL-terminate the result.
 *
 * buf   destination; no bounds check is performed on it here
 * str   source string
 * ienc  encoding tag of str (CE_UTF8, CE_LATIN1, CE_BYTES or anything else,
 *       which is handled according to the current locale)
 * sa    position of the first character to copy
 * so    position of the last character to copy
 *
 * The UTF-8 and multibyte-locale paths stop copying at the end of str; the
 * single-byte path copies exactly so - sa + 1 bytes without looking for the
 * terminator.
 */
static void substr(char *buf, const char *str, int ienc, int sa, int so)
{
    int pos, k, nbytes;

    if (ienc == CE_UTF8) {
        const char *limit = str + strlen(str);

        for (pos = 0; pos < so && str < limit; pos++) {
            nbytes = utf8clen(*str);
            if (pos >= sa - 1) {
                /* inside the requested range: copy the whole character */
                for (k = 0; k < nbytes; k++)
                    *buf++ = *str++;
            } else {
                /* still before position sa: step over the character */
                str += nbytes;
            }
        }
    } else if (ienc != CE_LATIN1 && ienc != CE_BYTES &&
               mbcslocale && !strIsASCII(str)) {
        /* native encoding in a multibyte locale, with non-ASCII content */
        const char *limit = str + strlen(str);
        mbstate_t state;

        mbs_init(&state);
        /* step over the first sa - 1 characters */
        for (pos = 1; pos < sa; pos++)
            str += Mbrtowc(NULL, str, MB_CUR_MAX, &state);
        pos = sa;
        while (pos <= so && str < limit) {
            nbytes = (int) Mbrtowc(NULL, str, MB_CUR_MAX, &state);
            for (k = 0; k < nbytes; k++)
                *buf++ = *str++;
            pos++;
        }
    } else {
        /* one byte per character: latin1, bytes, or a single-byte/ASCII
           native string */
        str += (sa - 1);
        pos = sa;
        while (pos <= so) {
            *buf++ = *str++;
            pos++;
        }
    }
    *buf = '\0';
}
/*
 * Stub: as compiled, this function ignores both arguments and returns 0,
 * because the whole implementation below is disabled with `#if 0`.
 *
 * The disabled code takes the lead byte c of a (possibly multibyte)
 * character, stores the converted code point in *wc and returns the number
 * of bytes (clen) that make up the character.  Any bytes it read beyond the
 * lead byte are handed back through xxungetc() before returning.
 */
static int mbcs_get_next2(int c, ucs_t *wc)
{
#if 0
    int i, res, clen = 1; char s[9];

    s[0] = c;
    /* This assumes (probably OK) that all MBCS embed ASCII as single-byte
       lead bytes, including control chars */
    if((unsigned int) c < 0x80) {
        *wc = (wchar_t) c;
        return 1;
    }
    if(utf8locale) {
        /* UTF-8: the length is taken from the lead byte, so read the
           remaining clen - 1 bytes up front */
        clen = utf8clen(c);
        for(i = 1; i < clen; i++) {
            s[i] = xxgetc();
            if(s[i] == R_EOF) {
                PROBLEM "EOF whilst reading MBCS char" ERROR;
            }
        }
        s[clen] ='\0'; /* x86 Solaris requires this */
        res = mbtoucs(wc, s, clen);
        if(res == -1) {
            PROBLEM "invalid multibyte character in parser" ERROR;
        }
    } else {
        /* This is not necessarily correct for stateful MBCS */
        /* Other encodings: grow the candidate one byte at a time until
           the conversion reports something other than -2 */
        while(clen <= MB_CUR_MAX) {
            res = mbtoucs(wc, s, clen);
            if(res >= 0) break;
            if(res == -1) {
                PROBLEM "invalid multibyte character in parser" ERROR;
            }
            /* so res == -2 */
            c = xxgetc();
            if(c == R_EOF) {
                PROBLEM "EOF whilst reading MBCS char" ERROR;
            }
            s[clen++] = c;
        }
        /* we've tried enough, so must be complete or invalid by now */
    }
    /* hand back every byte after the lead byte, last one first */
    for(i = clen - 1; i > 0; i--) xxungetc(s[i]);
    return clen;
#else
    return(0);
#endif
}
/*
 * Overwrite the characters of buf at 1-based positions sa..so (inclusive)
 * with characters taken from the start of str.
 *
 * buf   NUL-terminated string that is modified in place; no bounds check is
 *       performed on it here
 * str   replacement text
 * ienc  encoding tag (CE_UTF8, CE_LATIN1, CE_BYTES or anything else, which
 *       is handled according to the current locale)
 * sa    position of the first character to replace
 * so    position of the last character to replace
 *
 * In the UTF-8 and multibyte-locale paths the replaced and replacing runs
 * may differ in byte length, so the tail of buf is shifted with memmove()
 * before the new bytes are copied in.  In the single-byte path at most
 * so - sa + 1 bytes are overwritten and buf never changes length.
 *
 * The length of str is computed once up front; it was previously
 * recomputed on every loop iteration and compared against a signed int.
 */
static void substrset(char *buf, const char *const str, cetype_t ienc, int sa, int so)
{
    int i, in = 0, out = 0;
    const size_t slen = strlen(str);

    if (ienc == CE_UTF8) {
        /* advance buf to the start of character sa */
        for (i = 1; i < sa; i++)
            buf += utf8clen(*buf);
        /* count how many bytes of str replace how many bytes of buf */
        for (i = sa; i <= so && (size_t) in < slen; i++) {
            in += utf8clen(str[in]);
            out += utf8clen(buf[out]);
            if (!str[in])
                break;
        }
        if (in != out)
            memmove(buf + in, buf + out, strlen(buf + out) + 1);
        memcpy(buf, str, in);
    } else if (ienc != CE_LATIN1 && ienc != CE_BYTES && mbcslocale) {
        /* This cannot work for stateful encodings */
        for (i = 1; i < sa; i++)
            buf += Mbrtowc(NULL, buf, MB_CUR_MAX, NULL);
        /* now work out how many bytes to replace by how many */
        for (i = sa; i <= so && (size_t) in < slen; i++) {
            in += (int) Mbrtowc(NULL, str + in, MB_CUR_MAX, NULL);
            out += (int) Mbrtowc(NULL, buf + out, MB_CUR_MAX, NULL);
            if (!str[in])
                break;
        }
        if (in != out)
            memmove(buf + in, buf + out, strlen(buf + out) + 1);
        memcpy(buf, str, in);
    } else {
        /* one byte per character: latin1, bytes, or a single-byte locale */
        in = (int) slen;
        out = so - sa + 1;
        memcpy(buf + sa - 1, str, (in < out) ? in : out);
    }
}
static bool keysym_to_utf8(char *buf, const xcb_keysym_t ksym) { unsigned int ksym_conv; int count; /* Unicode keysym */ if((ksym & 0xff000000) == 0x01000000) ksym_conv = ksym & 0x00ffffff; else if(ksym > 0 && ksym < 0x100) ksym_conv = ksym; else if(ksym > 0x1a0 && ksym < 0x200) ksym_conv = keysym_to_unicode_1a1_1ff[ksym - 0x1a1]; else if(ksym > 0x2a0 && ksym < 0x2ff) ksym_conv = keysym_to_unicode_2a1_2fe[ksym - 0x2a1]; else if(ksym > 0x3a1 && ksym < 0x3ff) ksym_conv = keysym_to_unicode_3a2_3fe[ksym - 0x3a2]; else if(ksym > 0x4a0 && ksym < 0x4e0) ksym_conv = keysym_to_unicode_4a1_4df[ksym - 0x4a1]; else if(ksym > 0x589 && ksym < 0x5ff) ksym_conv = keysym_to_unicode_590_5fe[ksym - 0x590]; else if(ksym > 0x67f && ksym < 0x700) ksym_conv = keysym_to_unicode_680_6ff[ksym - 0x680]; else if(ksym > 0x7a0 && ksym < 0x7fa) ksym_conv = keysym_to_unicode_7a1_7f9[ksym - 0x7a1]; else if(ksym > 0x8a3 && ksym < 0x8ff) ksym_conv = keysym_to_unicode_8a4_8fe[ksym - 0x8a4]; else if(ksym > 0x9de && ksym < 0x9f9) ksym_conv = keysym_to_unicode_9df_9f8[ksym - 0x9df]; else if(ksym > 0xaa0 && ksym < 0xaff) ksym_conv = keysym_to_unicode_aa1_afe[ksym - 0xaa1]; else if(ksym > 0xcde && ksym < 0xcfb) ksym_conv = keysym_to_unicode_cdf_cfa[ksym - 0xcdf]; else if(ksym > 0xda0 && ksym < 0xdfa) ksym_conv = keysym_to_unicode_da1_df9[ksym - 0xda1]; else if(ksym > 0xe9f && ksym < 0xf00) ksym_conv = keysym_to_unicode_ea0_eff[ksym - 0xea0]; else if(ksym > 0x12a0 && ksym < 0x12ff) ksym_conv = keysym_to_unicode_12a1_12fe[ksym - 0x12a1]; else if(ksym > 0x13bb && ksym < 0x13bf) ksym_conv = keysym_to_unicode_13bc_13be[ksym - 0x13bc]; else if(ksym > 0x14a0 && ksym < 0x1500) ksym_conv = keysym_to_unicode_14a1_14ff[ksym - 0x14a1]; else if(ksym > 0x15cf && ksym < 0x15f7) ksym_conv = keysym_to_unicode_15d0_15f6[ksym - 0x15d0]; else if(ksym > 0x169f && ksym < 0x16f7) ksym_conv = keysym_to_unicode_16a0_16f6[ksym - 0x16a0]; else if(ksym > 0x1e9e && ksym < 0x1f00) ksym_conv = keysym_to_unicode_1e9f_1eff[ksym - 
0x1e9f]; else if(ksym > 0x209f && ksym < 0x20ad) ksym_conv = keysym_to_unicode_20a0_20ac[ksym - 0x20a0]; else return false; count = utf8clen(ksym_conv); switch(count) { case 7: return false; case 6: buf[5] = (ksym_conv | 0x80) & 0xbf; ksym_conv >>= 6; case 5: buf[4] = (ksym_conv | 0x80) & 0xbf; ksym_conv >>= 6; case 4: buf[3] = (ksym_conv | 0x80) & 0xbf; ksym_conv >>= 6; case 3: buf[2] = (ksym_conv | 0x80) & 0xbf; ksym_conv >>= 6; case 2: buf[1] = (ksym_conv | 0x80) & 0xbf; ksym_conv >>= 6; case 1: buf[0] = (ksym_conv | __utf8_mark[count]); } buf[count] = '\0'; return true; }
/*
 * Size of a single string element, measured as requested by type_.
 *
 * string    the string element to measure
 * type_     Bytes, Chars or Width
 * allowNA   if true, return NA_INTEGER for an invalid multibyte string or
 *           for a "bytes"-encoded string where the measure is not
 *           computable; if false, raise an error in those cases
 * keepNA    if true, NA_STRING yields NA_INTEGER; otherwise it yields 2
 * msg_name  text inserted into error messages to identify the element
 *
 * Returns the size as an int, or NA_INTEGER as described above.
 *
 * Fix: the Width / multibyte-locale branch used to raise the "invalid
 * multibyte string" error when allowNA was TRUE and return NA_INTEGER when
 * it was FALSE, the opposite of every other branch.  It now follows the
 * same rule as the rest of the function.
 */
int R_nchar(SEXP string, nchar_type type_, Rboolean allowNA, Rboolean keepNA, const char* msg_name)
{
    if (string == NA_STRING)
        /* 2 presumably stands for the two characters of the printed "NA" */
        return keepNA ? NA_INTEGER : 2;

    switch (type_) {
    case Bytes:
        return LENGTH(string);

    case Chars:
        if (IS_UTF8(string)) {
            const char *p = CHAR(string);
            if (!utf8Valid(p)) {
                if (!allowNA)
                    error(_("invalid multibyte string, %s"), msg_name);
                return NA_INTEGER;
            } else {
                /* one character per UTF-8 sequence */
                int nc = 0;
                for ( ; *p; p += utf8clen(*p))
                    nc++;
                return nc;
            }
        } else if (IS_BYTES(string)) {
            if (!allowNA) /* could do chars 0 */
                error(_("number of characters is not computable in \"bytes\" encoding, %s"), msg_name);
            return NA_INTEGER;
        } else if (mbcslocale) {
            /* a negative count signals an invalid multibyte sequence */
            int nc = (int) mbstowcs(NULL, translateChar(string), 0);
            if (!allowNA && nc < 0)
                error(_("invalid multibyte string, %s"), msg_name);
            return (nc >= 0 ? nc : NA_INTEGER);
        } else
            return ((int) strlen(translateChar(string)));
        break;

    case Width:
        if (IS_UTF8(string)) {
            const char *p = CHAR(string);
            if (!utf8Valid(p)) {
                if (!allowNA)
                    error(_("invalid multibyte string, %s"), msg_name);
                return NA_INTEGER;
            } else {
                /* sum the display width of each decoded character */
                wchar_t wc1;
                int nc = 0;
                for ( ; *p; p += utf8clen(*p)) {
                    utf8toucs(&wc1, p);
                    nc += Ri18n_wcwidth(wc1);
                }
                return nc;
            }
        } else if (IS_BYTES(string)) {
            if (!allowNA) /* could do width 0 */
                error(_("width is not computable for %s in \"bytes\" encoding"), msg_name);
            return NA_INTEGER;
        } else if (mbcslocale) {
            const char *xi = translateChar(string);
            int nc = (int) mbstowcs(NULL, xi, 0);
            if (nc < 0) {
                /* invalid multibyte string: error unless NA is allowed */
                if (!allowNA)
                    error(_("invalid multibyte string, %s"), msg_name);
                return NA_INTEGER;
            }
            const void *vmax = vmaxget();
            wchar_t *wc = (wchar_t *)
                R_AllocStringBuffer((nc+1)*sizeof(wchar_t), &cbuff);
            mbstowcs(wc, xi, nc + 1);
            int nci18n = Ri18n_wcswidth(wc, 2147483647);
            vmaxset(vmax);
            /* fall back to the character count when no positive width
               is reported */
            return (nci18n < 1) ? nc : nci18n;
        } else
            return (int) strlen(translateChar(string));
        break;

    default:
        break;
    } // switch
    return NA_INTEGER; // -Wall
} // R_nchar()
/*
 * Compute the size of every element of a character vector.
 *
 * args holds, in order: the vector to measure (coerced to a character
 * vector; a factor is rejected), the type string, and the allowNA flag.
 * The type string is matched as a prefix of "bytes", "chars" or "width",
 * tried in that order, and must not be empty.  A missing (NA) allowNA is
 * treated as FALSE.
 *
 * Returns an integer vector of the same length as the input, carrying the
 * input's names, dim and dimnames attributes.  NA_STRING elements yield 2.
 * Where a size cannot be computed ("bytes"-encoded element for
 * "chars"/"width", or an invalid multibyte string in a multibyte locale)
 * the result is NA_INTEGER if allowNA is true and an error otherwise.
 *
 * Fix: the "width" / multibyte-locale branch used to raise the "invalid
 * multibyte string" error when allowNA was TRUE and store NA_INTEGER when
 * it was FALSE; it now behaves like the "chars" branch.
 */
SEXP attribute_hidden do_nchar(SEXP call, SEXP op, SEXP args, SEXP env)
{
    SEXP d, s, x, stype;
    int i, len, allowNA, nc;
    int want_bytes, want_chars, want_width;
    size_t ntype;
    const char *type;
    const char *xi;
    wchar_t *wc;
    const void *vmax;

    checkArity(op, args);
    if (isFactor(CAR(args)))
        error(_("'%s' requires a character vector"), "nchar()");
    PROTECT(x = coerceVector(CAR(args), STRSXP));
    if (!isString(x))
        error(_("'%s' requires a character vector"), "nchar()");
    len = LENGTH(x);

    stype = CADR(args);
    if (!isString(stype) || LENGTH(stype) != 1)
        error(_("invalid '%s' argument"), "type");
    type = CHAR(STRING_ELT(stype, 0)); /* always ASCII */
    ntype = strlen(type);
    if (ntype == 0)
        error(_("invalid '%s' argument"), "type");

    /* The requested measure does not depend on the element, so match the
       type once here.  An unmatched type is still only reported from
       inside the loop, i.e. when a non-NA element is reached. */
    want_bytes = (strncmp(type, "bytes", ntype) == 0);
    want_chars = (strncmp(type, "chars", ntype) == 0);
    want_width = (strncmp(type, "width", ntype) == 0);

    allowNA = asLogical(CADDR(args));
    if (allowNA == NA_LOGICAL)
        allowNA = 0;

    PROTECT(s = allocVector(INTSXP, len));
    vmax = vmaxget();
    for (i = 0; i < len; i++) {
        SEXP sxi = STRING_ELT(x, i);

        if (sxi == NA_STRING) {
            INTEGER(s)[i] = 2;
            continue;
        }

        if (want_bytes) {
            INTEGER(s)[i] = LENGTH(sxi);
        } else if (want_chars) {
            if (IS_UTF8(sxi)) { /* assume this is valid */
                const char *p = CHAR(sxi);
                nc = 0;
                for ( ; *p; p += utf8clen(*p))
                    nc++;
                INTEGER(s)[i] = nc;
            } else if (IS_BYTES(sxi)) {
                if (!allowNA) /* could do chars 0 */
                    error(_("number of characters is not computable for element %d in \"bytes\" encoding"), i+1);
                INTEGER(s)[i] = NA_INTEGER;
            } else if (mbcslocale) {
                /* a negative count signals an invalid multibyte sequence */
                nc = (int) mbstowcs(NULL, translateChar(sxi), 0);
                if (!allowNA && nc < 0)
                    error(_("invalid multibyte string %d"), i+1);
                INTEGER(s)[i] = nc >= 0 ? nc : NA_INTEGER;
            } else
                INTEGER(s)[i] = (int) strlen(translateChar(sxi));
        } else if (want_width) {
            if (IS_UTF8(sxi)) { /* assume this is valid */
                const char *p = CHAR(sxi);
                wchar_t wc1;
                nc = 0;
                for ( ; *p; p += utf8clen(*p)) {
                    utf8toucs(&wc1, p);
                    nc += Ri18n_wcwidth(wc1);
                }
                INTEGER(s)[i] = nc;
            } else if (IS_BYTES(sxi)) {
                if (!allowNA) /* could do width 0 */
                    error(_("width is not computable for element %d in \"bytes\" encoding"), i+1);
                INTEGER(s)[i] = NA_INTEGER;
            } else if (mbcslocale) {
                xi = translateChar(sxi);
                nc = (int) mbstowcs(NULL, xi, 0);
                if (nc >= 0) {
                    wc = (wchar_t *) R_AllocStringBuffer((nc+1)*sizeof(wchar_t), &cbuff);
                    mbstowcs(wc, xi, nc + 1);
                    INTEGER(s)[i] = Ri18n_wcswidth(wc, 2147483647);
                    /* fall back to the character count when no positive
                       width is reported */
                    if (INTEGER(s)[i] < 1)
                        INTEGER(s)[i] = nc;
                } else {
                    /* invalid multibyte string: error unless NA is allowed */
                    if (!allowNA)
                        error(_("invalid multibyte string %d"), i+1);
                    INTEGER(s)[i] = NA_INTEGER;
                }
            } else
                INTEGER(s)[i] = (int) strlen(translateChar(sxi));
        } else
            error(_("invalid '%s' argument"), "type");

        vmaxset(vmax);
    }
    R_FreeStringBufferL(&cbuff);

    /* carry the input's names, dim and dimnames over to the result */
    if ((d = getAttrib(x, R_NamesSymbol)) != R_NilValue)
        setAttrib(s, R_NamesSymbol, d);
    if ((d = getAttrib(x, R_DimSymbol)) != R_NilValue)
        setAttrib(s, R_DimSymbol, d);
    if ((d = getAttrib(x, R_DimNamesSymbol)) != R_NilValue)
        setAttrib(s, R_DimNamesSymbol, d);
    UNPROTECT(2);
    return s;
}