VALUE rb_enc_associate_index(VALUE obj, int idx) { rb_encoding *enc; int oldidx, oldtermlen, termlen; /* enc_check_capable(obj);*/ rb_check_frozen(obj); oldidx = rb_enc_get_index(obj); if (oldidx == idx) return obj; if (SPECIAL_CONST_P(obj)) { rb_raise(rb_eArgError, "cannot set encoding"); } enc = must_encindex(idx); if (!ENC_CODERANGE_ASCIIONLY(obj) || !rb_enc_asciicompat(enc)) { ENC_CODERANGE_CLEAR(obj); } termlen = rb_enc_mbminlen(enc); oldtermlen = rb_enc_mbminlen(rb_enc_from_index(oldidx)); if (oldtermlen < termlen && RB_TYPE_P(obj, T_STRING)) { rb_str_fill_terminator(obj, termlen); } enc_set_index(obj, idx); return obj; }
VALUE rb_enc_vsprintf(rb_encoding *enc, const char *fmt, va_list ap) { rb_printf_buffer_extra buffer; #define f buffer.base VALUE result; f._flags = __SWR | __SSTR; f._bf._size = 0; f._w = 120; result = rb_str_buf_new(f._w); if (enc) { if (rb_enc_mbminlen(enc) > 1) { /* the implementation deeply depends on plain char */ rb_raise(rb_eArgError, "cannot construct wchar_t based encoding string: %s", rb_enc_name(enc)); } rb_enc_associate(result, enc); } f._bf._base = (unsigned char *)result; f._p = (unsigned char *)RSTRING_PTR(result); RBASIC_CLEAR_CLASS(result); f.vwrite = ruby__sfvwrite; f.vextra = ruby__sfvextra; buffer.value = 0; BSD_vfprintf(&f, fmt, ap); RBASIC_SET_CLASS_RAW(result, rb_cString); rb_str_resize(result, (char *)f._p - RSTRING_PTR(result)); #undef f return result; }
VALUE rb_readlink(VALUE path) { ssize_t len; WCHAR *wpath, wbuf[MAX_PATH]; rb_encoding *enc; UINT cp, path_cp; FilePathValue(path); enc = rb_enc_get(path); cp = path_cp = code_page(enc); if (cp == INVALID_CODE_PAGE) { path = fix_string_encoding(path, enc); cp = CP_UTF8; } wpath = mbstr_to_wstr(cp, RSTRING_PTR(path), RSTRING_LEN(path)+rb_enc_mbminlen(enc), NULL); if (!wpath) rb_memerror(); len = rb_w32_wreadlink(wpath, wbuf, numberof(wbuf)); free(wpath); if (len < 0) rb_sys_fail_path(path); enc = rb_filesystem_encoding(); cp = path_cp = code_page(enc); if (cp == INVALID_CODE_PAGE) cp = CP_UTF8; return append_wstr(rb_enc_str_new(0, 0, enc), wbuf, len, cp, path_cp, enc); }
int rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc) { int n = ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e); if (MBCLEN_CHARFOUND_P(n) && MBCLEN_CHARFOUND_LEN(n) <= e-p) return MBCLEN_CHARFOUND_LEN(n); else { int min = rb_enc_mbminlen(enc); return min <= e-p ? min : (int)(e-p); } }
static VALUE bug_str_cstr_term(VALUE str) { long len; char *s; int c; rb_encoding *enc; len = RSTRING_LEN(str); s = StringValueCStr(str); rb_gc(); enc = rb_enc_get(str); c = rb_enc_codepoint(&s[len], &s[len+rb_enc_mbminlen(enc)], enc); return INT2NUM(c); }
static VALUE bug_str_cstr_term_char(VALUE str) { long len; char *s; int c; rb_encoding *enc = rb_enc_get(str); RSTRING_GETMEM(str, s, len); s += len; len = rb_enc_mbminlen(enc); c = rb_enc_precise_mbclen(s, s + len, enc); if (!MBCLEN_CHARFOUND_P(c)) { c = (unsigned char)*s; } else { c = rb_enc_mbc_to_codepoint(s, s + len, enc); if (!c) return Qnil; } return rb_enc_uint_chr((unsigned int)c, enc); }
/** * @param str the string to be scrubbed * @param repl the replacement character * @return If given string is invalid, returns a new string. Otherwise, returns Qnil. */ static VALUE str_scrub0(int argc, VALUE *argv, VALUE str) { int cr = ENC_CODERANGE(str); rb_encoding *enc; int encidx; VALUE repl; if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) return Qnil; enc = STR_ENC_GET(str); rb_scan_args(argc, argv, "01", &repl); if (argc != 0) { repl = str_compat_and_valid(repl, enc); } if (rb_enc_dummy_p(enc)) { return Qnil; } encidx = rb_enc_to_index(enc); #define DEFAULT_REPLACE_CHAR(str) do { \ static const char replace[sizeof(str)-1] = str; \ rep = replace; replen = (int)sizeof(replace); \ } while (0) if (rb_enc_asciicompat(enc)) { const char *p = RSTRING_PTR(str); const char *e = RSTRING_END(str); const char *p1 = p; const char *rep; long replen; int rep7bit_p; VALUE buf = Qnil; if (rb_block_given_p()) { rep = NULL; replen = 0; rep7bit_p = FALSE; } else if (!NIL_P(repl)) { rep = RSTRING_PTR(repl); replen = RSTRING_LEN(repl); rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT); } else if (encidx == rb_utf8_encindex()) { DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD"); rep7bit_p = FALSE; } else { DEFAULT_REPLACE_CHAR("?"); rep7bit_p = TRUE; } cr = ENC_CODERANGE_7BIT; p = search_nonascii(p, e); if (!p) { p = e; } while (p < e) { int ret = rb_enc_precise_mbclen(p, e, enc); if (MBCLEN_NEEDMORE_P(ret)) { break; } else if (MBCLEN_CHARFOUND_P(ret)) { cr = ENC_CODERANGE_VALID; p += MBCLEN_CHARFOUND_LEN(ret); } else if (MBCLEN_INVALID_P(ret)) { /* * p1~p: valid ascii/multibyte chars * p ~e: invalid bytes + unknown bytes */ long clen = rb_enc_mbmaxlen(enc); if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str)); if (p > p1) { rb_str_buf_cat(buf, p1, p - p1); } if (e - p < clen) clen = e - p; if (clen <= 2) { clen = 1; } else { const char *q = p; clen--; for (; clen > 1; clen--) { ret = rb_enc_precise_mbclen(q, q + clen, enc); if (MBCLEN_NEEDMORE_P(ret)) break; if (MBCLEN_INVALID_P(ret)) continue; UNREACHABLE; } } if (rep) { rb_str_buf_cat(buf, rep, replen); if (!rep7bit_p) cr = ENC_CODERANGE_VALID; } else { repl = rb_yield(rb_enc_str_new(p, clen, enc)); repl = str_compat_and_valid(repl, enc); rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl)); if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID) cr = ENC_CODERANGE_VALID; } p += clen; p1 = p; p = search_nonascii(p, e); if (!p) { p = e; break; } } else { UNREACHABLE; } } if (NIL_P(buf)) { if (p == e) { ENC_CODERANGE_SET(str, cr); return Qnil; } buf = rb_str_buf_new(RSTRING_LEN(str)); } if (p1 < p) { rb_str_buf_cat(buf, p1, p - p1); } if (p < e) { if (rep) { rb_str_buf_cat(buf, rep, replen); if (!rep7bit_p) cr = ENC_CODERANGE_VALID; } else { repl = rb_yield(rb_enc_str_new(p, e-p, enc)); repl = str_compat_and_valid(repl, enc); rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl)); if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID) cr = ENC_CODERANGE_VALID; } } ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr); return buf; } else { /* ASCII incompatible */ const char *p = RSTRING_PTR(str); const char *e = RSTRING_END(str); const char *p1 = p; VALUE buf = Qnil; const char *rep; long replen; long mbminlen = rb_enc_mbminlen(enc); if (!NIL_P(repl)) { rep = RSTRING_PTR(repl); replen = RSTRING_LEN(repl); } else if (!strcasecmp(rb_enc_name(enc), "UTF-16BE")) { DEFAULT_REPLACE_CHAR("\xFF\xFD"); } else if (!strcasecmp(rb_enc_name(enc), "UTF-16LE")) { DEFAULT_REPLACE_CHAR("\xFD\xFF"); } else if (!strcasecmp(rb_enc_name(enc), "UTF-32BE")) { DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD"); } else if (!strcasecmp(rb_enc_name(enc), "UTF-32lE")) { DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00"); } else { DEFAULT_REPLACE_CHAR("?"); } while (p < e) { int ret = rb_enc_precise_mbclen(p, e, enc); if (MBCLEN_NEEDMORE_P(ret)) { break; } else if (MBCLEN_CHARFOUND_P(ret)) { p += MBCLEN_CHARFOUND_LEN(ret); } else if (MBCLEN_INVALID_P(ret)) { const char *q = p; long clen = rb_enc_mbmaxlen(enc); if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str)); if (p > p1) rb_str_buf_cat(buf, p1, p - p1); if (e - p < clen) clen = e - p; if (clen <= mbminlen * 2) { clen = mbminlen; } else { clen -= mbminlen; for (; clen > mbminlen; clen-=mbminlen) { ret = rb_enc_precise_mbclen(q, q + clen, enc); if (MBCLEN_NEEDMORE_P(ret)) break; if (MBCLEN_INVALID_P(ret)) continue; UNREACHABLE; } } if (rep) { rb_str_buf_cat(buf, rep, replen); } else { repl = rb_yield(rb_enc_str_new(p, e-p, enc)); repl = str_compat_and_valid(repl, enc); rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl)); } p += clen; p1 = p; } else { UNREACHABLE; } } if (NIL_P(buf)) { if (p == e) { ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID); return Qnil; } buf = rb_str_buf_new(RSTRING_LEN(str)); } if (p1 < p) { rb_str_buf_cat(buf, p1, p - p1); } if (p < e) { if (rep) { rb_str_buf_cat(buf, rep, replen); } else { repl = rb_yield(rb_enc_str_new(p, e-p, enc)); repl = str_compat_and_valid(repl, enc); rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl)); } } ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), ENC_CODERANGE_VALID); return buf; } }