/*
 * Record `base` as the base of the enc_table entry at `index` and return
 * that entry's encoding. When `base` is a dummy encoding, the entry's
 * encoding is flagged as dummy too.
 */
static rb_encoding *
set_base_encoding(int index, rb_encoding *base)
{
    rb_encoding *target = enc_table.list[index].enc;

    enc_table.list[index].base = base;

    if (rb_enc_dummy_p(base)) {
        ENC_SET_DUMMY(target);
    }

    return target;
}
/*
 * Look up the encoding index for the name held in `enc_name`.
 *
 * Raises RuntimeError when the name is not found, or when the encoding it
 * names is a dummy encoding. Otherwise returns the index.
 */
static int
opt_enc_index(VALUE enc_name)
{
    const char *name = RSTRING_PTR(enc_name);
    int idx = rb_enc_find_index(name);

    if (idx >= 0) {
        /* Known name: still reject encodings flagged as dummy. */
        if (rb_enc_dummy_p(rb_enc_from_index(idx))) {
            rb_raise(rb_eRuntimeError, "dummy encoding is not acceptable - %s ", name);
        }
    }
    else {
        rb_raise(rb_eRuntimeError, "unknown encoding name - %s", name);
    }

    return idx;
}
/**
 * Build a copy of a string in which invalid byte sequences are replaced.
 *
 * @param argc number of optional arguments; parsed with rb_scan_args as
 *             zero or one argument (only reached when the string's cached
 *             code range is not already 7bit/valid)
 * @param argv optional arguments; argv[0], when present, is the replacement
 *             string and is passed through str_compat_and_valid
 * @param str  the string to be scrubbed
 * @return If the given string is invalid, returns a new string. Otherwise
 *         (the string is already valid, or its encoding is a dummy
 *         encoding), returns Qnil.
 *
 * Choice of replacement, in priority order (same in both branches):
 *   1. a block, if given: it is yielded each invalid byte run and its
 *      result (checked by str_compat_and_valid) is inserted;
 *   2. the explicit replacement argument;
 *   3. a default: U+FFFD encoded for UTF-8 / UTF-16BE/LE / UTF-32BE/LE,
 *      "?" for every other encoding.
 */
static VALUE
str_scrub0(int argc, VALUE *argv, VALUE str)
{
    int cr = ENC_CODERANGE(str);
    rb_encoding *enc;
    int encidx;
    VALUE repl;

    /* The cached code range already says the string is clean. */
    if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID)
        return Qnil;

    enc = STR_ENC_GET(str);
    rb_scan_args(argc, argv, "01", &repl);
    if (argc != 0) {
        repl = str_compat_and_valid(repl, enc);
    }

    if (rb_enc_dummy_p(enc)) {
        return Qnil;
    }
    encidx = rb_enc_to_index(enc);

/* Point rep/replen at a static copy of the literal, without its NUL. */
#define DEFAULT_REPLACE_CHAR(str) do { \
        static const char replace[sizeof(str)-1] = str; \
        rep = replace; replen = (int)sizeof(replace); \
    } while (0)

    if (rb_enc_asciicompat(enc)) {
        const char *p = RSTRING_PTR(str);
        const char *e = RSTRING_END(str);
        const char *p1 = p;
        const char *rep;
        long replen;
        int rep7bit_p;
        VALUE buf = Qnil;

        if (rb_block_given_p()) {
            /* rep == NULL means "ask the block for each replacement". */
            rep = NULL;
            replen = 0;
            rep7bit_p = FALSE;
        }
        else if (!NIL_P(repl)) {
            rep = RSTRING_PTR(repl);
            replen = RSTRING_LEN(repl);
            rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
        }
        else if (encidx == rb_utf8_encindex()) {
            DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
            rep7bit_p = FALSE;
        }
        else {
            DEFAULT_REPLACE_CHAR("?");
            rep7bit_p = TRUE;
        }
        /* Start from 7bit; upgraded to VALID once a non-7bit piece is kept. */
        cr = ENC_CODERANGE_7BIT;

        p = search_nonascii(p, e);
        if (!p) {
            p = e;
        }
        while (p < e) {
            int ret = rb_enc_precise_mbclen(p, e, enc);
            if (MBCLEN_NEEDMORE_P(ret)) {
                /* Truncated character at the end; handled after the loop. */
                break;
            }
            else if (MBCLEN_CHARFOUND_P(ret)) {
                cr = ENC_CODERANGE_VALID;
                p += MBCLEN_CHARFOUND_LEN(ret);
            }
            else if (MBCLEN_INVALID_P(ret)) {
                /*
                 * p1~p: valid ascii/multibyte chars
                 * p ~e: invalid bytes + unknown bytes
                 */
                long clen = rb_enc_mbmaxlen(enc);
                if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
                if (p > p1) {
                    rb_str_buf_cat(buf, p1, p - p1);
                }

                if (e - p < clen) clen = e - p;
                if (clen <= 2) {
                    clen = 1;
                }
                else {
                    /*
                     * Shrink clen while the clen-byte prefix is still
                     * reported invalid; stop when it is reported as
                     * needing more bytes, or when clen reaches 1.
                     */
                    const char *q = p;
                    clen--;
                    for (; clen > 1; clen--) {
                        ret = rb_enc_precise_mbclen(q, q + clen, enc);
                        if (MBCLEN_NEEDMORE_P(ret)) break;
                        if (MBCLEN_INVALID_P(ret)) continue;
                        UNREACHABLE;
                    }
                }
                if (rep) {
                    rb_str_buf_cat(buf, rep, replen);
                    if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
                }
                else {
                    repl = rb_yield(rb_enc_str_new(p, clen, enc));
                    repl = str_compat_and_valid(repl, enc);
                    rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
                    if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
                        cr = ENC_CODERANGE_VALID;
                }
                p += clen;
                p1 = p;
                p = search_nonascii(p, e);
                if (!p) {
                    p = e;
                    break;
                }
            }
            else {
                UNREACHABLE;
            }
        }
        if (NIL_P(buf)) {
            if (p == e) {
                /* Nothing was replaced: cache the computed code range. */
                ENC_CODERANGE_SET(str, cr);
                return Qnil;
            }
            buf = rb_str_buf_new(RSTRING_LEN(str));
        }
        if (p1 < p) {
            rb_str_buf_cat(buf, p1, p - p1);
        }
        if (p < e) {
            /* Trailing incomplete character: replace the rest as one run. */
            if (rep) {
                rb_str_buf_cat(buf, rep, replen);
                if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
            }
            else {
                repl = rb_yield(rb_enc_str_new(p, e-p, enc));
                repl = str_compat_and_valid(repl, enc);
                rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
                if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
                    cr = ENC_CODERANGE_VALID;
            }
        }
        ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
        return buf;
    }
    else {
        /* ASCII incompatible */
        const char *p = RSTRING_PTR(str);
        const char *e = RSTRING_END(str);
        const char *p1 = p;
        VALUE buf = Qnil;
        const char *rep;
        long replen;
        long mbminlen = rb_enc_mbminlen(enc);

        if (rb_block_given_p()) {
            /*
             * A block takes precedence, as in the ASCII-compatible branch;
             * rep == NULL selects the rb_yield paths below.
             */
            rep = NULL;
            replen = 0;
        }
        else if (!NIL_P(repl)) {
            rep = RSTRING_PTR(repl);
            replen = RSTRING_LEN(repl);
        }
        else if (!strcasecmp(rb_enc_name(enc), "UTF-16BE")) {
            DEFAULT_REPLACE_CHAR("\xFF\xFD");
        }
        else if (!strcasecmp(rb_enc_name(enc), "UTF-16LE")) {
            DEFAULT_REPLACE_CHAR("\xFD\xFF");
        }
        else if (!strcasecmp(rb_enc_name(enc), "UTF-32BE")) {
            DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
        }
        else if (!strcasecmp(rb_enc_name(enc), "UTF-32LE")) {
            DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
        }
        else {
            DEFAULT_REPLACE_CHAR("?");
        }

        while (p < e) {
            int ret = rb_enc_precise_mbclen(p, e, enc);
            if (MBCLEN_NEEDMORE_P(ret)) {
                /* Truncated character at the end; handled after the loop. */
                break;
            }
            else if (MBCLEN_CHARFOUND_P(ret)) {
                p += MBCLEN_CHARFOUND_LEN(ret);
            }
            else if (MBCLEN_INVALID_P(ret)) {
                const char *q = p;
                long clen = rb_enc_mbmaxlen(enc);
                if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
                if (p > p1) rb_str_buf_cat(buf, p1, p - p1);

                if (e - p < clen) clen = e - p;
                if (clen <= mbminlen * 2) {
                    clen = mbminlen;
                }
                else {
                    /*
                     * Same shrinking search as the ASCII-compatible branch,
                     * stepping by the encoding's minimum character length.
                     */
                    clen -= mbminlen;
                    for (; clen > mbminlen; clen -= mbminlen) {
                        ret = rb_enc_precise_mbclen(q, q + clen, enc);
                        if (MBCLEN_NEEDMORE_P(ret)) break;
                        if (MBCLEN_INVALID_P(ret)) continue;
                        UNREACHABLE;
                    }
                }
                if (rep) {
                    rb_str_buf_cat(buf, rep, replen);
                }
                else {
                    /* Yield only the clen bytes being replaced (p += clen below). */
                    repl = rb_yield(rb_enc_str_new(p, clen, enc));
                    repl = str_compat_and_valid(repl, enc);
                    rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
                }
                p += clen;
                p1 = p;
            }
            else {
                UNREACHABLE;
            }
        }
        if (NIL_P(buf)) {
            if (p == e) {
                /* Nothing was replaced: the whole string scanned as valid. */
                ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
                return Qnil;
            }
            buf = rb_str_buf_new(RSTRING_LEN(str));
        }
        if (p1 < p) {
            rb_str_buf_cat(buf, p1, p - p1);
        }
        if (p < e) {
            /* Trailing incomplete character: replace the rest as one run. */
            if (rep) {
                rb_str_buf_cat(buf, rep, replen);
            }
            else {
                repl = rb_yield(rb_enc_str_new(p, e-p, enc));
                repl = str_compat_and_valid(repl, enc);
                rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
            }
        }
        ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), ENC_CODERANGE_VALID);
        return buf;
    }
}