Esempio n. 1
0
/**
 * @param str the string to be scrubbed
 * @param repl the replacement character
 * @return If given string is invalid, returns a new string. Otherwise, returns Qnil.
 */
static VALUE
str_scrub0(int argc, VALUE *argv, VALUE str)
{
    int cr = ENC_CODERANGE(str);
    rb_encoding *enc;
    int encidx;
    VALUE repl;

    if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID)
	return Qnil;

    enc = STR_ENC_GET(str);
    rb_scan_args(argc, argv, "01", &repl);
    if (argc != 0) {
	repl = str_compat_and_valid(repl, enc);
    }

    if (rb_enc_dummy_p(enc)) {
	return Qnil;
    }
    encidx = rb_enc_to_index(enc);

#define DEFAULT_REPLACE_CHAR(str) do { \
	static const char replace[sizeof(str)-1] = str; \
	rep = replace; replen = (int)sizeof(replace); \
    } while (0)

    if (rb_enc_asciicompat(enc)) {
	const char *p = RSTRING_PTR(str);
	const char *e = RSTRING_END(str);
	const char *p1 = p;
	const char *rep;
	long replen;
	int rep7bit_p;
	VALUE buf = Qnil;
	if (rb_block_given_p()) {
	    rep = NULL;
	    replen = 0;
	    rep7bit_p = FALSE;
	}
	else if (!NIL_P(repl)) {
	    rep = RSTRING_PTR(repl);
	    replen = RSTRING_LEN(repl);
	    rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
	}
	else if (encidx == rb_utf8_encindex()) {
	    DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
	    rep7bit_p = FALSE;
	}
	else {
	    DEFAULT_REPLACE_CHAR("?");
	    rep7bit_p = TRUE;
	}
	cr = ENC_CODERANGE_7BIT;

	p = search_nonascii(p, e);
	if (!p) {
	    p = e;
	}
	while (p < e) {
	    int ret = rb_enc_precise_mbclen(p, e, enc);
	    if (MBCLEN_NEEDMORE_P(ret)) {
		break;
	    }
	    else if (MBCLEN_CHARFOUND_P(ret)) {
		cr = ENC_CODERANGE_VALID;
		p += MBCLEN_CHARFOUND_LEN(ret);
	    }
	    else if (MBCLEN_INVALID_P(ret)) {
		/*
		 * p1~p: valid ascii/multibyte chars
		 * p ~e: invalid bytes + unknown bytes
		 */
		long clen = rb_enc_mbmaxlen(enc);
		if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
		if (p > p1) {
		    rb_str_buf_cat(buf, p1, p - p1);
		}

		if (e - p < clen) clen = e - p;
		if (clen <= 2) {
		    clen = 1;
		}
		else {
		    const char *q = p;
		    clen--;
		    for (; clen > 1; clen--) {
			ret = rb_enc_precise_mbclen(q, q + clen, enc);
			if (MBCLEN_NEEDMORE_P(ret)) break;
			if (MBCLEN_INVALID_P(ret)) continue;
			UNREACHABLE;
		    }
		}
		if (rep) {
		    rb_str_buf_cat(buf, rep, replen);
		    if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
		}
		else {
		    repl = rb_yield(rb_enc_str_new(p, clen, enc));
		    repl = str_compat_and_valid(repl, enc);
		    rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
		    if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
			cr = ENC_CODERANGE_VALID;
		}
		p += clen;
		p1 = p;
		p = search_nonascii(p, e);
		if (!p) {
		    p = e;
		    break;
		}
	    }
	    else {
		UNREACHABLE;
	    }
	}
	if (NIL_P(buf)) {
	    if (p == e) {
		ENC_CODERANGE_SET(str, cr);
		return Qnil;
	    }
	    buf = rb_str_buf_new(RSTRING_LEN(str));
	}
	if (p1 < p) {
	    rb_str_buf_cat(buf, p1, p - p1);
	}
	if (p < e) {
	    if (rep) {
		rb_str_buf_cat(buf, rep, replen);
		if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
	    }
	    else {
		repl = rb_yield(rb_enc_str_new(p, e-p, enc));
		repl = str_compat_and_valid(repl, enc);
		rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
		if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
		    cr = ENC_CODERANGE_VALID;
	    }
	}
	ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
	return buf;
    }
    else {
	/* ASCII incompatible */
	const char *p = RSTRING_PTR(str);
	const char *e = RSTRING_END(str);
	const char *p1 = p;
	VALUE buf = Qnil;
	const char *rep;
	long replen;
	long mbminlen = rb_enc_mbminlen(enc);
	if (!NIL_P(repl)) {
	    rep = RSTRING_PTR(repl);
	    replen = RSTRING_LEN(repl);
	}
	else if (!strcasecmp(rb_enc_name(enc), "UTF-16BE")) {
	    DEFAULT_REPLACE_CHAR("\xFF\xFD");
	}
	else if (!strcasecmp(rb_enc_name(enc), "UTF-16LE")) {
	    DEFAULT_REPLACE_CHAR("\xFD\xFF");
	}
	else if (!strcasecmp(rb_enc_name(enc), "UTF-32BE")) {
	    DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
	}
	else if (!strcasecmp(rb_enc_name(enc), "UTF-32lE")) {
	    DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
	}
	else {
	    DEFAULT_REPLACE_CHAR("?");
	}

	while (p < e) {
	    int ret = rb_enc_precise_mbclen(p, e, enc);
	    if (MBCLEN_NEEDMORE_P(ret)) {
		break;
	    }
	    else if (MBCLEN_CHARFOUND_P(ret)) {
		p += MBCLEN_CHARFOUND_LEN(ret);
	    }
	    else if (MBCLEN_INVALID_P(ret)) {
		const char *q = p;
		long clen = rb_enc_mbmaxlen(enc);
		if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
		if (p > p1) rb_str_buf_cat(buf, p1, p - p1);

		if (e - p < clen) clen = e - p;
		if (clen <= mbminlen * 2) {
		    clen = mbminlen;
		}
		else {
		    clen -= mbminlen;
		    for (; clen > mbminlen; clen-=mbminlen) {
			ret = rb_enc_precise_mbclen(q, q + clen, enc);
			if (MBCLEN_NEEDMORE_P(ret)) break;
			if (MBCLEN_INVALID_P(ret)) continue;
			UNREACHABLE;
		    }
		}
		if (rep) {
		    rb_str_buf_cat(buf, rep, replen);
		}
		else {
		    repl = rb_yield(rb_enc_str_new(p, e-p, enc));
		    repl = str_compat_and_valid(repl, enc);
		    rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
		}
		p += clen;
		p1 = p;
	    }
	    else {
		UNREACHABLE;
	    }
	}
	if (NIL_P(buf)) {
	    if (p == e) {
		ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
		return Qnil;
	    }
	    buf = rb_str_buf_new(RSTRING_LEN(str));
	}
	if (p1 < p) {
	    rb_str_buf_cat(buf, p1, p - p1);
	}
	if (p < e) {
	    if (rep) {
		rb_str_buf_cat(buf, rep, replen);
	    }
	    else {
		repl = rb_yield(rb_enc_str_new(p, e-p, enc));
		repl = str_compat_and_valid(repl, enc);
		rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
	    }
	}
	ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), ENC_CODERANGE_VALID);
	return buf;
    }
}
Esempio n. 2
0
static VALUE
optimized_unescape(VALUE str, VALUE encoding)
{
    long i, len, beg = 0;
    VALUE dest = 0;
    const char *cstr;
    int cr, origenc, encidx = rb_to_encoding_index(encoding);

    len  = RSTRING_LEN(str);
    cstr = RSTRING_PTR(str);

    for (i = 0; i < len; ++i) {
	char buf[1];
	const char c = cstr[i];
	int clen = 0;
	if (c == '%') {
	    if (i + 3 > len) break;
	    if (!ISXDIGIT(cstr[i+1])) continue;
	    if (!ISXDIGIT(cstr[i+2])) continue;
	    buf[0] = ((char_to_number(cstr[i+1]) << 4)
		      | char_to_number(cstr[i+2]));
	    clen = 2;
	}
	else if (c == '+') {
	    buf[0] = ' ';
	}
	else {
	    continue;
	}

	if (!dest) {
	    dest = rb_str_buf_new(len);
	}

	rb_str_cat(dest, cstr + beg, i - beg);
	i += clen;
	beg = i + 1;

	rb_str_cat(dest, buf, 1);
    }

    if (dest) {
	rb_str_cat(dest, cstr + beg, len - beg);
	preserve_original_state(str, dest);
	cr = ENC_CODERANGE_UNKNOWN;
    }
    else {
	dest = rb_str_dup(str);
	cr = ENC_CODERANGE(str);
    }
    origenc = rb_enc_get_index(str);
    if (origenc != encidx) {
	rb_enc_associate_index(dest, encidx);
	if (!ENC_CODERANGE_CLEAN_P(rb_enc_str_coderange(dest))) {
	    rb_enc_associate_index(dest, origenc);
	    if (cr != ENC_CODERANGE_UNKNOWN)
		ENC_CODERANGE_SET(dest, cr);
	}
    }
    return dest;
}
Esempio n. 3
0
static int
str_transcode(int argc, VALUE *argv, VALUE *self)
{
    VALUE dest;
    VALUE str = *self;
    long blen, slen;
    unsigned char *buf, *bp, *sp, *fromp;
    rb_encoding *from_enc, *to_enc;
    const char *from_e, *to_e;
    int from_encidx, to_encidx;
    VALUE from_encval, to_encval;
    const rb_transcoder *my_transcoder;
    rb_transcoding my_transcoding;
    int final_encoding = 0;
    VALUE opt;
    int options = 0;

    opt = rb_check_convert_type(argv[argc-1], T_HASH, "Hash", "to_hash");
    if (!NIL_P(opt)) {
	VALUE v;

	argc--;
	v = rb_hash_aref(opt, sym_invalid);
	if (NIL_P(v)) {
	    rb_raise(rb_eArgError, "unknown value for invalid: setting");
	}
	else if (v==sym_ignore) {
	    options |= INVALID_IGNORE;
	}
    }
    if (argc < 1 || argc > 2) {
	rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
    }
    if ((to_encidx = rb_to_encoding_index(to_encval = argv[0])) < 0) {
	to_enc = 0;
	to_encidx = 0;
	to_e = StringValueCStr(to_encval);
    }
    else {
	to_enc = rb_enc_from_index(to_encidx);
	to_e = rb_enc_name(to_enc);
    }
    if (argc==1) {
	from_encidx = rb_enc_get_index(str);
	from_enc = rb_enc_from_index(from_encidx);
	from_e = rb_enc_name(from_enc);
    }
    else if ((from_encidx = rb_to_encoding_index(from_encval = argv[1])) < 0) {
	from_enc = 0;
	from_e = StringValueCStr(from_encval);
    }
    else {
	from_enc = rb_enc_from_index(from_encidx);
	from_e = rb_enc_name(from_enc);
    }

    if (from_enc && from_enc == to_enc) {
	return -1;
    }
    if (from_enc && to_enc && rb_enc_asciicompat(from_enc) && rb_enc_asciicompat(to_enc)) {
	if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
	    return to_encidx;
	}
    }
    if (encoding_equal(from_e, to_e)) {
	return -1;
    }

    do { /* loop for multistep transcoding */
	/* later, maybe use smaller intermediate strings for very long strings */
	if (!(my_transcoder = transcode_dispatch(from_e, to_e))) {
	    rb_raise(rb_eArgError, "transcoding not supported (from %s to %s)", from_e, to_e);
	}

	my_transcoding.transcoder = my_transcoder;

	if (my_transcoder->preprocessor) {
	    fromp = sp = (unsigned char *)RSTRING_PTR(str);
	    slen = RSTRING_LEN(str);
	    blen = slen + 30; /* len + margin */
	    dest = rb_str_tmp_new(blen);
	    bp = (unsigned char *)RSTRING_PTR(dest);
	    my_transcoding.ruby_string_dest = dest;
	    (*my_transcoder->preprocessor)(&fromp, &bp, (sp+slen), (bp+blen), &my_transcoding);
	    if (fromp != sp+slen) {
		rb_raise(rb_eArgError, "not fully converted, %td bytes left", sp+slen-fromp);
	    }
	    buf = (unsigned char *)RSTRING_PTR(dest);
	    *bp = '\0';
	    rb_str_set_len(dest, bp - buf);
	    str = dest;
	}
	fromp = sp = (unsigned char *)RSTRING_PTR(str);
	slen = RSTRING_LEN(str);
	blen = slen + 30; /* len + margin */
	dest = rb_str_tmp_new(blen);
	bp = (unsigned char *)RSTRING_PTR(dest);
	my_transcoding.ruby_string_dest = dest;
	my_transcoding.flush_func = str_transcoding_resize;

	transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), my_transcoder, &my_transcoding, options);
	if (fromp != sp+slen) {
	    rb_raise(rb_eArgError, "not fully converted, %td bytes left", sp+slen-fromp);
	}
	buf = (unsigned char *)RSTRING_PTR(dest);
	*bp = '\0';
	rb_str_set_len(dest, bp - buf);
	if (my_transcoder->postprocessor) {
	    str = dest;
	    fromp = sp = (unsigned char *)RSTRING_PTR(str);
	    slen = RSTRING_LEN(str);
	    blen = slen + 30; /* len + margin */
	    dest = rb_str_tmp_new(blen);
	    bp = (unsigned char *)RSTRING_PTR(dest);
	    my_transcoding.ruby_string_dest = dest;
	    (*my_transcoder->postprocessor)(&fromp, &bp, (sp+slen), (bp+blen), &my_transcoding);
	    if (fromp != sp+slen) {
		rb_raise(rb_eArgError, "not fully converted, %td bytes left", sp+slen-fromp);
	    }
	    buf = (unsigned char *)RSTRING_PTR(dest);
	    *bp = '\0';
	    rb_str_set_len(dest, bp - buf);
	}

	if (encoding_equal(my_transcoder->to_encoding, to_e)) {
	    final_encoding = 1;
	}
	else {
	    from_e = my_transcoder->to_encoding;
	    str = dest;
	}
    } while (!final_encoding);
    /* set encoding */
    if (!to_enc) {
	to_encidx = rb_define_dummy_encoding(to_e);
    }
    *self = dest;

    return to_encidx;
}