Пример #1
0
static rb_encoding*
enc_compatible_latter(VALUE str1, VALUE str2, int idx1, int idx2)
{
    int isstr1, isstr2;
    rb_encoding *enc1 = rb_enc_from_index(idx1);
    rb_encoding *enc2 = rb_enc_from_index(idx2);

    isstr2 = RB_TYPE_P(str2, T_STRING);
    if (isstr2 && RSTRING_LEN(str2) == 0)
	return enc1;
    isstr1 = RB_TYPE_P(str1, T_STRING);
    if (isstr1 && RSTRING_LEN(str1) == 0)
	return (rb_enc_asciicompat(enc1) && rb_enc_str_asciionly_p(str2)) ? enc1 : enc2;
    if (!rb_enc_asciicompat(enc1) || !rb_enc_asciicompat(enc2)) {
	return 0;
    }

    /* objects whose encoding is the same of contents */
    if (!isstr2 && idx2 == ENCINDEX_US_ASCII)
	return enc1;
    if (!isstr1 && idx1 == ENCINDEX_US_ASCII)
	return enc2;

    if (!isstr1) {
	VALUE tmp = str1;
	int idx0 = idx1;
	str1 = str2;
	str2 = tmp;
	idx1 = idx2;
	idx2 = idx0;
	idx0 = isstr1;
	isstr1 = isstr2;
	isstr2 = idx0;
    }
    if (isstr1) {
	int cr1, cr2;

	cr1 = rb_enc_str_coderange(str1);
	if (isstr2) {
	    cr2 = rb_enc_str_coderange(str2);
	    if (cr1 != cr2) {
		/* may need to handle ENC_CODERANGE_BROKEN */
		if (cr1 == ENC_CODERANGE_7BIT) return enc2;
		if (cr2 == ENC_CODERANGE_7BIT) return enc1;
	    }
	    if (cr2 == ENC_CODERANGE_7BIT) {
		return enc1;
	    }
	}
	if (cr1 == ENC_CODERANGE_7BIT)
	    return enc2;
    }
    return 0;
}
Пример #2
0
static VALUE
fenix_coerce_to_path(VALUE obj)
{
	VALUE tmp;
	ID to_path;
	rb_encoding *enc;
	int level = rb_safe_level();

	if (insecure_obj_p(obj, level)) {
		rb_insecure_operation();
	}

	CONST_ID(to_path, "to_path");
	tmp = rb_check_funcall(obj, to_path, 0, 0);
	if (tmp == Qundef)
		tmp = obj;

	StringValue(tmp);

	tmp = file_path_convert(tmp);
	if (obj != tmp && insecure_obj_p(tmp, level)) {
		rb_insecure_operation();
	}
	enc = rb_enc_get(tmp);
	if (!rb_enc_asciicompat(enc)) {
		tmp = rb_str_inspect(tmp);
		rb_raise(rb_eEncCompatError, "path name must be ASCII-compatible (%s): %s",
			 rb_enc_name(enc), RSTRING_PTR(tmp));
	}
	return rb_str_new4(tmp);
}
Пример #3
0
VALUE
rb_enc_associate_index(VALUE obj, int idx)
{
    rb_encoding *enc;
    int oldidx, oldtermlen, termlen;

/*    enc_check_capable(obj);*/
    rb_check_frozen(obj);
    oldidx = rb_enc_get_index(obj);
    if (oldidx == idx)
	return obj;
    if (SPECIAL_CONST_P(obj)) {
	rb_raise(rb_eArgError, "cannot set encoding");
    }
    enc = must_encindex(idx);
    if (!ENC_CODERANGE_ASCIIONLY(obj) ||
	!rb_enc_asciicompat(enc)) {
	ENC_CODERANGE_CLEAR(obj);
    }
    termlen = rb_enc_mbminlen(enc);
    oldtermlen = rb_enc_mbminlen(rb_enc_from_index(oldidx));
    if (oldtermlen < termlen && RB_TYPE_P(obj, T_STRING)) {
	rb_str_fill_terminator(obj, termlen);
    }
    enc_set_index(obj, idx);
    return obj;
}
Пример #4
0
static VALUE
str_compat_and_valid(VALUE str, rb_encoding *enc)
{
    int cr;
    str = StringValue(str);
    cr = rb_enc_str_coderange(str);
    if (cr == ENC_CODERANGE_BROKEN) {
#ifdef PRIsVALUE
	rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
#else
	str = rb_inspect(str);
	rb_raise(rb_eArgError, "replacement must be valid byte sequence '%s'", RSTRING_PTR(str));
	RB_GC_GUARD(str);
#endif
    }
    else if (cr == ENC_CODERANGE_7BIT) {
	rb_encoding *e = STR_ENC_GET(str);
	if (!rb_enc_asciicompat(enc)) {
	    rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
		    rb_enc_name(enc), rb_enc_name(e));
	}
    }
    else { /* ENC_CODERANGE_VALID */
	rb_encoding *e = STR_ENC_GET(str);
	if (enc != e) {
	    rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
		    rb_enc_name(enc), rb_enc_name(e));
	}
    }
    return str;
}
Пример #5
0
static VALUE
syserr_initialize(int argc, VALUE *argv, VALUE self)
{
#if !defined(_WIN32)
    char *strerror();
#endif
    const char *err;
    VALUE mesg, error, func;
    VALUE klass = rb_obj_class(self);

    if (klass == rb_eSystemCallError) {
	st_data_t data = (st_data_t)klass;
	rb_scan_args(argc, argv, "12", &mesg, &error, &func);
	if (argc == 1 && FIXNUM_P(mesg)) {
	    error = mesg; mesg = Qnil;
	}
	if (!NIL_P(error) && st_lookup(syserr_tbl, NUM2LONG(error), &data)) {
	    klass = (VALUE)data;
	    /* change class */
	    if (!RB_TYPE_P(self, T_OBJECT)) { /* insurance to avoid type crash */
		rb_raise(rb_eTypeError, "invalid instance type");
	    }
	    RBASIC_SET_CLASS(self, klass);
	}
    }
    else {
	rb_scan_args(argc, argv, "02", &mesg, &func);
	error = rb_const_get(klass, rb_intern("Errno"));
    }
    if (!NIL_P(error)) err = strerror(NUM2INT(error));
    else err = "unknown error";
    if (!NIL_P(mesg)) {
	rb_encoding *le = rb_locale_encoding();
	VALUE str = StringValue(mesg);
	rb_encoding *me = rb_enc_get(mesg);

	if (NIL_P(func))
	    mesg = rb_sprintf("%s - %"PRIsVALUE, err, mesg);
	else
	    mesg = rb_sprintf("%s @ %"PRIsVALUE" - %"PRIsVALUE, err, func, mesg);
	if (le != me && rb_enc_asciicompat(me)) {
	    le = me;
	}/* else assume err is non ASCII string. */
	OBJ_INFECT(mesg, str);
	rb_enc_associate(mesg, le);
    }
    else {
	mesg = rb_str_new2(err);
	rb_enc_associate(mesg, rb_locale_encoding());
    }
    rb_call_super(1, &mesg);
    rb_iv_set(self, "errno", error);
    return self;
}
Пример #6
0
/* Returns encoding index or UNSPECIFIED_ENCODING */
static int
str_find_encindex(VALUE enc)
{
    int idx;

    StringValue(enc);
    if (!rb_enc_asciicompat(rb_enc_get(enc))) {
	rb_raise(rb_eArgError, "invalid name encoding (non ASCII)");
    }
    idx = rb_enc_find_index(StringValueCStr(enc));
    return idx;
}
Пример #7
0
static const char *
must_not_be_anonymous(const char *type, VALUE path)
{
    char *n = RSTRING_PTR(path);

    if (!rb_enc_asciicompat(rb_enc_get(path))) {
	/* cannot occur? */
	rb_raise(rb_eTypeError, "can't dump non-ascii %s name", type);
    }
    if (n[0] == '#') {
	rb_raise(rb_eTypeError, "can't dump anonymous %s %.*s", type,
		 (int)RSTRING_LEN(path), n);
    }
    return n;
}
Пример #8
0
static rb_encoding *
to_encoding(VALUE enc)
{
    int idx;

    StringValue(enc);
    if (!rb_enc_asciicompat(rb_enc_get(enc))) {
	rb_raise(rb_eArgError, "invalid name encoding (non ASCII)");
    }
    idx = rb_enc_find_index(StringValueCStr(enc));
    if (idx < 0) {
	rb_raise(rb_eArgError, "unknown encoding name - %s", RSTRING_PTR(enc));
    }
    return rb_enc_from_index(idx);
}
Пример #9
0
VALUE
rb_enc_associate_index(VALUE obj, int idx)
{
/*    enc_check_capable(obj);*/
    if (rb_enc_get_index(obj) == idx)
	return obj;
    if (SPECIAL_CONST_P(obj)) {
	rb_raise(rb_eArgError, "cannot set encoding");
    }
    if (!ENC_CODERANGE_ASCIIONLY(obj) ||
	!rb_enc_asciicompat(rb_enc_from_index(idx))) {
	ENC_CODERANGE_CLEAR(obj);
    }
    rb_enc_set_index(obj, idx);
    return obj;
}
Пример #10
0
int
rb_to_encoding_index(VALUE enc)
{
    int idx;

    idx = enc_check_encoding(enc);
    if (idx >= 0) {
	return idx;
    }
    else if (NIL_P(enc = rb_check_string_type(enc))) {
	return -1;
    }
    if (!rb_enc_asciicompat(rb_enc_get(enc))) {
	return -1;
    }
    return rb_enc_find_index(StringValueCStr(enc));
}
Пример #11
0
int
rb_to_encoding_index(VALUE enc)
{
    if (CLASS_OF(enc) != rb_cEncoding && TYPE(enc) != T_STRING) {
        return -1;
    }
    else {
        int idx = index_of_encoding((rb_encoding_t *)enc);
        if (idx >= 0) {
            return idx;
        }
        else if (NIL_P(enc = rb_check_string_type(enc))) {
            return -1;
        }
        if (!rb_enc_asciicompat(rb_enc_get(enc))) {
            return -1;
        }
        return rb_enc_find_index(StringValueCStr(enc));
    }
}
Пример #12
0
static VALUE
str_encode_bang(int argc, VALUE *argv, VALUE str)
{
    VALUE newstr = str;
    int encidx = str_transcode(argc, argv, &newstr);
    int cr = 0;

    if (encidx < 0) return str;
    rb_str_shared_replace(str, newstr);
    rb_enc_associate_index(str, encidx);

    /* transcoded string never be broken. */
    if (rb_enc_asciicompat(rb_enc_from_index(encidx))) {
	rb_str_coderange_scan_restartable(RSTRING_PTR(str), RSTRING_END(str), 0, &cr);
    }
    else {
	cr = ENC_CODERANGE_VALID;
    }
    ENC_CODERANGE_SET(str, cr);
    return str;
}
Пример #13
0
VALUE
rb_path_to_class(VALUE pathname)
{
    rb_encoding *enc = rb_enc_get(pathname);
    const char *pbeg, *p, *path = RSTRING_PTR(pathname);
    ID id;
    VALUE c = rb_cObject;

    if (!rb_enc_asciicompat(enc)) {
	rb_raise(rb_eArgError, "invalid class path encoding (non ASCII)");
    }
    pbeg = p = path;
    if (path[0] == '#') {
	rb_raise(rb_eArgError, "can't retrieve anonymous class %s", path);
    }
    while (*p) {
	while (*p && *p != ':') p++;
	id = rb_intern3(pbeg, p-pbeg, enc);
	if (p[0] == ':') {
	    if (p[1] != ':') goto undefined_class;
	    p += 2;
	    pbeg = p;
	}
	if (!rb_const_defined(c, id)) {
	  undefined_class:
	    rb_raise(rb_eArgError, "undefined class/module %.*s", (int)(p-path), path);
	}
	c = rb_const_get_at(c, id);
	switch (TYPE(c)) {
	  case T_MODULE:
	  case T_CLASS:
	    break;
	  default:
	    rb_raise(rb_eTypeError, "%s does not refer to class/module", path);
	}
    }

    return c;
}
Пример #14
0
int
rb_enc_ascget(const char *p, const char *e, int *len, rb_encoding *enc)
{
    unsigned int c, l;
    if (e <= p)
        return -1;
    if (rb_enc_asciicompat(enc)) {
        c = (unsigned char)*p;
        if (!ISASCII(c))
            return -1;
        if (len) *len = 1;
        return c;
    }
    l = rb_enc_precise_mbclen(p, e, enc);
    if (!MBCLEN_CHARFOUND_P(l))
        return -1;
    c = rb_enc_mbc_to_codepoint(p, e, enc);
    if (!rb_enc_isascii(c, enc))
        return -1;
    if (len) *len = l;
    return c;
}
Пример #15
0
static int
str_transcode(int argc, VALUE *argv, VALUE *self)
{
    VALUE dest;
    VALUE str = *self;
    long blen, slen;
    unsigned char *buf, *bp, *sp, *fromp;
    rb_encoding *from_enc, *to_enc;
    const char *from_e, *to_e;
    int from_encidx, to_encidx;
    VALUE from_encval, to_encval;
    const rb_transcoder *my_transcoder;
    rb_transcoding my_transcoding;
    int final_encoding = 0;
    VALUE opt;
    int options = 0;

    opt = rb_check_convert_type(argv[argc-1], T_HASH, "Hash", "to_hash");
    if (!NIL_P(opt)) {
	VALUE v;

	argc--;
	v = rb_hash_aref(opt, sym_invalid);
	if (NIL_P(v)) {
	    rb_raise(rb_eArgError, "unknown value for invalid: setting");
	}
	else if (v==sym_ignore) {
	    options |= INVALID_IGNORE;
	}
    }
    if (argc < 1 || argc > 2) {
	rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
    }
    if ((to_encidx = rb_to_encoding_index(to_encval = argv[0])) < 0) {
	to_enc = 0;
	to_encidx = 0;
	to_e = StringValueCStr(to_encval);
    }
    else {
	to_enc = rb_enc_from_index(to_encidx);
	to_e = rb_enc_name(to_enc);
    }
    if (argc==1) {
	from_encidx = rb_enc_get_index(str);
	from_enc = rb_enc_from_index(from_encidx);
	from_e = rb_enc_name(from_enc);
    }
    else if ((from_encidx = rb_to_encoding_index(from_encval = argv[1])) < 0) {
	from_enc = 0;
	from_e = StringValueCStr(from_encval);
    }
    else {
	from_enc = rb_enc_from_index(from_encidx);
	from_e = rb_enc_name(from_enc);
    }

    if (from_enc && from_enc == to_enc) {
	return -1;
    }
    if (from_enc && to_enc && rb_enc_asciicompat(from_enc) && rb_enc_asciicompat(to_enc)) {
	if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
	    return to_encidx;
	}
    }
    if (encoding_equal(from_e, to_e)) {
	return -1;
    }

    do { /* loop for multistep transcoding */
	/* later, maybe use smaller intermediate strings for very long strings */
	if (!(my_transcoder = transcode_dispatch(from_e, to_e))) {
	    rb_raise(rb_eArgError, "transcoding not supported (from %s to %s)", from_e, to_e);
	}

	my_transcoding.transcoder = my_transcoder;

	if (my_transcoder->preprocessor) {
	    fromp = sp = (unsigned char *)RSTRING_PTR(str);
	    slen = RSTRING_LEN(str);
	    blen = slen + 30; /* len + margin */
	    dest = rb_str_tmp_new(blen);
	    bp = (unsigned char *)RSTRING_PTR(dest);
	    my_transcoding.ruby_string_dest = dest;
	    (*my_transcoder->preprocessor)(&fromp, &bp, (sp+slen), (bp+blen), &my_transcoding);
	    if (fromp != sp+slen) {
		rb_raise(rb_eArgError, "not fully converted, %td bytes left", sp+slen-fromp);
	    }
	    buf = (unsigned char *)RSTRING_PTR(dest);
	    *bp = '\0';
	    rb_str_set_len(dest, bp - buf);
	    str = dest;
	}
	fromp = sp = (unsigned char *)RSTRING_PTR(str);
	slen = RSTRING_LEN(str);
	blen = slen + 30; /* len + margin */
	dest = rb_str_tmp_new(blen);
	bp = (unsigned char *)RSTRING_PTR(dest);
	my_transcoding.ruby_string_dest = dest;
	my_transcoding.flush_func = str_transcoding_resize;

	transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), my_transcoder, &my_transcoding, options);
	if (fromp != sp+slen) {
	    rb_raise(rb_eArgError, "not fully converted, %td bytes left", sp+slen-fromp);
	}
	buf = (unsigned char *)RSTRING_PTR(dest);
	*bp = '\0';
	rb_str_set_len(dest, bp - buf);
	if (my_transcoder->postprocessor) {
	    str = dest;
	    fromp = sp = (unsigned char *)RSTRING_PTR(str);
	    slen = RSTRING_LEN(str);
	    blen = slen + 30; /* len + margin */
	    dest = rb_str_tmp_new(blen);
	    bp = (unsigned char *)RSTRING_PTR(dest);
	    my_transcoding.ruby_string_dest = dest;
	    (*my_transcoder->postprocessor)(&fromp, &bp, (sp+slen), (bp+blen), &my_transcoding);
	    if (fromp != sp+slen) {
		rb_raise(rb_eArgError, "not fully converted, %td bytes left", sp+slen-fromp);
	    }
	    buf = (unsigned char *)RSTRING_PTR(dest);
	    *bp = '\0';
	    rb_str_set_len(dest, bp - buf);
	}

	if (encoding_equal(my_transcoder->to_encoding, to_e)) {
	    final_encoding = 1;
	}
	else {
	    from_e = my_transcoder->to_encoding;
	    str = dest;
	}
    } while (!final_encoding);
    /* set encoding */
    if (!to_enc) {
	to_encidx = rb_define_dummy_encoding(to_e);
    }
    *self = dest;

    return to_encidx;
}
Пример #16
0
/**
 * @param str the string to be scrubbed
 * @param repl the replacement character
 * @return If given string is invalid, returns a new string. Otherwise, returns Qnil.
 */
static VALUE
str_scrub0(int argc, VALUE *argv, VALUE str)
{
    int cr = ENC_CODERANGE(str);
    rb_encoding *enc;
    int encidx;
    VALUE repl;

    if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID)
	return Qnil;

    enc = STR_ENC_GET(str);
    rb_scan_args(argc, argv, "01", &repl);
    if (argc != 0) {
	repl = str_compat_and_valid(repl, enc);
    }

    if (rb_enc_dummy_p(enc)) {
	return Qnil;
    }
    encidx = rb_enc_to_index(enc);

#define DEFAULT_REPLACE_CHAR(str) do { \
	static const char replace[sizeof(str)-1] = str; \
	rep = replace; replen = (int)sizeof(replace); \
    } while (0)

    if (rb_enc_asciicompat(enc)) {
	const char *p = RSTRING_PTR(str);
	const char *e = RSTRING_END(str);
	const char *p1 = p;
	const char *rep;
	long replen;
	int rep7bit_p;
	VALUE buf = Qnil;
	if (rb_block_given_p()) {
	    rep = NULL;
	    replen = 0;
	    rep7bit_p = FALSE;
	}
	else if (!NIL_P(repl)) {
	    rep = RSTRING_PTR(repl);
	    replen = RSTRING_LEN(repl);
	    rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
	}
	else if (encidx == rb_utf8_encindex()) {
	    DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
	    rep7bit_p = FALSE;
	}
	else {
	    DEFAULT_REPLACE_CHAR("?");
	    rep7bit_p = TRUE;
	}
	cr = ENC_CODERANGE_7BIT;

	p = search_nonascii(p, e);
	if (!p) {
	    p = e;
	}
	while (p < e) {
	    int ret = rb_enc_precise_mbclen(p, e, enc);
	    if (MBCLEN_NEEDMORE_P(ret)) {
		break;
	    }
	    else if (MBCLEN_CHARFOUND_P(ret)) {
		cr = ENC_CODERANGE_VALID;
		p += MBCLEN_CHARFOUND_LEN(ret);
	    }
	    else if (MBCLEN_INVALID_P(ret)) {
		/*
		 * p1~p: valid ascii/multibyte chars
		 * p ~e: invalid bytes + unknown bytes
		 */
		long clen = rb_enc_mbmaxlen(enc);
		if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
		if (p > p1) {
		    rb_str_buf_cat(buf, p1, p - p1);
		}

		if (e - p < clen) clen = e - p;
		if (clen <= 2) {
		    clen = 1;
		}
		else {
		    const char *q = p;
		    clen--;
		    for (; clen > 1; clen--) {
			ret = rb_enc_precise_mbclen(q, q + clen, enc);
			if (MBCLEN_NEEDMORE_P(ret)) break;
			if (MBCLEN_INVALID_P(ret)) continue;
			UNREACHABLE;
		    }
		}
		if (rep) {
		    rb_str_buf_cat(buf, rep, replen);
		    if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
		}
		else {
		    repl = rb_yield(rb_enc_str_new(p, clen, enc));
		    repl = str_compat_and_valid(repl, enc);
		    rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
		    if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
			cr = ENC_CODERANGE_VALID;
		}
		p += clen;
		p1 = p;
		p = search_nonascii(p, e);
		if (!p) {
		    p = e;
		    break;
		}
	    }
	    else {
		UNREACHABLE;
	    }
	}
	if (NIL_P(buf)) {
	    if (p == e) {
		ENC_CODERANGE_SET(str, cr);
		return Qnil;
	    }
	    buf = rb_str_buf_new(RSTRING_LEN(str));
	}
	if (p1 < p) {
	    rb_str_buf_cat(buf, p1, p - p1);
	}
	if (p < e) {
	    if (rep) {
		rb_str_buf_cat(buf, rep, replen);
		if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
	    }
	    else {
		repl = rb_yield(rb_enc_str_new(p, e-p, enc));
		repl = str_compat_and_valid(repl, enc);
		rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
		if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
		    cr = ENC_CODERANGE_VALID;
	    }
	}
	ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
	return buf;
    }
    else {
	/* ASCII incompatible */
	const char *p = RSTRING_PTR(str);
	const char *e = RSTRING_END(str);
	const char *p1 = p;
	VALUE buf = Qnil;
	const char *rep;
	long replen;
	long mbminlen = rb_enc_mbminlen(enc);
	if (!NIL_P(repl)) {
	    rep = RSTRING_PTR(repl);
	    replen = RSTRING_LEN(repl);
	}
	else if (!strcasecmp(rb_enc_name(enc), "UTF-16BE")) {
	    DEFAULT_REPLACE_CHAR("\xFF\xFD");
	}
	else if (!strcasecmp(rb_enc_name(enc), "UTF-16LE")) {
	    DEFAULT_REPLACE_CHAR("\xFD\xFF");
	}
	else if (!strcasecmp(rb_enc_name(enc), "UTF-32BE")) {
	    DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
	}
	else if (!strcasecmp(rb_enc_name(enc), "UTF-32lE")) {
	    DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
	}
	else {
	    DEFAULT_REPLACE_CHAR("?");
	}

	while (p < e) {
	    int ret = rb_enc_precise_mbclen(p, e, enc);
	    if (MBCLEN_NEEDMORE_P(ret)) {
		break;
	    }
	    else if (MBCLEN_CHARFOUND_P(ret)) {
		p += MBCLEN_CHARFOUND_LEN(ret);
	    }
	    else if (MBCLEN_INVALID_P(ret)) {
		const char *q = p;
		long clen = rb_enc_mbmaxlen(enc);
		if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
		if (p > p1) rb_str_buf_cat(buf, p1, p - p1);

		if (e - p < clen) clen = e - p;
		if (clen <= mbminlen * 2) {
		    clen = mbminlen;
		}
		else {
		    clen -= mbminlen;
		    for (; clen > mbminlen; clen-=mbminlen) {
			ret = rb_enc_precise_mbclen(q, q + clen, enc);
			if (MBCLEN_NEEDMORE_P(ret)) break;
			if (MBCLEN_INVALID_P(ret)) continue;
			UNREACHABLE;
		    }
		}
		if (rep) {
		    rb_str_buf_cat(buf, rep, replen);
		}
		else {
		    repl = rb_yield(rb_enc_str_new(p, e-p, enc));
		    repl = str_compat_and_valid(repl, enc);
		    rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
		}
		p += clen;
		p1 = p;
	    }
	    else {
		UNREACHABLE;
	    }
	}
	if (NIL_P(buf)) {
	    if (p == e) {
		ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
		return Qnil;
	    }
	    buf = rb_str_buf_new(RSTRING_LEN(str));
	}
	if (p1 < p) {
	    rb_str_buf_cat(buf, p1, p - p1);
	}
	if (p < e) {
	    if (rep) {
		rb_str_buf_cat(buf, rep, replen);
	    }
	    else {
		repl = rb_yield(rb_enc_str_new(p, e-p, enc));
		repl = str_compat_and_valid(repl, enc);
		rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
	    }
	}
	ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), ENC_CODERANGE_VALID);
	return buf;
    }
}
Пример #17
0
/*
 * call-seq:
 *   enc.ascii_compatible? -> true or false
 *
 * Returns whether ASCII-compatible or not.
 *
 *   Encoding::UTF_8.ascii_compatible?     #=> true
 *   Encoding::UTF_16BE.ascii_compatible?  #=> false
 *
 */
static VALUE
enc_ascii_compatible_p(VALUE enc)
{
    return rb_enc_asciicompat(must_encoding(enc)) ? Qtrue : Qfalse;
}
Пример #18
0
rb_encoding*
rb_enc_compatible(VALUE str1, VALUE str2)
{
    int idx1, idx2;
    rb_encoding *enc1, *enc2;

    idx1 = rb_enc_get_index(str1);
    idx2 = rb_enc_get_index(str2);

    if (idx1 < 0 || idx2 < 0)
        return 0;

    if (idx1 == idx2) {
	return rb_enc_from_index(idx1);
    }
    enc1 = rb_enc_from_index(idx1);
    enc2 = rb_enc_from_index(idx2);

    if (TYPE(str2) == T_STRING && RSTRING_LEN(str2) == 0)
	return (idx1 == ENCINDEX_US_ASCII && rb_enc_asciicompat(enc2)) ? enc2 : enc1;
    if (TYPE(str1) == T_STRING && RSTRING_LEN(str1) == 0)
	return (idx2 == ENCINDEX_US_ASCII && rb_enc_asciicompat(enc1)) ? enc1 : enc2;
    if (!rb_enc_asciicompat(enc1) || !rb_enc_asciicompat(enc2)) {
	return 0;
    }

    /* objects whose encoding is the same of contents */
    if (BUILTIN_TYPE(str2) != T_STRING && idx2 == ENCINDEX_US_ASCII)
	return enc1;
    if (BUILTIN_TYPE(str1) != T_STRING && idx1 == ENCINDEX_US_ASCII)
	return enc2;

    if (BUILTIN_TYPE(str1) != T_STRING) {
	VALUE tmp = str1;
	int idx0 = idx1;
	str1 = str2;
	str2 = tmp;
	idx1 = idx2;
	idx2 = idx0;
    }
    if (BUILTIN_TYPE(str1) == T_STRING) {
	int cr1, cr2;

	cr1 = rb_enc_str_coderange(str1);
	if (BUILTIN_TYPE(str2) == T_STRING) {
	    cr2 = rb_enc_str_coderange(str2);
	    if (cr1 != cr2) {
		/* may need to handle ENC_CODERANGE_BROKEN */
		if (cr1 == ENC_CODERANGE_7BIT) return enc2;
		if (cr2 == ENC_CODERANGE_7BIT) return enc1;
	    }
	    if (cr2 == ENC_CODERANGE_7BIT) {
		if (idx1 == ENCINDEX_ASCII) return enc2;
		return enc1;
	    }
	}
	if (cr1 == ENC_CODERANGE_7BIT)
	    return enc2;
    }
    return 0;
}
Пример #19
0
/*
 * call-seq:
 *   enc.ascii_compatible? -> true or false
 *
 * Returns whether ASCII-compatible or not.
 *
 *   Encoding::UTF_8.ascii_compatible?     #=> true
 *   Encoding::UTF_16BE.ascii_compatible?  #=> false
 *
 */
static VALUE
enc_ascii_compatible_p(VALUE enc)
{
    return rb_enc_asciicompat(enc_table.list[must_encoding(enc)].enc) ? Qtrue : Qfalse;
}