/*
 * Pick the encoding that results from combining two objects whose encoding
 * indexes are already known.
 *
 * str1, str2 - the operands; length and code range are only inspected for
 *              operands that are Strings.
 * idx1, idx2 - encoding indexes of str1 and str2 respectively.
 *
 * Returns the compatible encoding, or 0 when the operands are incompatible.
 */
static rb_encoding*
enc_compatible_latter(VALUE str1, VALUE str2, int idx1, int idx2)
{
    int isstr1, isstr2;
    rb_encoding *enc1 = rb_enc_from_index(idx1);
    rb_encoding *enc2 = rb_enc_from_index(idx2);

    /* an empty right-hand String never changes the left-hand encoding */
    isstr2 = RB_TYPE_P(str2, T_STRING);
    if (isstr2 && RSTRING_LEN(str2) == 0)
        return enc1;

    /*
     * An empty left-hand String keeps its encoding only if the right-hand
     * String is ASCII-only.  The shortcut hands str2 to the String helper
     * rb_enc_str_asciionly_p(), so it is taken only when str2 really is a
     * String; other objects fall through to the generic rules below.
     */
    isstr1 = RB_TYPE_P(str1, T_STRING);
    if (isstr1 && isstr2 && RSTRING_LEN(str1) == 0)
        return (rb_enc_asciicompat(enc1) && rb_enc_str_asciionly_p(str2)) ? enc1 : enc2;

    if (!rb_enc_asciicompat(enc1) || !rb_enc_asciicompat(enc2)) {
        return 0;
    }

    /* objects whose encoding is the same of contents */
    if (!isstr2 && idx2 == ENCINDEX_US_ASCII)
        return enc1;
    if (!isstr1 && idx1 == ENCINDEX_US_ASCII)
        return enc2;

    /* normalize so that, if either operand is a String, str1 is one */
    if (!isstr1) {
        VALUE tmp_str = str1;
        int tmp_idx = idx1;
        int tmp_isstr = isstr1;

        str1 = str2;
        str2 = tmp_str;
        idx1 = idx2;
        idx2 = tmp_idx;
        isstr1 = isstr2;
        isstr2 = tmp_isstr;
        /*
         * NOTE(review): enc1/enc2 are not swapped together with str/idx, so
         * from here on enc2 is the encoding of the (new) str1 -- confirm this
         * is intended.
         */
    }
    if (isstr1) {
        int cr1, cr2;

        cr1 = rb_enc_str_coderange(str1);
        if (isstr2) {
            cr2 = rb_enc_str_coderange(str2);
            if (cr1 != cr2) {
                /* may need to handle ENC_CODERANGE_BROKEN */
                if (cr1 == ENC_CODERANGE_7BIT) return enc2;
                if (cr2 == ENC_CODERANGE_7BIT) return enc1;
            }
            if (cr2 == ENC_CODERANGE_7BIT) {
                return enc1;
            }
        }
        if (cr1 == ENC_CODERANGE_7BIT)
            return enc2;
    }
    return 0;
}
/*
 * Check that a replacement string can be used with encoding `enc`.
 *
 * `str` is first converted with StringValue().  The result is then judged by
 * its code range:
 *   - broken: ArgumentError is raised;
 *   - 7bit:   `enc` must be ASCII compatible;
 *   - other:  the string's own encoding must be exactly `enc`.
 * Failing either of the last two checks raises Encoding::CompatibilityError.
 *
 * Returns the (converted) string when every check passes.
 */
static VALUE
str_compat_and_valid(VALUE str, rb_encoding *enc)
{
    int coderange;

    str = StringValue(str);
    coderange = rb_enc_str_coderange(str);

    if (coderange == ENC_CODERANGE_BROKEN) {
#ifdef PRIsVALUE
        rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
#else
        str = rb_inspect(str);
        rb_raise(rb_eArgError, "replacement must be valid byte sequence '%s'", RSTRING_PTR(str));
        RB_GC_GUARD(str);
#endif
    }
    else {
        rb_encoding *str_enc = STR_ENC_GET(str);
        int compatible;

        if (coderange == ENC_CODERANGE_7BIT) {
            /* pure ASCII content fits any ASCII-compatible target */
            compatible = rb_enc_asciicompat(enc);
        }
        else {
            /* ENC_CODERANGE_VALID: the encodings have to be identical */
            compatible = (enc == str_enc);
        }

        if (!compatible) {
            rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
                     rb_enc_name(enc), rb_enc_name(str_enc));
        }
    }
    return str;
}
/*
 * Report the value returned by rb_enc_str_coderange(str) as a Symbol:
 * :coderange_unknown, :coderange_7bit, :coderange_valid, :coderange_broken,
 * or :coderange_unrecognized for any other value.  `self` is not used.
 */
static VALUE
encoding_spec_rb_enc_str_coderange(VALUE self, VALUE str)
{
    const char *label;

    switch (rb_enc_str_coderange(str)) {
      case ENC_CODERANGE_UNKNOWN:
        label = "coderange_unknown";
        break;
      case ENC_CODERANGE_7BIT:
        label = "coderange_7bit";
        break;
      case ENC_CODERANGE_VALID:
        label = "coderange_valid";
        break;
      case ENC_CODERANGE_BROKEN:
        label = "coderange_broken";
        break;
      default:
        label = "coderange_unrecognized";
        break;
    }

    return ID2SYM(rb_intern(label));
}
// Re-encodes `value` for storage in a field of the given type and freezes the
// result.
//
// UPB_TYPE_STRING fields are encoded with kRubyStringUtf8Encoding and raise
// EncodingError when the encoded bytes have a broken code range; every other
// type is encoded with kRubyString8bitEncoding and is not validated.
//
// Returns the encoded, frozen string.
VALUE native_slot_encode_and_freeze_string(upb_fieldtype_t type, VALUE value) {
  const int is_text_field = (type == UPB_TYPE_STRING);
  rb_encoding* target_encoding;
  VALUE encoded;

  if (is_text_field) {
    target_encoding = kRubyStringUtf8Encoding;
  } else {
    target_encoding = kRubyString8bitEncoding;
  }

  // Per the original note: the underlying string data is only duplicated
  // when that is actually necessary.
  encoded = rb_str_encode(value, rb_enc_from_encoding(target_encoding), 0, Qnil);

  if (is_text_field && rb_enc_str_coderange(encoded) == ENC_CODERANGE_BROKEN) {
    rb_raise(rb_eEncodingError, "String is invalid UTF-8");
  }

  // Freeze so the stored data stays valid. Because the value went through
  // #encode just above, this is not the string object the user assigned.
  rb_obj_freeze(encoded);
  return encoded;
}
/*
 * Determine the encoding that results from combining str1 and str2.
 *
 * Both operands are looked up with rb_enc_get_index(); if either has no
 * encoding index (negative result) there is no compatible encoding.  Equal
 * indexes are trivially compatible.  Otherwise the decision depends on
 * whether each operand is a String, on empty Strings, on ASCII
 * compatibility of both encodings and on the Strings' code ranges.
 *
 * Returns the compatible encoding, or 0 when the operands are incompatible.
 */
rb_encoding*
rb_enc_compatible(VALUE str1, VALUE str2)
{
    int idx1, idx2;
    int isstr1, isstr2;
    rb_encoding *enc1, *enc2;

    idx1 = rb_enc_get_index(str1);
    idx2 = rb_enc_get_index(str2);

    if (idx1 < 0 || idx2 < 0)
        return 0;

    if (idx1 == idx2) {
        return rb_enc_from_index(idx1);
    }
    enc1 = rb_enc_from_index(idx1);
    enc2 = rb_enc_from_index(idx2);

    /*
     * Classify each operand once, with TYPE(), and reuse the answer.  TYPE()
     * also handles special constants, whereas BUILTIN_TYPE() reads the object
     * header unconditionally.  Whether special constants can get this far
     * depends on rb_enc_get_index(), which is defined elsewhere.
     */
    isstr1 = (TYPE(str1) == T_STRING);
    isstr2 = (TYPE(str2) == T_STRING);

    /* an empty String adopts the other side's encoding where possible */
    if (isstr2 && RSTRING_LEN(str2) == 0)
        return (idx1 == ENCINDEX_US_ASCII && rb_enc_asciicompat(enc2)) ? enc2 : enc1;
    if (isstr1 && RSTRING_LEN(str1) == 0)
        return (idx2 == ENCINDEX_US_ASCII && rb_enc_asciicompat(enc1)) ? enc1 : enc2;

    if (!rb_enc_asciicompat(enc1) || !rb_enc_asciicompat(enc2)) {
        return 0;
    }

    /* objects whose encoding is the same of contents */
    if (!isstr2 && idx2 == ENCINDEX_US_ASCII)
        return enc1;
    if (!isstr1 && idx1 == ENCINDEX_US_ASCII)
        return enc2;

    /* normalize so that, if either operand is a String, str1 is one */
    if (!isstr1) {
        VALUE tmp_str = str1;
        int tmp_idx = idx1;
        int tmp_isstr = isstr1;

        str1 = str2;
        str2 = tmp_str;
        idx1 = idx2;
        idx2 = tmp_idx;
        isstr1 = isstr2;
        isstr2 = tmp_isstr;
        /*
         * NOTE(review): enc1/enc2 are not swapped together with str/idx, so
         * from here on enc2 is the encoding of the (new) str1 -- confirm this
         * is intended.
         */
    }
    if (isstr1) {
        int cr1, cr2;

        cr1 = rb_enc_str_coderange(str1);
        if (isstr2) {
            cr2 = rb_enc_str_coderange(str2);
            if (cr1 != cr2) {
                /* may need to handle ENC_CODERANGE_BROKEN */
                if (cr1 == ENC_CODERANGE_7BIT) return enc2;
                if (cr2 == ENC_CODERANGE_7BIT) return enc1;
            }
            if (cr2 == ENC_CODERANGE_7BIT) {
                /* same code range on both sides and it is 7bit */
                if (idx1 == ENCINDEX_ASCII) return enc2;
                return enc1;
            }
        }
        if (cr1 == ENC_CODERANGE_7BIT)
            return enc2;
    }
    return 0;
}
/*
 * URL-decode `str`: each "%XX" with two hex digits becomes the single byte it
 * encodes and each '+' becomes a space.  A '%' that is not followed by two
 * hex digits (including one too close to the end of the input) is copied
 * through unchanged.
 *
 * str      - the String to decode; it is not modified.
 * encoding - the encoding the result should carry, resolved with
 *            rb_to_encoding_index().
 *
 * Returns a new String.  When nothing needed decoding it is a dup of `str`.
 * If the requested encoding differs from the encoding of `str`, the result is
 * tagged with it only when its code range under that encoding satisfies
 * ENC_CODERANGE_CLEAN_P; otherwise the original encoding index is restored.
 */
static VALUE
optimized_unescape(VALUE str, VALUE encoding)
{
    long i, len, beg = 0;
    VALUE dest = 0;
    const char *cstr;
    int cr, origenc, encidx = rb_to_encoding_index(encoding);

    len  = RSTRING_LEN(str);
    cstr = RSTRING_PTR(str);

    for (i = 0; i < len; ++i) {
        char buf[1];
        const char c = cstr[i];
        int clen = 0;   /* extra input bytes consumed beyond the current one */

        if (c == '%') {
            /*
             * Too close to the end for "%XX": leave the '%' as it is, but
             * keep scanning so that a '+' after it is still decoded.
             */
            if (i + 3 > len) continue;
            if (!ISXDIGIT(cstr[i+1])) continue;
            if (!ISXDIGIT(cstr[i+2])) continue;
            buf[0] = ((char_to_number(cstr[i+1]) << 4)
                      | char_to_number(cstr[i+2]));
            clen = 2;
        }
        else if (c == '+') {
            buf[0] = ' ';
        }
        else {
            continue;
        }

        /* allocate the output lazily, on the first byte that needs decoding */
        if (!dest) {
            dest = rb_str_buf_new(len);
        }

        /* flush the literal run preceding this escape, then the decoded byte */
        rb_str_cat(dest, cstr + beg, i - beg);
        i += clen;
        beg = i + 1;

        rb_str_cat(dest, buf, 1);
    }

    if (dest) {
        /* append whatever literal text follows the last escape */
        rb_str_cat(dest, cstr + beg, len - beg);
        preserve_original_state(str, dest);
        cr = ENC_CODERANGE_UNKNOWN;
    }
    else {
        dest = rb_str_dup(str);
        cr = ENC_CODERANGE(str);
    }

    origenc = rb_enc_get_index(str);
    if (origenc != encidx) {
        rb_enc_associate_index(dest, encidx);
        if (!ENC_CODERANGE_CLEAN_P(rb_enc_str_coderange(dest))) {
            /* not clean under the requested encoding: revert to the original */
            rb_enc_associate_index(dest, origenc);
            if (cr != ENC_CODERANGE_UNKNOWN)
                ENC_CODERANGE_SET(dest, cr);
        }
    }
    return dest;
}