/*
 * Tags obj with the encoding whose index is idx and returns obj.
 *
 * Nothing happens when obj already carries that index.  Special constants
 * cannot be re-tagged: ArgumentError ("cannot set encoding") is raised.
 * The cached code range is kept only when obj is ASCII-only and the new
 * encoding is ASCII compatible; in every other case it is cleared before
 * the index is stored.
 */
VALUE
rb_enc_associate_index(VALUE obj, int idx)
{
    int coderange_survives;

    if (rb_enc_get_index(obj) != idx) {
        if (SPECIAL_CONST_P(obj)) {
            rb_raise(rb_eArgError, "cannot set encoding");
        }

        coderange_survives = ENC_CODERANGE_ASCIIONLY(obj) &&
                             rb_enc_asciicompat(rb_enc_from_index(idx));
        if (!coderange_survives) {
            ENC_CODERANGE_CLEAR(obj);
        }
        rb_enc_set_index(obj, idx);
    }
    return obj;
}
/*
 * Picks the encoding shared by str1 and str2.
 *
 * Returns 0 when either index lookup yields a negative value, the common
 * encoding when both indexes are equal, and otherwise whatever
 * enc_compatible_latter() decides.
 */
static rb_encoding *
enc_compatible_str(VALUE str1, VALUE str2)
{
    const int first = enc_get_index_str(str1);
    const int second = enc_get_index_str(str2);

    if (first < 0 || second < 0) {
        return 0;
    }
    return (first == second)
        ? rb_enc_from_index(first)
        : enc_compatible_latter(str1, str2, first, second);
}
/*
 * call-seq:
 *   enc.inspect -> string
 *
 * Returns a programmer-oriented string describing the encoding.
 *
 *   Encoding::UTF_8.inspect       #=> "#<Encoding:UTF-8>"
 *   Encoding::ISO_2022_JP.inspect #=> "#<Encoding:ISO-2022-JP (dummy)>"
 */
static VALUE
enc_inspect(VALUE self)
{
    rb_encoding *enc;
    const char *dummy_suffix;
    const char *autoload_suffix;

    if (!is_data_encoding(self)) {
        not_encoding(self);
    }

    /* The wrapped pointer must round-trip through the index lookup. */
    enc = DATA_PTR(self);
    if (!enc || rb_enc_from_index(rb_enc_to_index(enc)) != enc) {
        rb_raise(rb_eTypeError, "broken Encoding");
    }

    dummy_suffix = ENC_DUMMY_P(enc) ? " (dummy)" : "";
    autoload_suffix = enc_autoload_p(enc) ? " (autoload)" : "";

    return rb_enc_sprintf(rb_usascii_encoding(),
                          "#<%"PRIsVALUE":%s%s%s>",
                          rb_obj_class(self),
                          rb_enc_name(enc),
                          dummy_suffix,
                          autoload_suffix);
}
/*
 * Maps the external encoding reported by the IO-like object src onto a
 * YAML parser encoding constant, stored through parser_encoding.
 *
 * src is asked for `external_encoding`; a nil answer is treated as
 * ASCII-8BIT.  US-ASCII and UTF-8 both select YAML_UTF8_ENCODING,
 * UTF-16LE / UTF-16BE select their YAML counterparts and ASCII-8BIT
 * selects YAML_ANY_ENCODING.  Any other encoding raises ArgumentError.
 *
 * Returns src unchanged on success.
 */
static VALUE
transcode_io(VALUE src, int *parser_encoding)
{
    VALUE ext_enc = rb_funcall(src, rb_intern("external_encoding"), 0);
    int enc_idx;

    /* No encoding reported: fall back to ASCII-8BIT. */
    enc_idx = NIL_P(ext_enc) ? rb_ascii8bit_encindex()
                             : rb_to_encoding_index(ext_enc);

    if (enc_idx == rb_usascii_encindex() || enc_idx == rb_utf8_encindex()) {
        /* US-ASCII is handled exactly like UTF-8. */
        *parser_encoding = YAML_UTF8_ENCODING;
    }
    else if (enc_idx == rb_enc_find_index("UTF-16LE")) {
        *parser_encoding = YAML_UTF16LE_ENCODING;
    }
    else if (enc_idx == rb_enc_find_index("UTF-16BE")) {
        *parser_encoding = YAML_UTF16BE_ENCODING;
    }
    else if (enc_idx == rb_ascii8bit_encindex()) {
        /* Binary input: let the parser guess. */
        *parser_encoding = YAML_ANY_ENCODING;
    }
    else {
        rb_raise(rb_eArgError, "YAML file must be UTF-8, UTF-16LE, or UTF-16BE, not %s",
                 rb_enc_name(rb_enc_from_index(enc_idx)));
        return Qnil; /* not reached */
    }

    return src;
}
/*
 * Converts a path string to the filesystem encoding where that is needed.
 *
 * On Windows builds the name is returned untouched.  Elsewhere the name is
 * converted with rb_str_conv_enc() only if all of the following hold: a
 * default internal encoding is set, the name is neither US-ASCII nor
 * ASCII-8BIT, its encoding differs from the filesystem encoding, and it
 * is not ASCII-only.
 */
static VALUE
file_path_convert(VALUE name)
{
#ifndef _WIN32 /* non Windows == Unix */
    rb_encoding *name_enc = rb_enc_from_index(ENCODING_GET(name));
    rb_encoding *fs_enc;

    if (rb_default_internal_encoding() == NULL) return name;
    if (name_enc == rb_usascii_encoding()) return name;
    if (name_enc == rb_ascii8bit_encoding()) return name;

    /* rb_filesystem_encoding() must only be consulted after the US-ASCII
     * and ASCII-8BIT checks above have been passed. */
    fs_enc = rb_filesystem_encoding();
    if (fs_enc == name_enc) return name;
    if (rb_enc_str_asciionly_p(name)) return name;

    /* fs_enc should be ASCII compatible */
    name = rb_str_conv_enc(name, name_enc, fs_enc);
#endif
    return name;
}
/*
 * Resolves index to an encoding or raises.
 *
 * Raises EncodingError when no encoding exists for index, or when the
 * encoding's own index does not match the masked value of index.  If the
 * encoding is flagged for autoload and enc_autoload() returns -1, a load
 * error is raised.  Otherwise the encoding is returned.
 */
static rb_encoding *
must_encindex(int index)
{
    rb_encoding *found = rb_enc_from_index(index);

    if (!found) {
        rb_raise(rb_eEncodingError, "encoding index out of bound: %d",
                 index);
    }

    if ((int)(index & ENC_INDEX_MASK) != ENC_TO_ENCINDEX(found)) {
        rb_raise(rb_eEncodingError, "wrong encoding index %d for %s (expected %d)",
                 index, rb_enc_name(found), ENC_TO_ENCINDEX(found));
    }

    if (enc_autoload_p(found)) {
        if (enc_autoload(found) == -1) {
            rb_loaderror("failed to load encoding (%s)",
                         rb_enc_name(found));
        }
    }
    return found;
}
/*
 * Prepares a SAX driver for a parse run.
 *
 * dr      - driver state to fill in
 * handler - Ruby object that receives the callbacks
 * io      - input source; also asked for its external encoding
 * options - parse options, copied by value into the driver
 *
 * The encoding comes from ox_default_options.encoding when that string is
 * non-empty.  Otherwise the io object is asked for its external encoding,
 * provided it responds to that method.  How the encoding is stored depends
 * on the build flags HAS_ENCODING_SUPPORT / HAS_PRIVATE_ENCODING.
 */
static void
sax_drive_init(SaxDrive dr, VALUE handler, VALUE io, SaxOptions options)
{
    ox_sax_buf_init(&dr->buf, io);
    dr->buf.dr = dr;
    stack_init(&dr->stack);
    dr->handler = handler;
    /* Wrapper object around the driver, registered with the GC right after
     * creation; both callback arguments are passed as 0. */
    dr->value_obj = rb_data_object_alloc(ox_sax_value_class, dr, 0, 0);
    rb_gc_register_address(&dr->value_obj);
    dr->options = *options;
    dr->hints = 0;
    dr->err = 0;
    has_init(&dr->has, handler);
#if HAS_ENCODING_SUPPORT
    if ('\0' != *ox_default_options.encoding) {
        dr->encoding = rb_enc_find(ox_default_options.encoding);
    } else {
        VALUE io_enc;

        dr->encoding = 0;
        if (rb_respond_to(io, ox_external_encoding_id)) {
            io_enc = rb_funcall(io, ox_external_encoding_id, 0);
            if (Qnil != io_enc) {
                int enc_idx = rb_enc_get_index(io_enc);

                if (0 <= enc_idx) {
                    dr->encoding = rb_enc_from_index(enc_idx);
                }
            }
        }
    }
#elif HAS_PRIVATE_ENCODING
    if ('\0' != *ox_default_options.encoding) {
        dr->encoding = rb_str_new2(ox_default_options.encoding);
    } else {
        VALUE io_enc = Qnil;

        if (rb_respond_to(io, ox_external_encoding_id)) {
            io_enc = rb_funcall(io, ox_external_encoding_id, 0);
        }
        /* Stays Qnil when io cannot report one or reports nil. */
        dr->encoding = io_enc;
    }
#else
    dr->encoding = 0;
#endif
}
// Verifies that `value` carries an encoding permitted for a field of `type`.
//
// String fields accept kRubyStringUtf8Encoding or kRubyStringASCIIEncoding;
// every other field type handled here accepts only kRubyString8bitEncoding.
// Raises TypeError naming the actual encoding when the check fails.
void native_slot_validate_string_encoding(upb_fieldtype_t type, VALUE value) {
  rb_encoding* actual = rb_enc_from_index(ENCODING_GET(value));
  const bool is_string_field = (type == UPB_TYPE_STRING);
  bool acceptable;

  if (is_string_field) {
    acceptable = (actual == kRubyStringUtf8Encoding) ||
                 (actual == kRubyStringASCIIEncoding);
  } else {
    acceptable = (actual == kRubyString8bitEncoding);
  }

  if (acceptable) {
    return;
  }

  rb_raise(rb_eTypeError,
           "Encoding for '%s' fields must be %s (was %s)",
           is_string_field ? "string" : "bytes",
           is_string_field ? "UTF-8 or ASCII" : "ASCII-8BIT",
           rb_enc_name(actual));
}
/*
 * In-place transcoding of str.
 *
 * Delegates to str_transcode(); a negative result means nothing had to be
 * converted and str is returned untouched.  Otherwise str takes over the
 * converted contents, is tagged with the target encoding index, and its
 * code range is set: scanned for ASCII-compatible targets, assumed valid
 * for the rest.
 */
static VALUE
str_encode_bang(int argc, VALUE *argv, VALUE str)
{
    VALUE converted = str;
    int target_idx = str_transcode(argc, argv, &converted);
    int coderange = 0;

    if (target_idx < 0) {
        return str;
    }

    rb_str_shared_replace(str, converted);
    rb_enc_associate_index(str, target_idx);

    /* A transcoded string is never broken. */
    if (!rb_enc_asciicompat(rb_enc_from_index(target_idx))) {
        coderange = ENC_CODERANGE_VALID;
    }
    else {
        rb_str_coderange_scan_restartable(RSTRING_PTR(str), RSTRING_END(str),
                                          0, &coderange);
    }
    ENC_CODERANGE_SET(str, coderange);
    return str;
}
/*
 * Looks up the index of the encoding called name.
 *
 * Unregistered names are handed to load_encoding() and its result is
 * returned.  A registered index with no encoding behind it raises
 * ArgumentError unless the index is UNSPECIFIED_ENCODING.  If the
 * encoding is flagged for autoload and loading fails, a warning is
 * issued and 0 is returned.  In the remaining cases the registered index
 * is returned.
 */
int
rb_enc_find_index(const char *name)
{
    const int registered = rb_enc_registered(name);
    rb_encoding *enc;

    if (registered < 0) {
        return load_encoding(name);
    }

    enc = rb_enc_from_index(registered);
    if (!enc) {
        if (registered != UNSPECIFIED_ENCODING) {
            rb_raise(rb_eArgError, "encoding %s is not registered", name);
        }
        return registered;
    }

    if (enc_autoload_p(enc) && enc_autoload(enc) < 0) {
        rb_warn("failed to load encoding (%s); use ASCII-8BIT instead",
                name);
        return 0;
    }
    return registered;
}
/* Returns the encoding found at the index given by rb_filesystem_encindex(). */
rb_encoding *
rb_filesystem_encoding(void)
{
    const int fs_idx = rb_filesystem_encindex();

    return rb_enc_from_index(fs_idx);
}
/* Returns the encoding found at the index given by rb_locale_encindex(). */
rb_encoding *
rb_locale_encoding(void)
{
    const int locale_idx = rb_locale_encindex();

    return rb_enc_from_index(locale_idx);
}
/*
 * Spec helper: converts the Ruby integer `index` to a C int, looks the
 * encoding up with rb_enc_from_index() and returns its name as a new
 * Ruby string.
 *
 * NOTE(review): the lookup result is dereferenced without a check, so an
 * index with no encoding behind it is presumably unsupported here.
 */
static VALUE
encoding_spec_rb_enc_from_index(VALUE self, VALUE index)
{
    rb_encoding *found = rb_enc_from_index(NUM2INT(index));

    return rb_str_new2(found->name);
}
void rb_encdb_set_unicode(int index) { rb_enc_from_index(index)->flags |= ONIGENC_FLAG_UNICODE; }
/*
 * Transcodes the string *self according to the arguments of String#encode.
 *
 * argc/argv - 1 or 2 positional arguments (target encoding, optional
 *             source encoding), optionally followed by an options Hash.
 *             When an options Hash is present its :invalid entry must be
 *             non-nil; the value :ignore turns on INVALID_IGNORE.
 * self      - in: the string to convert; out: replaced by the converted
 *             temporary string, but only when a conversion actually ran.
 *
 * Returns -1 when no conversion is needed (same encoding object, or equal
 * encoding names); *self is left alone.  Returns the target encoding index
 * without converting when both encodings are ASCII compatible and the
 * string's code range is 7bit.  Otherwise runs one or more transcoding
 * steps until the target name is reached, stores the result in *self and
 * returns the target encoding index (a dummy encoding is defined when the
 * target was given by a name that did not resolve to an index).
 *
 * Raises ArgumentError for a wrong argument count, a nil :invalid option,
 * an unsupported conversion, or input that was not fully consumed.
 */
static int
str_transcode(int argc, VALUE *argv, VALUE *self)
{
    VALUE dest;
    VALUE str = *self;
    long blen, slen;
    unsigned char *buf, *bp, *sp, *fromp;
    rb_encoding *from_enc, *to_enc;
    const char *from_e, *to_e;
    int from_encidx, to_encidx;
    VALUE from_encval, to_encval;
    const rb_transcoder *my_transcoder;
    rb_transcoding my_transcoding;
    int final_encoding = 0;
    VALUE opt = Qnil;
    int options = 0;

    /* Only look for a trailing options hash when there is a last argument:
     * with argc == 0, argv[argc-1] would read before the start of argv.
     * A zero-argument call then falls through to the count check below. */
    if (argc > 0) {
        opt = rb_check_convert_type(argv[argc-1], T_HASH, "Hash", "to_hash");
    }
    if (!NIL_P(opt)) {
        VALUE v;

        argc--;
        v = rb_hash_aref(opt, sym_invalid);
        if (NIL_P(v)) {
            rb_raise(rb_eArgError, "unknown value for invalid: setting");
        }
        else if (v==sym_ignore) {
            options |= INVALID_IGNORE;
        }
    }
    if (argc < 1 || argc > 2) {
        rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
    }

    /* Target encoding: either a known encoding or just a name. */
    if ((to_encidx = rb_to_encoding_index(to_encval = argv[0])) < 0) {
        to_enc = 0;
        to_encidx = 0;
        to_e = StringValueCStr(to_encval);
    }
    else {
        to_enc = rb_enc_from_index(to_encidx);
        to_e = rb_enc_name(to_enc);
    }

    /* Source encoding: the string's own unless given explicitly. */
    if (argc==1) {
        from_encidx = rb_enc_get_index(str);
        from_enc = rb_enc_from_index(from_encidx);
        from_e = rb_enc_name(from_enc);
    }
    else if ((from_encidx = rb_to_encoding_index(from_encval = argv[1])) < 0) {
        from_enc = 0;
        from_e = StringValueCStr(from_encval);
    }
    else {
        from_enc = rb_enc_from_index(from_encidx);
        from_e = rb_enc_name(from_enc);
    }

    if (from_enc && from_enc == to_enc) {
        return -1;
    }
    if (from_enc && to_enc && rb_enc_asciicompat(from_enc) && rb_enc_asciicompat(to_enc)) {
        /* 7bit content needs no byte-level conversion; the caller only
         * has to re-tag the string. */
        if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
            return to_encidx;
        }
    }
    if (encoding_equal(from_e, to_e)) {
        return -1;
    }

    do { /* loop for multistep transcoding */
        /* later, maybe use smaller intermediate strings for very long strings */
        if (!(my_transcoder = transcode_dispatch(from_e, to_e))) {
            rb_raise(rb_eArgError, "transcoding not supported (from %s to %s)", from_e, to_e);
        }

        my_transcoding.transcoder = my_transcoder;

        if (my_transcoder->preprocessor) {
            fromp = sp = (unsigned char *)RSTRING_PTR(str);
            slen = RSTRING_LEN(str);
            blen = slen + 30; /* len + margin */
            dest = rb_str_tmp_new(blen);
            bp = (unsigned char *)RSTRING_PTR(dest);
            my_transcoding.ruby_string_dest = dest;
            (*my_transcoder->preprocessor)(&fromp, &bp, (sp+slen), (bp+blen), &my_transcoding);
            if (fromp != sp+slen) {
                rb_raise(rb_eArgError, "not fully converted, %td bytes left", sp+slen-fromp);
            }
            buf = (unsigned char *)RSTRING_PTR(dest);
            *bp = '\0';
            rb_str_set_len(dest, bp - buf);
            str = dest;
        }

        /* Main conversion step. */
        fromp = sp = (unsigned char *)RSTRING_PTR(str);
        slen = RSTRING_LEN(str);
        blen = slen + 30; /* len + margin */
        dest = rb_str_tmp_new(blen);
        bp = (unsigned char *)RSTRING_PTR(dest);
        my_transcoding.ruby_string_dest = dest;
        my_transcoding.flush_func = str_transcoding_resize;

        transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), my_transcoder, &my_transcoding, options);
        if (fromp != sp+slen) {
            rb_raise(rb_eArgError, "not fully converted, %td bytes left", sp+slen-fromp);
        }
        buf = (unsigned char *)RSTRING_PTR(dest);
        *bp = '\0';
        rb_str_set_len(dest, bp - buf);

        if (my_transcoder->postprocessor) {
            str = dest;
            fromp = sp = (unsigned char *)RSTRING_PTR(str);
            slen = RSTRING_LEN(str);
            blen = slen + 30; /* len + margin */
            dest = rb_str_tmp_new(blen);
            bp = (unsigned char *)RSTRING_PTR(dest);
            my_transcoding.ruby_string_dest = dest;
            (*my_transcoder->postprocessor)(&fromp, &bp, (sp+slen), (bp+blen), &my_transcoding);
            if (fromp != sp+slen) {
                rb_raise(rb_eArgError, "not fully converted, %td bytes left", sp+slen-fromp);
            }
            buf = (unsigned char *)RSTRING_PTR(dest);
            *bp = '\0';
            rb_str_set_len(dest, bp - buf);
        }

        if (encoding_equal(my_transcoder->to_encoding, to_e)) {
            final_encoding = 1;
        }
        else {
            /* Intermediate result: continue from the encoding just reached. */
            from_e = my_transcoder->to_encoding;
            str = dest;
        }
    } while (!final_encoding);

    /* set encoding */
    if (!to_enc) {
        to_encidx = rb_define_dummy_encoding(to_e);
    }
    *self = dest;

    return to_encidx;
}
/* Returns the encoding found at the index that rb_enc_get_index() reports for obj. */
rb_encoding *
rb_enc_get(VALUE obj)
{
    const int obj_idx = rb_enc_get_index(obj);

    return rb_enc_from_index(obj_idx);
}
void rb_encdb_set_unicode(int index) { ((rb_raw_encoding *)rb_enc_from_index(index))->flags |= ONIGENC_FLAG_UNICODE; }
/*
 * Parses the CSV text held in `string` and returns an array of rows, each
 * row being an array of the cell values produced by
 * bamfcsv_finalize_cell().
 *
 * The text is scanned one byte at a time.  `quote_count` counts the '"'
 * bytes seen in the current cell; separators (',' and '\n') are only
 * acted upon while that count is even.  Cells that contained quotes get
 * `gsub!("\"\"", "\"")` applied to the finalized value.
 *
 * Raises BAMFCSV_MalformedCSVError_class when a quote appears after the
 * first byte of an unquoted cell, when a quoted cell is not closed before
 * its separator, or when the input ends inside a quoted cell.
 *
 * NOTE(review): a trailing '\n' is overwritten with 0 directly in the
 * buffer returned by RSTRING_PTR(string), i.e. the caller's string is
 * modified in place -- confirm callers expect that.
 */
VALUE bamfcsv_parse_string(VALUE self, VALUE string) {
  char *buf = RSTRING_PTR(string);
  long bufsize = RSTRING_LEN(string);
  rb_encoding *enc = rb_enc_from_index(ENCODING_GET(string));
  /* Both counters are used only for error messages and the blank-line test. */
  unsigned long num_rows = 1, cell_count = 1;
  int quote_count = 0, quotes_matched = 1;
  VALUE matrix = rb_ary_new();
  VALUE row = rb_ary_new();
  char *cur = buf, *cell_start = buf;

  /* Drop a single trailing newline so the last line is handled after the loop. */
  if (bufsize > 0 && *(buf+bufsize-1) == '\n') {
    *(buf+bufsize-1) = 0;
    --bufsize;
  }

  /* Arguments for the gsub! call that collapses doubled quotes. */
  VALUE dbl_dquote = rb_str_new("\"\"", 2), dquote = rb_str_new("\"", 1);
  ID gsub_bang = rb_intern("gsub!");

  for (; cur < buf+bufsize; cur++) {
    if (*cur == '"') {
      if (0 == quote_count && cell_start != cur) /* Quotes begin past opening of cell */
        rb_raise(BAMFCSV_MalformedCSVError_class, "Illegal quoting on line %lu, cell %lu: Quoted cell must open with '\"'", num_rows, cell_count);
      else
        ++quote_count;
    }
    quotes_matched = !(quote_count & 1); /* count is even */
    if (quotes_matched) {
      if (*cur == ',') {
        /* End of cell: a quoted cell must have its closing quote right before the comma. */
        if (quote_count && *(cur-1) != '"')
          rb_raise(BAMFCSV_MalformedCSVError_class, "Unclosed quoted field on line %lu, cell %lu.", num_rows, cell_count);
        VALUE cell_str = bamfcsv_finalize_cell(cell_start, cur-1, quote_count, enc);
        if (quote_count)
          rb_funcall(cell_str, gsub_bang, 2, dbl_dquote, dquote);
        rb_ary_push(row, cell_str);
        cell_start = cur+1;
        quote_count = 0;
        ++cell_count;
      } else if (*cur == '\n') {
        /* End of row: finish the last cell, store the row, start a new one. */
        if (quote_count && !quotes_end_line(cur))
          rb_raise(BAMFCSV_MalformedCSVError_class, "Unclosed quoted field on line %lu, cell %lu: EOL", num_rows, cell_count);
        VALUE cell_str = bamfcsv_finalize_cell(cell_start, cur-1, quote_count, enc);
        if (quote_count)
          rb_funcall(cell_str, gsub_bang, 2, dbl_dquote, dquote);
        /* Completely blank lines don't even get a nil. This matches CSV's behavior. */
        if (cell_count > 1 || cell_str != Qnil)
          rb_ary_push(row, cell_str);
        rb_ary_push(matrix, row);
        row = rb_ary_new();
        cell_start = cur+1;
        quote_count = 0;
        ++num_rows;
        /* NOTE(review): reset to 0 here although the first row starts at 1 -- confirm intended. */
        cell_count = 0;
      } else if (quote_count && *cur != '\r' && *cur != '"')
        /* Quotes are balanced but ordinary data follows inside the same cell. */
        rb_raise(BAMFCSV_MalformedCSVError_class, "Illegal quoting on line %lu, cell %lu", num_rows, cell_count);
    }
  }

  if (!quotes_matched) /* Reached EOF without matching quotes */
    rb_raise(BAMFCSV_MalformedCSVError_class, "Illegal quoting on line %lu, cell %lu: File ends without closing '\"'", num_rows, cell_count);
  else if (quote_count && !quotes_end_line(cur)) /* Quotes closed before end of final cell */
    rb_raise(BAMFCSV_MalformedCSVError_class, "Unclosed quoted field on line %lu, cell %lu: EOF", num_rows, cell_count);

  /* Flush the final cell and row, which have no terminating newline left. */
  VALUE cell_str = bamfcsv_finalize_cell(cell_start, cur-1, quote_count, enc);
  if (quote_count)
    rb_funcall(cell_str, gsub_bang, 2, dbl_dquote, dquote);
  /* Completely blank lines don't even get a nil. This matches CSV's behavior. */
  if (cell_count > 1 || cell_str != Qnil)
    rb_ary_push(row, cell_str);
  rb_ary_push(matrix, row);
  return matrix;
}
rb_encoding* rb_enc_compatible(VALUE str1, VALUE str2) { int idx1, idx2; rb_encoding *enc1, *enc2; idx1 = rb_enc_get_index(str1); idx2 = rb_enc_get_index(str2); if (idx1 < 0 || idx2 < 0) return 0; if (idx1 == idx2) { return rb_enc_from_index(idx1); } enc1 = rb_enc_from_index(idx1); enc2 = rb_enc_from_index(idx2); if (TYPE(str2) == T_STRING && RSTRING_LEN(str2) == 0) return (idx1 == ENCINDEX_US_ASCII && rb_enc_asciicompat(enc2)) ? enc2 : enc1; if (TYPE(str1) == T_STRING && RSTRING_LEN(str1) == 0) return (idx2 == ENCINDEX_US_ASCII && rb_enc_asciicompat(enc1)) ? enc1 : enc2; if (!rb_enc_asciicompat(enc1) || !rb_enc_asciicompat(enc2)) { return 0; } /* objects whose encoding is the same of contents */ if (BUILTIN_TYPE(str2) != T_STRING && idx2 == ENCINDEX_US_ASCII) return enc1; if (BUILTIN_TYPE(str1) != T_STRING && idx1 == ENCINDEX_US_ASCII) return enc2; if (BUILTIN_TYPE(str1) != T_STRING) { VALUE tmp = str1; int idx0 = idx1; str1 = str2; str2 = tmp; idx1 = idx2; idx2 = idx0; } if (BUILTIN_TYPE(str1) == T_STRING) { int cr1, cr2; cr1 = rb_enc_str_coderange(str1); if (BUILTIN_TYPE(str2) == T_STRING) { cr2 = rb_enc_str_coderange(str2); if (cr1 != cr2) { /* may need to handle ENC_CODERANGE_BROKEN */ if (cr1 == ENC_CODERANGE_7BIT) return enc2; if (cr2 == ENC_CODERANGE_7BIT) return enc1; } if (cr2 == ENC_CODERANGE_7BIT) { if (idx1 == ENCINDEX_ASCII) return enc2; return enc1; } } if (cr1 == ENC_CODERANGE_7BIT) return enc2; } return 0; }
/* Resolves enc to an index with str_to_encindex() and returns the encoding stored there. */
static rb_encoding *
str_to_encoding(VALUE enc)
{
    const int resolved_idx = str_to_encindex(enc);

    return rb_enc_from_index(resolved_idx);
}