*/ REBSER *Temp_Bin_Str_Managed(REBVAL *val, REBCNT *index, REBCNT *length) /* ** Determines if UTF8 conversion is needed for a series before it ** is used with a byte-oriented function. ** ** If conversion is needed, a UTF8 series will be created. Otherwise, ** the source series is returned as-is. ** ** Note: This routine should only be used to generate a value used ** for temporary purposes, because it has a "surprising variance" ** regarding its input. If the value's series can be reused, it is-- ** and this depends on an implementation detail of internal encoding ** that the user should not be aware of (they need not know if the ** internal representation of an ASCII string uses 1, 2, or however ** many bytes). But copying vs. non-copying means the resulting ** data might or might not have previous values available to step ** back into from the originating series! ** ** !!! Should performance dictate it, the callsites could be ** adapted to know whether this produced a new series or not, and ** instead of managing a created result they could be responsible ** for freeing it if so. ** ***********************************************************************/ { REBCNT len = (length && *length) ? *length : VAL_LEN(val); REBSER *series; assert(IS_BINARY(val) || ANY_STR(val)); if (len == 0 || IS_BINARY(val) || VAL_STR_IS_ASCII(val)) { // If it's zero length, BINARY!, or an ANY-STRING! whose bytes are // all values less than 128, we reuse the series. series = VAL_SERIES(val); ASSERT_SERIES_MANAGED(series); if (index) *index = VAL_INDEX(val); if (length) *length = len; } else { // UTF-8 conversion is required, and we manage the result. series = Make_UTF8_From_Any_String(val, len, OPT_ENC_CRLF_MAYBE); MANAGE_SERIES(series); if (index) *index = 0; if (length) *length = SERIES_TAIL(series); } return series; }
// // Modify_String: C // // Returns new dst_idx. // REBCNT Modify_String( REBCNT action, // INSERT, APPEND, CHANGE REBSER *dst_ser, // target REBCNT dst_idx, // position const REBVAL *src_val, // source REBFLGS flags, // AN_PART REBINT dst_len, // length to remove REBINT dups // dup count ) { REBSER *src_ser = 0; REBCNT src_idx = 0; REBCNT src_len; REBCNT tail = SER_LEN(dst_ser); REBINT size; // total to insert REBOOL needs_free; REBINT limit; // For INSERT/PART and APPEND/PART if (action != SYM_CHANGE && GET_FLAG(flags, AN_PART)) limit = dst_len; // should be non-negative else limit = -1; if (limit == 0 || dups < 0) return (action == SYM_APPEND) ? 0 : dst_idx; if (action == SYM_APPEND || dst_idx > tail) dst_idx = tail; // If the src_val is not a string, then we need to create a string: if (GET_FLAG(flags, AN_SERIES)) { // used to indicate a BINARY series if (IS_INTEGER(src_val)) { src_ser = Make_Series_Codepoint(Int8u(src_val)); needs_free = TRUE; limit = -1; } else if (IS_BLOCK(src_val)) { src_ser = Join_Binary(src_val, limit); // NOTE: it's the shared FORM buffer! needs_free = FALSE; limit = -1; } else if (IS_CHAR(src_val)) { // // "UTF-8 was originally specified to allow codepoints with up to // 31 bits (or 6 bytes). But with RFC3629, this was reduced to 4 // bytes max. to be more compatible to UTF-16." So depending on // which RFC you consider "the UTF-8", max size is either 4 or 6. // src_ser = Make_Binary(6); SET_SERIES_LEN( src_ser, Encode_UTF8_Char(BIN_HEAD(src_ser), VAL_CHAR(src_val)) ); needs_free = TRUE; limit = -1; } else if (ANY_STRING(src_val)) { src_len = VAL_LEN_AT(src_val); if (limit >= 0 && src_len > cast(REBCNT, limit)) src_len = limit; src_ser = Make_UTF8_From_Any_String(src_val, src_len, 0); needs_free = TRUE; limit = -1; } else if (!IS_BINARY(src_val)) fail (Error_Invalid_Arg(src_val)); } else if (IS_CHAR(src_val)) { src_ser = Make_Series_Codepoint(VAL_CHAR(src_val)); needs_free = TRUE; } else if (IS_BLOCK(src_val)) { src_ser = Form_Tight_Block(src_val); needs_free = TRUE; } else if (!ANY_STRING(src_val) || IS_TAG(src_val)) { src_ser = Copy_Form_Value(src_val, 0); needs_free = TRUE; } // Use either new src or the one that was passed: if (src_ser) { src_len = SER_LEN(src_ser); } else { src_ser = VAL_SERIES(src_val); src_idx = VAL_INDEX(src_val); src_len = VAL_LEN_AT(src_val); needs_free = FALSE; } if (limit >= 0) src_len = limit; // If Source == Destination we need to prevent possible conflicts. // Clone the argument just to be safe. // (Note: It may be possible to optimize special cases like append !!) if (dst_ser == src_ser) { assert(!needs_free); src_ser = Copy_Sequence_At_Len(src_ser, src_idx, src_len); needs_free = TRUE; src_idx = 0; } // Total to insert: size = dups * src_len; if (action != SYM_CHANGE) { // Always expand dst_ser for INSERT and APPEND actions: Expand_Series(dst_ser, dst_idx, size); } else { if (size > dst_len) Expand_Series(dst_ser, dst_idx, size - dst_len); else if (size < dst_len && GET_FLAG(flags, AN_PART)) Remove_Series(dst_ser, dst_idx, dst_len - size); else if (size + dst_idx > tail) { EXPAND_SERIES_TAIL(dst_ser, size - (tail - dst_idx)); } } // For dup count: for (; dups > 0; dups--) { Insert_String(dst_ser, dst_idx, src_ser, src_idx, src_len, TRUE); dst_idx += src_len; } TERM_SEQUENCE(dst_ser); if (needs_free) { // If we did not use the series that was passed in, but rather // created an internal temporary one, we need to free it. Free_Series(src_ser); } return (action == SYM_APPEND) ? 0 : dst_idx; }
// // Temp_Bin_Str_Managed: C // // Determines if UTF8 conversion is needed for a series before it // is used with a byte-oriented function. // // If conversion is needed, a UTF8 series will be created. Otherwise, // the source series is returned as-is. // // Note: This routine should only be used to generate a value used // for temporary purposes, because it has a "surprising variance" // regarding its input. If the value's series can be reused, it is-- // and this depends on an implementation detail of internal encoding // that the user should not be aware of (they need not know if the // internal representation of an ASCII string uses 1, 2, or however // many bytes). But copying vs. non-copying means the resulting // data might or might not have previous values available to step // back into from the originating series! // // !!! Should performance dictate it, the callsites could be // adapted to know whether this produced a new series or not, and // instead of managing a created result they could be responsible // for freeing it if so. // REBSER *Temp_Bin_Str_Managed(const REBVAL *val, REBCNT *index, REBCNT *length) { REBCNT len = (length && *length) ? *length : VAL_LEN_AT(val); REBSER *series; assert(IS_BINARY(val) || ANY_STRING(val)); // !!! This used to check `len == 0` and reuse a zero length string. // However, the zero length string could have the wrong width. We are // expected to be returning a BYTE_SIZE() string, and that confused // things. It's not a good idea to mutate the source string (e.g. // reallocate under a new width) so consider having an EMPTY_BYTE_STRING // like EMPTY_ARRAY which is protected to hand back. // if ( IS_BINARY(val) || ( VAL_BYTE_SIZE(val) && All_Bytes_ASCII(VAL_BIN_AT(val), VAL_LEN_AT(val)) ) ){ // // It's BINARY!, or an ANY-STRING! whose codepoints are all values in // ASCII (0x00 => 0x7F), hence not needing any UTF-8 encoding. // series = VAL_SERIES(val); ASSERT_SERIES_MANAGED(series); if (index) *index = VAL_INDEX(val); if (length) *length = len; } else { // UTF-8 conversion is required, and we manage the result. series = Make_UTF8_From_Any_String(val, len, OPT_ENC_CRLF_MAYBE); MANAGE_SERIES(series); #if !defined(NDEBUG) // // Also, PROTECT the result in the debug build...because since the // caller doesn't know if a new series was created or if the initial // data is being used, they should not be modifying it! (We don't // want to protect the original data, because we wouldn't know when // we were allowed to unlock it...there's no later call in this // model to clean up the series.) { REBVAL protect; Val_Init_String(&protect, series); Protect_Value(&protect, FLAGIT(PROT_SET)); // just a string...not /DEEP...shouldn't need to Unmark() } #endif if (index) *index = 0; if (length) *length = SER_LEN(series); } assert(BYTE_SIZE(series)); return series; }
static REBSER *make_binary(const REBVAL *arg, REBOOL make) { REBSER *ser; // MAKE BINARY! 123 switch (VAL_TYPE(arg)) { case REB_INTEGER: case REB_DECIMAL: if (make) ser = Make_Binary(Int32s(arg, 0)); else ser = Make_Binary_BE64(arg); break; // MAKE/TO BINARY! BINARY! case REB_BINARY: ser = Copy_Bytes(VAL_BIN_AT(arg), VAL_LEN_AT(arg)); break; // MAKE/TO BINARY! <any-string> case REB_STRING: case REB_FILE: case REB_EMAIL: case REB_URL: case REB_TAG: // case REB_ISSUE: ser = Make_UTF8_From_Any_String(arg, VAL_LEN_AT(arg), 0); break; case REB_BLOCK: // Join_Binary returns a shared buffer, so produce a copy: ser = Copy_Sequence(Join_Binary(arg, -1)); break; // MAKE/TO BINARY! <tuple!> case REB_TUPLE: ser = Copy_Bytes(VAL_TUPLE(arg), VAL_TUPLE_LEN(arg)); break; // MAKE/TO BINARY! <char!> case REB_CHAR: ser = Make_Binary(6); TERM_SEQUENCE_LEN(ser, Encode_UTF8_Char(BIN_HEAD(ser), VAL_CHAR(arg))); break; // MAKE/TO BINARY! <bitset!> case REB_BITSET: ser = Copy_Bytes(VAL_BIN(arg), VAL_LEN_HEAD(arg)); break; // MAKE/TO BINARY! <image!> case REB_IMAGE: ser = Make_Image_Binary(arg); break; case REB_MONEY: ser = Make_Binary(12); deci_to_binary(BIN_HEAD(ser), VAL_MONEY_AMOUNT(arg)); TERM_SEQUENCE_LEN(ser, 12); break; default: ser = 0; } return ser; }