// // Temp_Byte_Chars_May_Fail: C // // NOTE: This function returns a temporary result, and uses an internal // buffer. Do not use it recursively. Also, it will Trap on errors. // // Prequalifies a string before using it with a function that // expects it to be 8-bits. It would be used for instance to convert // a string that is potentially REBUNI-wide into a form that can be used // with a Scan_XXX routine, that is expecting ASCII or UTF-8 source. // (Many TO-XXX conversions from STRING re-use that scanner logic.) // // Returns a temporary string and sets the length field. // // If `allow_utf8`, the constructed result is converted to UTF8. // // Checks or converts it: // // 1. it is byte string (not unicode) // 2. if unicode, copy and return as temp byte string // 3. it's actual content (less space, newlines) <= max len // 4. it does not contain other values ("123 456") // 5. it's not empty or only whitespace // REBYTE *Temp_Byte_Chars_May_Fail( const REBVAL *val, REBINT max_len, REBCNT *length, REBOOL allow_utf8 ) { REBCNT tail = VAL_LEN_HEAD(val); REBCNT index = VAL_INDEX(val); REBCNT len; REBUNI c; REBYTE *bp; REBSER *src = VAL_SERIES(val); if (index > tail) fail (Error(RE_PAST_END)); Resize_Series(BYTE_BUF, max_len+1); bp = BIN_HEAD(BYTE_BUF); // Skip leading whitespace: for (; index < tail; index++) { c = GET_ANY_CHAR(src, index); if (!IS_SPACE(c)) break; } // Copy chars that are valid: for (; index < tail; index++) { c = GET_ANY_CHAR(src, index); if (c >= 0x80) { if (!allow_utf8) fail (Error(RE_INVALID_CHARS)); len = Encode_UTF8_Char(bp, c); max_len -= len; bp += len; } else if (!IS_SPACE(c)) { *bp++ = (REBYTE)c; max_len--; } else break; if (max_len < 0) fail (Error(RE_TOO_LONG)); } // Rest better be just spaces: for (; index < tail; index++) { c = GET_ANY_CHAR(src, index); if (!IS_SPACE(c)) fail (Error(RE_INVALID_CHARS)); } *bp = '\0'; len = bp - BIN_HEAD(BYTE_BUF); if (len == 0) fail (Error(RE_TOO_SHORT)); if (length) *length = len; return 
BIN_HEAD(BYTE_BUF); }
*/ REBCNT Encode_UTF8(REBYTE *dst, REBINT max, void *src, REBCNT *len, REBFLG uni, REBFLG ccr) /* ** Encode the unicode into UTF8 byte string. ** ** Source string can be byte or unichar sized (uni = TRUE); ** Max is the maximum size of the result (UTF8). ** Returns number of source chars used. ** Updates len for dst bytes used. ** Does not add a terminator. ** ***********************************************************************/ { REBUNI c; REBINT n; REBYTE buf[8]; REBYTE *bs = dst; // save start REBYTE *bp = (REBYTE*)src; REBUNI *up = (REBUNI*)src; REBCNT cnt; if (len) cnt = *len; else { cnt = (REBCNT)(uni ? wcslen((const wchar_t*)bp) : LEN_BYTES((REBYTE*)bp)); } for (; max > 0 && cnt > 0; cnt--) { c = uni ? *up++ : *bp++; if (c < 0x80) { #if defined(TO_WINDOWS) if (ccr && c == LF) { // If there's not room, don't try to output CRLF if (2 > max) {up--; break;} *dst++ = CR; max--; c = LF; } #endif *dst++ = (REBYTE)c; max--; } else { n = Encode_UTF8_Char(buf, c); if (n > max) {up--; break;} memcpy(dst, buf, n); dst += n; max -= n; } } if (len) *len = dst - bs; return uni ? up - (REBUNI*)src : bp - (REBYTE*)src; }
*/ int Encode_UTF8_Line(REBSER *dst, REBSER *src, REBCNT idx) /* ** Encode a unicode source buffer into a binary line of UTF8. ** Include the LF terminator in the result. ** Return the length of the line buffer. ** ***********************************************************************/ { REBUNI *up = UNI_HEAD(src); REBCNT len = SERIES_TAIL(src); REBCNT tail; REBUNI c; REBINT n; REBYTE buf[8]; tail = RESET_TAIL(dst); while (idx < len) { if ((c = up[idx]) < 0x80) { EXPAND_SERIES_TAIL(dst, 1); BIN_HEAD(dst)[tail++] = (REBYTE)c; } else { n = Encode_UTF8_Char(buf, c); EXPAND_SERIES_TAIL(dst, n); memcpy(BIN_SKIP(dst, tail), buf, n); tail += n; } idx++; if (c == LF) break; } BIN_HEAD(dst)[tail] = 0; SERIES_TAIL(dst) = tail; return idx; }
//
// Build a new binary series from `arg` for MAKE BINARY! (make is TRUE)
// or TO BINARY! (make is FALSE).
//
// Returns the new series, or 0 when the type of `arg` is not handled.
//
static REBSER *make_binary(REBVAL *arg, REBOOL make)
{
	REBSER *ser;

	// MAKE BINARY! 123
	switch (VAL_TYPE(arg)) {
	case REB_INTEGER:
	case REB_DECIMAL:
		if (make) ser = Make_Binary(Int32s(arg, 0));
		else ser = Make_Binary_BE64(arg);
		break;

	// MAKE/TO BINARY! BINARY!
	case REB_BINARY:
		ser = Copy_Bytes(VAL_BIN_DATA(arg), VAL_LEN(arg));
		break;

	// MAKE/TO BINARY! <any-string>
	case REB_STRING:
	case REB_FILE:
	case REB_EMAIL:
	case REB_URL:
	case REB_TAG:
//	case REB_ISSUE:
		ser = Encode_UTF8_Value(arg, VAL_LEN(arg), 0);
		break;

	case REB_BLOCK:
		// Join_Binary returns a shared buffer, so produce a copy
		// rather than handing the shared buffer out as the result:
		ser = Join_Binary(arg);
		ser = Copy_Bytes(BIN_HEAD(ser), ser->tail);
		break;

	// MAKE/TO BINARY! <tuple!>
	case REB_TUPLE:
		ser = Copy_Bytes(VAL_TUPLE(arg), VAL_TUPLE_LEN(arg));
		break;

	// MAKE/TO BINARY! <char!>
	case REB_CHAR:
		ser = Make_Binary(6);
		ser->tail = Encode_UTF8_Char(BIN_HEAD(ser), VAL_CHAR(arg));
		break;

	// MAKE/TO BINARY! <bitset!>
	case REB_BITSET:
		ser = Copy_Bytes(VAL_BIN(arg), VAL_TAIL(arg));
		break;

	// MAKE/TO BINARY! <image!>
	case REB_IMAGE:
		ser = Make_Image_Binary(arg);
		break;

	case REB_MONEY:
		ser = Make_Binary(12);
		ser->tail = 12;
		deci_to_binary(ser->data, VAL_DECI(arg));
		ser->data[12] = 0;
		break;

	default:
		ser = 0;
	}

	return ser;
}
// // Modify_String: C // // Returns new dst_idx. // REBCNT Modify_String( REBCNT action, // INSERT, APPEND, CHANGE REBSER *dst_ser, // target REBCNT dst_idx, // position const REBVAL *src_val, // source REBFLGS flags, // AN_PART REBINT dst_len, // length to remove REBINT dups // dup count ) { REBSER *src_ser = 0; REBCNT src_idx = 0; REBCNT src_len; REBCNT tail = SER_LEN(dst_ser); REBINT size; // total to insert REBOOL needs_free; REBINT limit; // For INSERT/PART and APPEND/PART if (action != SYM_CHANGE && GET_FLAG(flags, AN_PART)) limit = dst_len; // should be non-negative else limit = -1; if (limit == 0 || dups < 0) return (action == SYM_APPEND) ? 0 : dst_idx; if (action == SYM_APPEND || dst_idx > tail) dst_idx = tail; // If the src_val is not a string, then we need to create a string: if (GET_FLAG(flags, AN_SERIES)) { // used to indicate a BINARY series if (IS_INTEGER(src_val)) { src_ser = Make_Series_Codepoint(Int8u(src_val)); needs_free = TRUE; limit = -1; } else if (IS_BLOCK(src_val)) { src_ser = Join_Binary(src_val, limit); // NOTE: it's the shared FORM buffer! needs_free = FALSE; limit = -1; } else if (IS_CHAR(src_val)) { // // "UTF-8 was originally specified to allow codepoints with up to // 31 bits (or 6 bytes). But with RFC3629, this was reduced to 4 // bytes max. to be more compatible to UTF-16." So depending on // which RFC you consider "the UTF-8", max size is either 4 or 6. 
// src_ser = Make_Binary(6); SET_SERIES_LEN( src_ser, Encode_UTF8_Char(BIN_HEAD(src_ser), VAL_CHAR(src_val)) ); needs_free = TRUE; limit = -1; } else if (ANY_STRING(src_val)) { src_len = VAL_LEN_AT(src_val); if (limit >= 0 && src_len > cast(REBCNT, limit)) src_len = limit; src_ser = Make_UTF8_From_Any_String(src_val, src_len, 0); needs_free = TRUE; limit = -1; } else if (!IS_BINARY(src_val)) fail (Error_Invalid_Arg(src_val)); } else if (IS_CHAR(src_val)) { src_ser = Make_Series_Codepoint(VAL_CHAR(src_val)); needs_free = TRUE; } else if (IS_BLOCK(src_val)) { src_ser = Form_Tight_Block(src_val); needs_free = TRUE; } else if (!ANY_STRING(src_val) || IS_TAG(src_val)) { src_ser = Copy_Form_Value(src_val, 0); needs_free = TRUE; } // Use either new src or the one that was passed: if (src_ser) { src_len = SER_LEN(src_ser); } else { src_ser = VAL_SERIES(src_val); src_idx = VAL_INDEX(src_val); src_len = VAL_LEN_AT(src_val); needs_free = FALSE; } if (limit >= 0) src_len = limit; // If Source == Destination we need to prevent possible conflicts. // Clone the argument just to be safe. // (Note: It may be possible to optimize special cases like append !!) 
if (dst_ser == src_ser) { assert(!needs_free); src_ser = Copy_Sequence_At_Len(src_ser, src_idx, src_len); needs_free = TRUE; src_idx = 0; } // Total to insert: size = dups * src_len; if (action != SYM_CHANGE) { // Always expand dst_ser for INSERT and APPEND actions: Expand_Series(dst_ser, dst_idx, size); } else { if (size > dst_len) Expand_Series(dst_ser, dst_idx, size - dst_len); else if (size < dst_len && GET_FLAG(flags, AN_PART)) Remove_Series(dst_ser, dst_idx, dst_len - size); else if (size + dst_idx > tail) { EXPAND_SERIES_TAIL(dst_ser, size - (tail - dst_idx)); } } // For dup count: for (; dups > 0; dups--) { Insert_String(dst_ser, dst_idx, src_ser, src_idx, src_len, TRUE); dst_idx += src_len; } TERM_SEQUENCE(dst_ser); if (needs_free) { // If we did not use the series that was passed in, but rather // created an internal temporary one, we need to free it. Free_Series(src_ser); } return (action == SYM_APPEND) ? 0 : dst_idx; }
*/ REBCNT Modify_String(REBCNT action, REBSER *dst_ser, REBCNT dst_idx, const REBVAL *src_val, REBCNT flags, REBINT dst_len, REBINT dups) /* ** action: INSERT, APPEND, CHANGE ** ** dst_ser: target ** dst_idx: position ** src_val: source ** flags: AN_PART ** dst_len: length to remove ** dups: dup count ** ** return: new dst_idx ** ***********************************************************************/ { REBSER *src_ser = 0; REBCNT src_idx = 0; REBCNT src_len; REBCNT tail = SERIES_TAIL(dst_ser); REBINT size; // total to insert if (dups < 0) return (action == A_APPEND) ? 0 : dst_idx; if (action == A_APPEND || dst_idx > tail) dst_idx = tail; // If the src_val is not a string, then we need to create a string: if (GET_FLAG(flags, AN_SERIES)) { // used to indicate a BINARY series if (IS_INTEGER(src_val)) { src_ser = Append_Byte(0, Int8u(src_val)); // creates a binary } else if (IS_BLOCK(src_val)) { src_ser = Join_Binary(src_val); // NOTE: it's the shared FORM buffer! } else if (IS_CHAR(src_val)) { src_ser = Make_Binary(6); // (I hate unicode) src_ser->tail = Encode_UTF8_Char(BIN_HEAD(src_ser), VAL_CHAR(src_val)); } else if (!ANY_BINSTR(src_val)) Trap_Arg_DEAD_END(src_val); } else if (IS_CHAR(src_val)) { src_ser = Append_Byte(0, VAL_CHAR(src_val)); // unicode ok too } else if (IS_BLOCK(src_val)) { src_ser = Form_Tight_Block(src_val); } else if (!ANY_STR(src_val) || IS_TAG(src_val)) { src_ser = Copy_Form_Value(src_val, 0); } // Use either new src or the one that was passed: if (src_ser) { src_len = SERIES_TAIL(src_ser); } else { src_ser = VAL_SERIES(src_val); src_idx = VAL_INDEX(src_val); src_len = VAL_LEN(src_val); } // For INSERT or APPEND with /PART use the dst_len not src_len: if (action != A_CHANGE && GET_FLAG(flags, AN_PART)) src_len = dst_len; // If Source == Destination we need to prevent possible conflicts. // Clone the argument just to be safe. // (Note: It may be possible to optimize special cases like append !!) 
if (dst_ser == src_ser) { src_ser = Copy_Series_Part(src_ser, src_idx, src_len); src_idx = 0; } // Total to insert: size = dups * src_len; if (action != A_CHANGE) { // Always expand dst_ser for INSERT and APPEND actions: Expand_Series(dst_ser, dst_idx, size); } else { if (size > dst_len) Expand_Series(dst_ser, dst_idx, size - dst_len); else if (size < dst_len && GET_FLAG(flags, AN_PART)) Remove_Series(dst_ser, dst_idx, dst_len - size); else if (size + dst_idx > tail) { EXPAND_SERIES_TAIL(dst_ser, size - (tail - dst_idx)); } } // For dup count: for (; dups > 0; dups--) { Insert_String(dst_ser, dst_idx, src_ser, src_idx, src_len, TRUE); dst_idx += src_len; } TERM_SERIES(dst_ser); return (action == A_APPEND) ? 0 : dst_idx; }
*/ REBYTE *Temp_Byte_Chars_May_Fail(const REBVAL *val, REBINT max_len, REBCNT *length, REBINT opts) /* ** NOTE: This function returns a temporary result, and uses an internal ** buffer. Do not use it recursively. Also, it will Trap on errors. ** ** Prequalifies a string before using it with a function that ** expects it to be 8-bits. It would be used for instance to convert ** a string that is potentially REBUNI-wide into a form that can be used ** with a Scan_XXX routine, that is expecting ASCII or UTF-8 source. ** (Many TO-XXX conversions from STRING re-use that scanner logic.) ** ** Returns a temporary string and sets the length field. ** ** Opts can be: ** 0 - no special options ** 1 - allow UTF8 (val is converted to UTF8 during qualification) ** 2 - allow binary ** ** Checks or converts it: ** ** 1. it is byte string (not unicode) ** 2. if unicode, copy and return as temp byte string ** 3. it's actual content (less space, newlines) <= max len ** 4. it does not contain other values ("123 456") ** 5. 
it's not empty or only whitespace ** ***********************************************************************/ { REBCNT tail = VAL_TAIL(val); REBCNT index = VAL_INDEX(val); REBCNT len; REBUNI c; REBYTE *bp; REBSER *src = VAL_SERIES(val); if (index > tail) raise Error_0(RE_PAST_END); Resize_Series(BUF_FORM, max_len+1); bp = BIN_HEAD(BUF_FORM); // Skip leading whitespace: for (; index < tail; index++) { c = GET_ANY_CHAR(src, index); if (!IS_SPACE(c)) break; } // Copy chars that are valid: for (; index < tail; index++) { c = GET_ANY_CHAR(src, index); if (opts < 2 && c >= 0x80) { if (opts == 0) raise Error_0(RE_INVALID_CHARS); len = Encode_UTF8_Char(bp, c); max_len -= len; bp += len; } else if (!IS_SPACE(c)) { *bp++ = (REBYTE)c; max_len--; } else break; if (max_len < 0) raise Error_0(RE_TOO_LONG); } // Rest better be just spaces: for (; index < tail; index++) { c = GET_ANY_CHAR(src, index); if (!IS_SPACE(c)) raise Error_0(RE_INVALID_CHARS); } *bp= 0; len = bp - BIN_HEAD(BUF_FORM); if (len == 0) raise Error_0(RE_TOO_SHORT); if (length) *length = len; return BIN_HEAD(BUF_FORM); }
//
// Produce a fresh binary series from `arg` on behalf of MAKE BINARY!
// (`make` is true) or TO BINARY! (`make` is false).
//
// Each supported datatype is handled by its own case, which returns the
// resulting series directly.  Unsupported datatypes yield 0.
//
static REBSER *make_binary(const REBVAL *arg, REBOOL make)
{
    switch (VAL_TYPE(arg)) {
    case REB_INTEGER:
    case REB_DECIMAL:
        // MAKE BINARY! 123 sizes an empty binary; TO BINARY! 123 goes
        // through Make_Binary_BE64 instead.
        return make ? Make_Binary(Int32s(arg, 0)) : Make_Binary_BE64(arg);

    case REB_BINARY:
        // MAKE/TO BINARY! BINARY!
        return Copy_Bytes(VAL_BIN_AT(arg), VAL_LEN_AT(arg));

    case REB_STRING:
    case REB_FILE:
    case REB_EMAIL:
    case REB_URL:
    case REB_TAG:
//  case REB_ISSUE:
        // MAKE/TO BINARY! <any-string>
        return Make_UTF8_From_Any_String(arg, VAL_LEN_AT(arg), 0);

    case REB_BLOCK:
        // The series from Join_Binary is a shared buffer; hand back a copy.
        return Copy_Sequence(Join_Binary(arg, -1));

    case REB_TUPLE:
        // MAKE/TO BINARY! <tuple!>
        return Copy_Bytes(VAL_TUPLE(arg), VAL_TUPLE_LEN(arg));

    case REB_CHAR: {
        // MAKE/TO BINARY! <char!>
        REBSER *encoded = Make_Binary(6);
        TERM_SEQUENCE_LEN(
            encoded,
            Encode_UTF8_Char(BIN_HEAD(encoded), VAL_CHAR(arg))
        );
        return encoded;
    }

    case REB_BITSET:
        // MAKE/TO BINARY! <bitset!>
        return Copy_Bytes(VAL_BIN(arg), VAL_LEN_HEAD(arg));

    case REB_IMAGE:
        // MAKE/TO BINARY! <image!>
        return Make_Image_Binary(arg);

    case REB_MONEY: {
        REBSER *amount = Make_Binary(12);
        deci_to_binary(BIN_HEAD(amount), VAL_MONEY_AMOUNT(arg));
        TERM_SEQUENCE_LEN(amount, 12);
        return amount;
    }

    default:
        break;
    }

    // No conversion is defined for this datatype.
    return 0;
}