Ejemplo n.º 1
0
//
//  Temp_Byte_Chars_May_Fail: C
// 
// NOTE: This function returns a temporary result, and uses an internal
// buffer.  Do not use it recursively.  Also, it will Trap on errors.
// 
// Prequalifies a string before using it with a function that
// expects it to be 8-bits.  It would be used for instance to convert
// a string that is potentially REBUNI-wide into a form that can be used
// with a Scan_XXX routine, that is expecting ASCII or UTF-8 source.
// (Many TO-XXX conversions from STRING re-use that scanner logic.)
// 
// Returns a temporary string and sets the length field.
// 
// If `allow_utf8`, the constructed result is converted to UTF8.
// 
// Checks or converts it:
// 
//     1. it is byte string (not unicode)
//     2. if unicode, copy and return as temp byte string
//     3. it's actual content (less space, newlines) <= max len
//     4. it does not contain other values ("123 456")
//     5. it's not empty or only whitespace
//
REBYTE *Temp_Byte_Chars_May_Fail(
    const REBVAL *val,
    REBINT max_len,
    REBCNT *length,
    REBOOL allow_utf8
) {
    REBCNT tail = VAL_LEN_HEAD(val);
    REBCNT index = VAL_INDEX(val);
    REBCNT len;
    REBUNI c;
    REBYTE *bp;
    REBSER *src = VAL_SERIES(val);

    if (index > tail) fail (Error(RE_PAST_END));

    Resize_Series(BYTE_BUF, max_len+1);
    bp = BIN_HEAD(BYTE_BUF);

    // Skip leading whitespace:
    for (; index < tail; index++) {
        c = GET_ANY_CHAR(src, index);
        if (!IS_SPACE(c)) break;
    }

    // Copy chars that are valid:
    for (; index < tail; index++) {
        c = GET_ANY_CHAR(src, index);
        if (c >= 0x80) {
            if (!allow_utf8) fail (Error(RE_INVALID_CHARS));

            len = Encode_UTF8_Char(bp, c);
            max_len -= len;
            bp += len;
        }
        else if (!IS_SPACE(c)) {
            *bp++ = (REBYTE)c;
            max_len--;
        }
        else break;
        if (max_len < 0)
            fail (Error(RE_TOO_LONG));
    }

    // Rest better be just spaces:
    for (; index < tail; index++) {
        c = GET_ANY_CHAR(src, index);
        if (!IS_SPACE(c)) fail (Error(RE_INVALID_CHARS));
    }

    *bp = '\0';

    len = bp - BIN_HEAD(BYTE_BUF);
    if (len == 0) fail (Error(RE_TOO_SHORT));

    if (length) *length = len;

    return BIN_HEAD(BYTE_BUF);
}
Ejemplo n.º 2
0
Archivo: s-unicode.c Proyecto: Oldes/r3
*/	REBCNT Encode_UTF8(REBYTE *dst, REBINT max, void *src, REBCNT *len, REBFLG uni, REBFLG ccr)
/*
**		Encode the unicode into UTF8 byte string.
**
**		Source string can be byte or unichar sized (uni = TRUE);
**		Max is the maximum size of the result (UTF8).
**		Returns number of source chars used.
**		Updates len for dst bytes used.
**		Does not add a terminator.
**
***********************************************************************/
{
	REBUNI c;
	REBINT n;
	REBYTE buf[8];
	REBYTE *bs = dst; // save start
	REBYTE *bp = (REBYTE*)src;
	REBUNI *up = (REBUNI*)src;
	REBCNT cnt;

	if (len) cnt = *len;
	else {
		cnt = (REBCNT)(uni ? wcslen((const wchar_t*)bp) : LEN_BYTES((REBYTE*)bp));
	}

	for (; max > 0 && cnt > 0; cnt--) {
		c = uni ? *up++ : *bp++;
		if (c < 0x80) {
#if defined(TO_WINDOWS)
			if (ccr && c == LF) {
				// If there's not room, don't try to output CRLF
				if (2 > max) {up--; break;}
				*dst++ = CR;
				max--;
				c = LF;
			}
#endif
			*dst++ = (REBYTE)c;
			max--;
		}
		else {
			n = Encode_UTF8_Char(buf, c);
			if (n > max) {up--; break;}
			memcpy(dst, buf, n);
			dst += n;
			max -= n;
		}
	}

	if (len) *len = dst - bs;

	return uni ? up - (REBUNI*)src : bp - (REBYTE*)src;
}
Ejemplo n.º 3
0
Archivo: s-unicode.c Proyecto: Oldes/r3
*/  int Encode_UTF8_Line(REBSER *dst, REBSER *src, REBCNT idx)
/*
**		Encode a unicode source buffer into a binary line of UTF8.
**		Include the LF terminator in the result.
**		Return the length of the line buffer.
**
***********************************************************************/
{
	REBUNI *up = UNI_HEAD(src);
	REBCNT len  = SERIES_TAIL(src);
	REBCNT tail;
	REBUNI c;
	REBINT n;
	REBYTE buf[8];

	tail = RESET_TAIL(dst);

	while (idx < len) {
		if ((c = up[idx]) < 0x80) {
			EXPAND_SERIES_TAIL(dst, 1);
			BIN_HEAD(dst)[tail++] = (REBYTE)c;
		}
		else {
			n = Encode_UTF8_Char(buf, c);
			EXPAND_SERIES_TAIL(dst, n);
			memcpy(BIN_SKIP(dst, tail), buf, n);
			tail += n;
		}
		idx++;
		if (c == LF) break;
	}

	BIN_HEAD(dst)[tail] = 0;
	SERIES_TAIL(dst) = tail;
	return idx;
}
Ejemplo n.º 4
0
static REBSER *make_binary(REBVAL *arg, REBOOL make)
{
	REBSER *ser;

	// MAKE BINARY! 123
	switch (VAL_TYPE(arg)) {
	case REB_INTEGER:
	case REB_DECIMAL:
		if (make) ser = Make_Binary(Int32s(arg, 0));
		else ser = Make_Binary_BE64(arg);
		break;

	// MAKE/TO BINARY! BINARY!
	case REB_BINARY:
		ser = Copy_Bytes(VAL_BIN_DATA(arg), VAL_LEN(arg));
		break;

	// MAKE/TO BINARY! <any-string>
	case REB_STRING:
	case REB_FILE:
	case REB_EMAIL:
	case REB_URL:
	case REB_TAG:
//	case REB_ISSUE:
		ser = Encode_UTF8_Value(arg, VAL_LEN(arg), 0);
		break;

	case REB_BLOCK:
		ser = Join_Binary(arg);
		break;

	// MAKE/TO BINARY! <tuple!>
	case REB_TUPLE:
		ser = Copy_Bytes(VAL_TUPLE(arg), VAL_TUPLE_LEN(arg));
		break;

	// MAKE/TO BINARY! <char!>
	case REB_CHAR:
		ser = Make_Binary(6);
		ser->tail = Encode_UTF8_Char(BIN_HEAD(ser), VAL_CHAR(arg));
		break;

	// MAKE/TO BINARY! <bitset!>
	case REB_BITSET:
		ser = Copy_Bytes(VAL_BIN(arg), VAL_TAIL(arg));
		break;

	// MAKE/TO BINARY! <image!>
	case REB_IMAGE:
	  	ser = Make_Image_Binary(arg);
		break;

	case REB_MONEY:
		ser = Make_Binary(12);
		ser->tail = 12;
		deci_to_binary(ser->data, VAL_DECI(arg));
		ser->data[12] = 0;
		break;

	default:
		ser = 0;
	}

	return ser;
}
Ejemplo n.º 5
0
//
//  Modify_String: C
// 
// Returns new dst_idx.
//
REBCNT Modify_String(
    REBCNT action,          // INSERT, APPEND, CHANGE
    REBSER *dst_ser,        // target
    REBCNT dst_idx,         // position
    const REBVAL *src_val,  // source
    REBFLGS flags,          // AN_PART
    REBINT dst_len,         // length to remove
    REBINT dups             // dup count
) {
    REBSER *src_ser = 0;
    REBCNT src_idx = 0;
    REBCNT src_len;
    REBCNT tail  = SER_LEN(dst_ser);
    REBINT size;        // total to insert
    REBOOL needs_free;
    REBINT limit;

    // For INSERT/PART and APPEND/PART
    if (action != SYM_CHANGE && GET_FLAG(flags, AN_PART))
        limit = dst_len; // should be non-negative
    else
        limit = -1;

    if (limit == 0 || dups < 0) return (action == SYM_APPEND) ? 0 : dst_idx;
    if (action == SYM_APPEND || dst_idx > tail) dst_idx = tail;

    // If the src_val is not a string, then we need to create a string:
    if (GET_FLAG(flags, AN_SERIES)) { // used to indicate a BINARY series
        if (IS_INTEGER(src_val)) {
            src_ser = Make_Series_Codepoint(Int8u(src_val));
            needs_free = TRUE;
            limit = -1;
        }
        else if (IS_BLOCK(src_val)) {
            src_ser = Join_Binary(src_val, limit); // NOTE: it's the shared FORM buffer!
            needs_free = FALSE;
            limit = -1;
        }
        else if (IS_CHAR(src_val)) {
            //
            // "UTF-8 was originally specified to allow codepoints with up to
            // 31 bits (or 6 bytes). But with RFC3629, this was reduced to 4
            // bytes max. to be more compatible to UTF-16."  So depending on
            // which RFC you consider "the UTF-8", max size is either 4 or 6.
            //
            src_ser = Make_Binary(6);
            SET_SERIES_LEN(
                src_ser,
                Encode_UTF8_Char(BIN_HEAD(src_ser), VAL_CHAR(src_val))
            );
            needs_free = TRUE;
            limit = -1;
        }
        else if (ANY_STRING(src_val)) {
            src_len = VAL_LEN_AT(src_val);
            if (limit >= 0 && src_len > cast(REBCNT, limit))
                src_len = limit;
            src_ser = Make_UTF8_From_Any_String(src_val, src_len, 0);
            needs_free = TRUE;
            limit = -1;
        }
        else if (!IS_BINARY(src_val))
            fail (Error_Invalid_Arg(src_val));
    }
    else if (IS_CHAR(src_val)) {
        src_ser = Make_Series_Codepoint(VAL_CHAR(src_val));
        needs_free = TRUE;
    }
    else if (IS_BLOCK(src_val)) {
        src_ser = Form_Tight_Block(src_val);
        needs_free = TRUE;
    }
    else if (!ANY_STRING(src_val) || IS_TAG(src_val)) {
        src_ser = Copy_Form_Value(src_val, 0);
        needs_free = TRUE;
    }

    // Use either new src or the one that was passed:
    if (src_ser) {
        src_len = SER_LEN(src_ser);
    }
    else {
        src_ser = VAL_SERIES(src_val);
        src_idx = VAL_INDEX(src_val);
        src_len = VAL_LEN_AT(src_val);
        needs_free = FALSE;
    }

    if (limit >= 0) src_len = limit;

    // If Source == Destination we need to prevent possible conflicts.
    // Clone the argument just to be safe.
    // (Note: It may be possible to optimize special cases like append !!)
    if (dst_ser == src_ser) {
        assert(!needs_free);
        src_ser = Copy_Sequence_At_Len(src_ser, src_idx, src_len);
        needs_free = TRUE;
        src_idx = 0;
    }

    // Total to insert:
    size = dups * src_len;

    if (action != SYM_CHANGE) {
        // Always expand dst_ser for INSERT and APPEND actions:
        Expand_Series(dst_ser, dst_idx, size);
    } else {
        if (size > dst_len)
            Expand_Series(dst_ser, dst_idx, size - dst_len);
        else if (size < dst_len && GET_FLAG(flags, AN_PART))
            Remove_Series(dst_ser, dst_idx, dst_len - size);
        else if (size + dst_idx > tail) {
            EXPAND_SERIES_TAIL(dst_ser, size - (tail - dst_idx));
        }
    }

    // For dup count:
    for (; dups > 0; dups--) {
        Insert_String(dst_ser, dst_idx, src_ser, src_idx, src_len, TRUE);
        dst_idx += src_len;
    }

    TERM_SEQUENCE(dst_ser);

    if (needs_free) {
        // If we did not use the series that was passed in, but rather
        // created an internal temporary one, we need to free it.
        Free_Series(src_ser);
    }

    return (action == SYM_APPEND) ? 0 : dst_idx;
}
Ejemplo n.º 6
0
Archivo: f-modify.c Proyecto: mbk/ren-c
*/	REBCNT Modify_String(REBCNT action, REBSER *dst_ser, REBCNT dst_idx, const REBVAL *src_val, REBCNT flags, REBINT dst_len, REBINT dups)
/*
**		action: INSERT, APPEND, CHANGE
**
**		dst_ser:	target
**		dst_idx:	position
**		src_val:	source
**		flags:		AN_PART
**		dst_len:	length to remove
**		dups:		dup count
**
**		return: new dst_idx
**
***********************************************************************/
{
	REBSER *src_ser = 0;
	REBCNT src_idx = 0;
	REBCNT src_len;
	REBCNT tail  = SERIES_TAIL(dst_ser);
	REBINT size;		// total to insert

	if (dups < 0) return (action == A_APPEND) ? 0 : dst_idx;
	if (action == A_APPEND || dst_idx > tail) dst_idx = tail;

	// If the src_val is not a string, then we need to create a string:
	if (GET_FLAG(flags, AN_SERIES)) { // used to indicate a BINARY series
		if (IS_INTEGER(src_val)) {
			src_ser = Append_Byte(0, Int8u(src_val)); // creates a binary
		}
		else if (IS_BLOCK(src_val)) {
			src_ser = Join_Binary(src_val); // NOTE: it's the shared FORM buffer!
		}
		else if (IS_CHAR(src_val)) {
			src_ser = Make_Binary(6); // (I hate unicode)
			src_ser->tail = Encode_UTF8_Char(BIN_HEAD(src_ser), VAL_CHAR(src_val));
		}
		else if (!ANY_BINSTR(src_val)) Trap_Arg_DEAD_END(src_val);
	}
	else if (IS_CHAR(src_val)) {
		src_ser = Append_Byte(0, VAL_CHAR(src_val)); // unicode ok too
	}
	else if (IS_BLOCK(src_val)) {
		src_ser = Form_Tight_Block(src_val);
	}
	else if (!ANY_STR(src_val) || IS_TAG(src_val)) {
		src_ser = Copy_Form_Value(src_val, 0);
	}

	// Use either new src or the one that was passed:
	if (src_ser) {
		src_len = SERIES_TAIL(src_ser);
	}
	else {
		src_ser = VAL_SERIES(src_val);
		src_idx = VAL_INDEX(src_val);
		src_len = VAL_LEN(src_val);
	}

	// For INSERT or APPEND with /PART use the dst_len not src_len:
	if (action != A_CHANGE && GET_FLAG(flags, AN_PART)) src_len = dst_len;

	// If Source == Destination we need to prevent possible conflicts.
	// Clone the argument just to be safe.
	// (Note: It may be possible to optimize special cases like append !!)
	if (dst_ser == src_ser) {
		src_ser = Copy_Series_Part(src_ser, src_idx, src_len);
		src_idx = 0;
	}

	// Total to insert:
	size = dups * src_len;

	if (action != A_CHANGE) {
		// Always expand dst_ser for INSERT and APPEND actions:
		Expand_Series(dst_ser, dst_idx, size);
	} else {
		if (size > dst_len)
			Expand_Series(dst_ser, dst_idx, size - dst_len);
		else if (size < dst_len && GET_FLAG(flags, AN_PART))
			Remove_Series(dst_ser, dst_idx, dst_len - size);
		else if (size + dst_idx > tail) {
			EXPAND_SERIES_TAIL(dst_ser, size - (tail - dst_idx));
		}
	}

	// For dup count:
	for (; dups > 0; dups--) {
		Insert_String(dst_ser, dst_idx, src_ser, src_idx, src_len, TRUE);
		dst_idx += src_len;
	}

	TERM_SERIES(dst_ser);

	return (action == A_APPEND) ? 0 : dst_idx;
}
Ejemplo n.º 7
0
*/	REBYTE *Temp_Byte_Chars_May_Fail(const REBVAL *val, REBINT max_len, REBCNT *length, REBINT opts)
/*
**	NOTE: This function returns a temporary result, and uses an internal
**	buffer.  Do not use it recursively.  Also, it will Trap on errors.
**
**	Prequalifies a string before using it with a function that
**	expects it to be 8-bits.  It would be used for instance to convert
**	a string that is potentially REBUNI-wide into a form that can be used
**	with a Scan_XXX routine, that is expecting ASCII or UTF-8 source.
**	(Many TO-XXX conversions from STRING re-use that scanner logic.)
**
**	Returns a temporary string and sets the length field.
**
**	Opts can be:
**		0 - no special options
**		1 - allow UTF8 (val is converted to UTF8 during qualification)
**		2 - allow binary
**
**	Checks or converts it:
**
**		1. it is byte string (not unicode)
**		2. if unicode, copy and return as temp byte string
**		3. it's actual content (less space, newlines) <= max len
**		4. it does not contain other values ("123 456")
**		5. it's not empty or only whitespace
**
***********************************************************************/
{
	REBCNT tail = VAL_TAIL(val);
	REBCNT index = VAL_INDEX(val);
	REBCNT len;
	REBUNI c;
	REBYTE *bp;
	REBSER *src = VAL_SERIES(val);

	if (index > tail) raise Error_0(RE_PAST_END);

	Resize_Series(BUF_FORM, max_len+1);
	bp = BIN_HEAD(BUF_FORM);

	// Skip leading whitespace:
	for (; index < tail; index++) {
		c = GET_ANY_CHAR(src, index);
		if (!IS_SPACE(c)) break;
	}

	// Copy chars that are valid:
	for (; index < tail; index++) {
		c = GET_ANY_CHAR(src, index);
		if (opts < 2 && c >= 0x80) {
			if (opts == 0) raise Error_0(RE_INVALID_CHARS);
			len = Encode_UTF8_Char(bp, c);
			max_len -= len;
			bp += len;
		}
		else if (!IS_SPACE(c)) {
			*bp++ = (REBYTE)c;
			max_len--;
		}
		else break;
		if (max_len < 0)
			raise Error_0(RE_TOO_LONG);
	}

	// Rest better be just spaces:
	for (; index < tail; index++) {
		c = GET_ANY_CHAR(src, index);
		if (!IS_SPACE(c)) raise Error_0(RE_INVALID_CHARS);
	}

	*bp= 0;

	len = bp - BIN_HEAD(BUF_FORM);
	if (len == 0) raise Error_0(RE_TOO_SHORT);

	if (length) *length = len;

	return BIN_HEAD(BUF_FORM);
}
Ejemplo n.º 8
0
static REBSER *make_binary(const REBVAL *arg, REBOOL make)
{
    REBSER *ser;

    // MAKE BINARY! 123
    switch (VAL_TYPE(arg)) {
    case REB_INTEGER:
    case REB_DECIMAL:
        if (make) ser = Make_Binary(Int32s(arg, 0));
        else ser = Make_Binary_BE64(arg);
        break;

    // MAKE/TO BINARY! BINARY!
    case REB_BINARY:
        ser = Copy_Bytes(VAL_BIN_AT(arg), VAL_LEN_AT(arg));
        break;

    // MAKE/TO BINARY! <any-string>
    case REB_STRING:
    case REB_FILE:
    case REB_EMAIL:
    case REB_URL:
    case REB_TAG:
//  case REB_ISSUE:
        ser = Make_UTF8_From_Any_String(arg, VAL_LEN_AT(arg), 0);
        break;

    case REB_BLOCK:
        // Join_Binary returns a shared buffer, so produce a copy:
        ser = Copy_Sequence(Join_Binary(arg, -1));
        break;

    // MAKE/TO BINARY! <tuple!>
    case REB_TUPLE:
        ser = Copy_Bytes(VAL_TUPLE(arg), VAL_TUPLE_LEN(arg));
        break;

    // MAKE/TO BINARY! <char!>
    case REB_CHAR:
        ser = Make_Binary(6);
        TERM_SEQUENCE_LEN(ser, Encode_UTF8_Char(BIN_HEAD(ser), VAL_CHAR(arg)));
        break;

    // MAKE/TO BINARY! <bitset!>
    case REB_BITSET:
        ser = Copy_Bytes(VAL_BIN(arg), VAL_LEN_HEAD(arg));
        break;

    // MAKE/TO BINARY! <image!>
    case REB_IMAGE:
        ser = Make_Image_Binary(arg);
        break;

    case REB_MONEY:
        ser = Make_Binary(12);
        deci_to_binary(BIN_HEAD(ser), VAL_MONEY_AMOUNT(arg));
        TERM_SEQUENCE_LEN(ser, 12);
        break;

    default:
        ser = 0;
    }

    return ser;
}