Ejemplo n.º 1
0
//
//  Temp_Bin_Str_Managed: C
// 
// Determines if UTF8 conversion is needed for a series before it
// is used with a byte-oriented function.
// 
// If conversion is needed, a UTF8 series will be created.  Otherwise,
// the source series is returned as-is.
// 
// Note: This routine should only be used to generate a value used
// for temporary purposes, because it has a "surprising variance"
// regarding its input.  If the value's series can be reused, it is--
// and this depends on an implementation detail of internal encoding
// that the user should not be aware of (they need not know if the
// internal representation of an ASCII string uses 1, 2, or however
// many bytes).  But copying vs. non-copying means the resulting
// data might or might not have previous values available to step
// back into from the originating series!
// 
// !!! Should performance dictate it, the callsites could be
// adapted to know whether this produced a new series or not, and
// instead of managing a created result they could be responsible
// for freeing it if so.
//
REBSER *Temp_Bin_Str_Managed(const REBVAL *val, REBCNT *index, REBCNT *length)
{
    REBCNT len = (length && *length) ? *length : VAL_LEN_AT(val);
    REBSER *series;

    assert(IS_BINARY(val) || ANY_STRING(val));

    // !!! This used to check `len == 0` and reuse a zero length string.
    // However, the zero length string could have the wrong width.  We are
    // expected to be returning a BYTE_SIZE() string, and that confused
    // things.  It's not a good idea to mutate the source string (e.g.
    // reallocate under a new width) so consider having an EMPTY_BYTE_STRING
    // like EMPTY_ARRAY which is protected to hand back.
    //
    if (
        IS_BINARY(val)
        || (
            VAL_BYTE_SIZE(val)
            && All_Bytes_ASCII(VAL_BIN_AT(val), VAL_LEN_AT(val))
        )
    ){
        //
        // It's BINARY!, or an ANY-STRING! whose codepoints are all values in
        // ASCII (0x00 => 0x7F), hence not needing any UTF-8 encoding.
        //
        series = VAL_SERIES(val);
        ASSERT_SERIES_MANAGED(series);

        if (index)
            *index = VAL_INDEX(val);
        if (length)
            *length = len;
    }
    else {
        // UTF-8 conversion is required, and we manage the result.

        series = Make_UTF8_From_Any_String(val, len, OPT_ENC_CRLF_MAYBE);
        MANAGE_SERIES(series);

    #if !defined(NDEBUG)
        //
        // Also, PROTECT the result in the debug build...because since the
        // caller doesn't know if a new series was created or if the initial
        // data is being used, they should not be modifying it!  (We don't
        // want to protect the original data, because we wouldn't know when
        // we were allowed to unlock it...there's no later call in this
        // model to clean up the series.)
        {
            REBVAL protect;
            Val_Init_String(&protect, series);

            Protect_Value(&protect, FLAGIT(PROT_SET));

            // just a string...not /DEEP...shouldn't need to Unmark()
        }
    #endif

        if (index)
            *index = 0;
        if (length)
            *length = SER_LEN(series);
    }

    assert(BYTE_SIZE(series));
    return series;
}
Ejemplo n.º 2
0
//
//  Modify_String: C
// 
// Returns new dst_idx.
//
REBCNT Modify_String(
    REBCNT action,          // INSERT, APPEND, CHANGE
    REBSER *dst_ser,        // target
    REBCNT dst_idx,         // position
    const REBVAL *src_val,  // source
    REBFLGS flags,          // AN_PART
    REBINT dst_len,         // length to remove
    REBINT dups             // dup count
) {
    REBSER *src_ser = 0;
    REBCNT src_idx = 0;
    REBCNT src_len;
    REBCNT tail  = SER_LEN(dst_ser);
    REBINT size;        // total to insert
    REBOOL needs_free;
    REBINT limit;

    // For INSERT/PART and APPEND/PART
    if (action != SYM_CHANGE && GET_FLAG(flags, AN_PART))
        limit = dst_len; // should be non-negative
    else
        limit = -1;

    if (limit == 0 || dups < 0) return (action == SYM_APPEND) ? 0 : dst_idx;
    if (action == SYM_APPEND || dst_idx > tail) dst_idx = tail;

    // If the src_val is not a string, then we need to create a string:
    if (GET_FLAG(flags, AN_SERIES)) { // used to indicate a BINARY series
        if (IS_INTEGER(src_val)) {
            src_ser = Make_Series_Codepoint(Int8u(src_val));
            needs_free = TRUE;
            limit = -1;
        }
        else if (IS_BLOCK(src_val)) {
            src_ser = Join_Binary(src_val, limit); // NOTE: it's the shared FORM buffer!
            needs_free = FALSE;
            limit = -1;
        }
        else if (IS_CHAR(src_val)) {
            //
            // "UTF-8 was originally specified to allow codepoints with up to
            // 31 bits (or 6 bytes). But with RFC3629, this was reduced to 4
            // bytes max. to be more compatible to UTF-16."  So depending on
            // which RFC you consider "the UTF-8", max size is either 4 or 6.
            //
            src_ser = Make_Binary(6);
            SET_SERIES_LEN(
                src_ser,
                Encode_UTF8_Char(BIN_HEAD(src_ser), VAL_CHAR(src_val))
            );
            needs_free = TRUE;
            limit = -1;
        }
        else if (ANY_STRING(src_val)) {
            src_len = VAL_LEN_AT(src_val);
            if (limit >= 0 && src_len > cast(REBCNT, limit))
                src_len = limit;
            src_ser = Make_UTF8_From_Any_String(src_val, src_len, 0);
            needs_free = TRUE;
            limit = -1;
        }
        else if (!IS_BINARY(src_val))
            fail (Error_Invalid_Arg(src_val));
    }
    else if (IS_CHAR(src_val)) {
        src_ser = Make_Series_Codepoint(VAL_CHAR(src_val));
        needs_free = TRUE;
    }
    else if (IS_BLOCK(src_val)) {
        src_ser = Form_Tight_Block(src_val);
        needs_free = TRUE;
    }
    else if (!ANY_STRING(src_val) || IS_TAG(src_val)) {
        src_ser = Copy_Form_Value(src_val, 0);
        needs_free = TRUE;
    }

    // Use either new src or the one that was passed:
    if (src_ser) {
        src_len = SER_LEN(src_ser);
    }
    else {
        src_ser = VAL_SERIES(src_val);
        src_idx = VAL_INDEX(src_val);
        src_len = VAL_LEN_AT(src_val);
        needs_free = FALSE;
    }

    if (limit >= 0) src_len = limit;

    // If Source == Destination we need to prevent possible conflicts.
    // Clone the argument just to be safe.
    // (Note: It may be possible to optimize special cases like append !!)
    if (dst_ser == src_ser) {
        assert(!needs_free);
        src_ser = Copy_Sequence_At_Len(src_ser, src_idx, src_len);
        needs_free = TRUE;
        src_idx = 0;
    }

    // Total to insert:
    size = dups * src_len;

    if (action != SYM_CHANGE) {
        // Always expand dst_ser for INSERT and APPEND actions:
        Expand_Series(dst_ser, dst_idx, size);
    } else {
        if (size > dst_len)
            Expand_Series(dst_ser, dst_idx, size - dst_len);
        else if (size < dst_len && GET_FLAG(flags, AN_PART))
            Remove_Series(dst_ser, dst_idx, dst_len - size);
        else if (size + dst_idx > tail) {
            EXPAND_SERIES_TAIL(dst_ser, size - (tail - dst_idx));
        }
    }

    // For dup count:
    for (; dups > 0; dups--) {
        Insert_String(dst_ser, dst_idx, src_ser, src_idx, src_len, TRUE);
        dst_idx += src_len;
    }

    TERM_SEQUENCE(dst_ser);

    if (needs_free) {
        // If we did not use the series that was passed in, but rather
        // created an internal temporary one, we need to free it.
        Free_Series(src_ser);
    }

    return (action == SYM_APPEND) ? 0 : dst_idx;
}
Ejemplo n.º 3
0
//
//  Change_Case: C
//
// Common code for string case handling.
//
void Change_Case(
    REBVAL *out,
    REBVAL *val, // !!! Not const--uses Partial(), may change index, review
    const REBVAL *part,
    bool upper
){
    if (IS_CHAR(val)) {
        REBUNI c = VAL_CHAR(val);
        Init_Char_Unchecked(out, upper ? UP_CASE(c) : LO_CASE(c));
        return;
    }

    assert(ANY_STRING(val));
    FAIL_IF_READ_ONLY(val);

    // This is a mutating operation, and we want to return the same series at
    // the same index.  However, R3-Alpha code would use Partial() and may
    // change val's index.  Capture it before potential change, review.
    //
    Move_Value(out, val);

    REBCNT len = Part_Len_May_Modify_Index(val, part);

    // !!! This assumes that all case changes will preserve the encoding size,
    // but that's not true (some strange multibyte accented characters have
    // capital or lowercase versions that are single byte).  This may be
    // uncommon enough to have special handling (only do something weird, e.g.
    // use the mold buffer, if it happens...for the remaining portion of such
    // a string...and only if the size *expands*).  Expansions also may never
    // be possible, only contractions (is that true?)  Review when UTF-8
    // Everywhere is more mature to the point this is worth worrying about.
    //
    REBCHR(*) up = VAL_STRING_AT(val);
    REBCHR(*) dp;
    if (upper) {
        REBCNT n;
        for (n = 0; n < len; n++) {
            dp = up;

            REBUNI c;
            up = NEXT_CHR(&c, up);
            if (c < UNICODE_CASES) {
                dp = WRITE_CHR(dp, UP_CASE(c));
                assert(dp == up); // !!! not all case changes same byte size?
            }
        }
    }
    else {
        REBCNT n;
        for (n = 0; n < len; n++) {
            dp = up;

            REBUNI c;
            up = NEXT_CHR(&c, up);
            if (c < UNICODE_CASES) {
                dp = WRITE_CHR(dp, LO_CASE(c));
                assert(dp == up); // !!! not all case changes same byte size?
            }
        }
    }
}