Beispiel #1
0
//
//  Split_Lines: C
//
// Given a string series, split lines on CR-LF.  Give back array of strings.
//
// Note: The definition of "line" in POSIX is a sequence of characters that
// end with a newline.  Hence, the last line of a file should have a newline
// marker, or it's not a "line")
//
// https://stackoverflow.com/a/729795
//
// This routine does not require it.
//
// !!! CR support is likely to be removed...and CR will be handled as a normal
// character, with special code needed to process it.
//
REBARR *Split_Lines(const REBVAL *str)
{
    REBDSP dsp_orig = DSP;

    REBCNT len = VAL_LEN_AT(str);
    REBCNT i = VAL_INDEX(str);
    if (i == len)
        return Make_Array(0);

    DECLARE_MOLD (mo);
    Push_Mold(mo);

    REBCHR(const*) cp = VAL_STRING_AT(str);

    REBUNI c;
    cp = NEXT_CHR(&c, cp);

    for (; i < len; ++i, cp = NEXT_CHR(&c, cp)) {
        if (c != LF && c != CR) {
            Append_Codepoint(mo->series, c);
            continue;
        }

        Init_Text(DS_PUSH(), Pop_Molded_String(mo));
        SET_CELL_FLAG(DS_TOP, NEWLINE_BEFORE);

        Push_Mold(mo);

        if (c == CR) {
            REBCHR(const*) tp = NEXT_CHR(&c, cp);
            if (c == LF) {
                ++i;
                cp = tp; // treat CR LF as LF, lone CR as LF
            }
        }
    }

    // If there's any remainder we pushed in the buffer, consider the end of
    // string to be an implicit line-break

    if (STR_SIZE(mo->series) == mo->offset)
        Drop_Mold(mo);
    else {
        Init_Text(DS_PUSH(), Pop_Molded_String(mo));
        SET_CELL_FLAG(DS_TOP, NEWLINE_BEFORE);
    }

    return Pop_Stack_Values_Core(dsp_orig, ARRAY_FLAG_NEWLINE_AT_TAIL);
}
Beispiel #2
0
//
//  Entab_Unicode: C
// 
// Entab a string and return a new series.
//
REBSER *Entab_Unicode(REBUNI *bp, REBCNT index, REBCNT len, REBINT tabsize)
{
    REBINT n = 0;
    REBUNI *dp;
    REBUNI c;

    REB_MOLD mo;
    CLEARS(&mo);
    mo.opts = MOPT_RESERVE;
    mo.reserve = len;

    Push_Mold(&mo);
    dp = UNI_AT(mo.series, mo.start);

    for (; index < len; index++) {

        c = bp[index];

        // Count leading spaces, insert TAB for each tabsize:
        if (c == ' ') {
            if (++n >= tabsize) {
                *dp++ = '\t';
                n = 0;
            }
            continue;
        }

        // Hitting a leading TAB resets space counter:
        if (c == '\t') {
            *dp++ = (REBYTE)c;
            n = 0;
        }
        else {
            // Incomplete tab space, pad with spaces:
            for (; n > 0; n--) *dp++ = ' ';

            // Copy chars thru end-of-line (or end of buffer):
            while (index < len) {
                if ((*dp++ = bp[index++]) == '\n') break;
            }
        }
    }

    SET_SERIES_LEN(mo.series, mo.start + cast(REBCNT, dp - UNI_AT(mo.series, mo.start)));
    UNI_TERM(mo.series);

    return Pop_Molded_String(&mo);
}
Beispiel #3
0
//
//  Detab_Unicode: C
// 
// Detab a unicode string and return a new series.
//
REBSER *Detab_Unicode(REBUNI *bp, REBCNT index, REBCNT len, REBINT tabsize)
{
    REBCNT cnt = 0;
    REBCNT n;
    REBUNI *dp;
    REBUNI c;

    REB_MOLD mo;
    CLEARS(&mo);

    // Estimate new length based on tab expansion:
    for (n = index; n < len; n++)
        if (bp[n] == TAB) cnt++;

    mo.opts = MOPT_RESERVE;
    mo.reserve = len + (cnt * (tabsize - 1));

    Push_Mold(&mo);
    dp = UNI_AT(mo.series, mo.start);
    n = 0;
    while (index < len) {

        c = bp[index++];

        if (c == '\t') {
            *dp++ = ' ';
            n++;
            for (; n % tabsize != 0; n++) *dp++ = ' ';
            continue;
        }

        if (c == '\n') n = 0;
        else n++;

        *dp++ = c;
    }

    SET_SERIES_LEN(mo.series, mo.start + cast(REBCNT, dp - UNI_AT(mo.series, mo.start)));
    UNI_TERM(mo.series);

    return Pop_Molded_String(&mo);
}
Beispiel #4
0
//
//  Make_Set_Operation_Series: C
// 
// Do set operations on a series.  Case-sensitive if `cased` is TRUE.
// `skip` is the record size.
//
static REBSER *Make_Set_Operation_Series(
    const REBVAL *val1,
    const REBVAL *val2,
    REBFLGS flags,
    REBOOL cased,
    REBCNT skip
) {
    REBCNT i;
    REBINT h = 1; // used for both logic true/false and hash check
    REBOOL first_pass = TRUE; // are we in the first pass over the series?
    REBSER *out_ser;

    assert(ANY_SERIES(val1));

    if (val2) {
        assert(ANY_SERIES(val2));

        if (ANY_ARRAY(val1)) {
            if (!ANY_ARRAY(val2))
                fail (Error_Unexpected_Type(VAL_TYPE(val1), VAL_TYPE(val2)));

            // As long as they're both arrays, we're willing to do:
            //
            //     >> union quote (a b c) 'b/d/e
            //     (a b c d e)
            //
            // The type of the result will match the first value.
        }
        else if (!IS_BINARY(val1)) {

            // We will similarly do any two ANY-STRING! types:
            //
            //      >> union <abc> "bde"
            //      <abcde>

            if (IS_BINARY(val2))
                fail (Error_Unexpected_Type(VAL_TYPE(val1), VAL_TYPE(val2)));
        }
        else {
            // Binaries only operate with other binaries

            if (!IS_BINARY(val2))
                fail (Error_Unexpected_Type(VAL_TYPE(val1), VAL_TYPE(val2)));
        }
    }

    // Calculate `i` as maximum length of result block.  The temporary buffer
    // will be allocated at this size, but copied out at the exact size of
    // the actual result.
    //
    i = VAL_LEN_AT(val1);
    if (flags & SOP_FLAG_BOTH) i += VAL_LEN_AT(val2);

    if (ANY_ARRAY(val1)) {
        REBSER *hser = 0;   // hash table for series
        REBSER *hret;       // hash table for return series

        // The buffer used for building the return series.  Currently it
        // reuses BUF_EMIT, because that buffer is not likely to be in
        // use (emit doesn't call set operations, nor vice versa).  However,
        // other routines may get the same idea and start recursing so it
        // may be better to use something more similar to the mold stack
        // approach of marking off successive ranges in the array.
        //
        REBSER *buffer = ARR_SERIES(BUF_EMIT);
        Resize_Series(buffer, i);
        hret = Make_Hash_Sequence(i);   // allocated

        // Optimization note: !!
        // This code could be optimized for small blocks by not hashing them
        // and extending Find_Key to FIND on the value itself w/o the hash.

        do {
            REBARR *array1 = VAL_ARRAY(val1); // val1 and val2 swapped 2nd pass!

            // Check what is in series1 but not in series2
            //
            if (flags & SOP_FLAG_CHECK)
                hser = Hash_Block(val2, skip, cased);

            // Iterate over first series
            //
            i = VAL_INDEX(val1);
            for (; i < ARR_LEN(array1); i += skip) {
                RELVAL *item = ARR_AT(array1, i);
                if (flags & SOP_FLAG_CHECK) {
                    h = Find_Key_Hashed(
                        VAL_ARRAY(val2),
                        hser,
                        item,
                        VAL_SPECIFIER(val1),
                        skip,
                        cased,
                        1
                    );
                    h = (h >= 0);
                    if (flags & SOP_FLAG_INVERT) h = !h;
                }
                if (h) {
                    Find_Key_Hashed(
                        AS_ARRAY(buffer),
                        hret,
                        item,
                        VAL_SPECIFIER(val1),
                        skip,
                        cased,
                        2
                    );
                }
            }

            if (i != ARR_LEN(array1)) {
                //
                // In the current philosophy, the semantics of what to do
                // with things like `intersect/skip [1 2 3] [7] 2` is too
                // shaky to deal with, so an error is reported if it does
                // not work out evenly to the skip size.
                //
                fail (Error(RE_BLOCK_SKIP_WRONG));
            }

            if (flags & SOP_FLAG_CHECK)
                Free_Series(hser);

            if (!first_pass) break;
            first_pass = FALSE;

            // Iterate over second series?
            //
            if ((i = ((flags & SOP_FLAG_BOTH) != 0))) {
                const REBVAL *temp = val1;
                val1 = val2;
                val2 = temp;
            }
        } while (i);

        if (hret)
            Free_Series(hret);

        out_ser = ARR_SERIES(Copy_Array_Shallow(AS_ARRAY(buffer), SPECIFIED));
        SET_SERIES_LEN(buffer, 0); // required - allow reuse
    }
    else {
        REB_MOLD mo;
        CLEARS(&mo);

        if (IS_BINARY(val1)) {
            //
            // All binaries use "case-sensitive" comparison (e.g. each byte
            // is treated distinctly)
            //
            cased = TRUE;
        }

        // ask mo.series to have at least `i` capacity beyond mo.start
        //
        mo.opts = MOPT_RESERVE;
        mo.reserve = i;
        Push_Mold(&mo);

        do {
            REBSER *ser = VAL_SERIES(val1); // val1 and val2 swapped 2nd pass!
            REBUNI uc;

            // Iterate over first series
            //
            i = VAL_INDEX(val1);
            for (; i < SER_LEN(ser); i += skip) {
                uc = GET_ANY_CHAR(ser, i);
                if (flags & SOP_FLAG_CHECK) {
                    h = (NOT_FOUND != Find_Str_Char(
                        uc,
                        VAL_SERIES(val2),
                        0,
                        VAL_INDEX(val2),
                        VAL_LEN_HEAD(val2),
                        skip,
                        cased ? AM_FIND_CASE : 0
                    ));

                    if (flags & SOP_FLAG_INVERT) h = !h;
                }

                if (!h) continue;

                if (
                    NOT_FOUND == Find_Str_Char(
                        uc, // c2 (the character to find)
                        mo.series, // ser
                        mo.start, // head
                        mo.start, // index
                        SER_LEN(mo.series), // tail
                        skip, // skip
                        cased ? AM_FIND_CASE : 0 // flags
        )
                ) {
                    Append_String(mo.series, ser, i, skip);
                }
            }

            if (!first_pass) break;
            first_pass = FALSE;

            // Iterate over second series?
            //
            if ((i = ((flags & SOP_FLAG_BOTH) != 0))) {
                const REBVAL *temp = val1;
                val1 = val2;
                val2 = temp;
            }
        } while (i);

        out_ser = Pop_Molded_String(&mo);
    }

    return out_ser;
}