// // Split_Lines: C // // Given a string series, split lines on CR-LF. Give back array of strings. // // Note: The definition of "line" in POSIX is a sequence of characters that // end with a newline. Hence, the last line of a file should have a newline // marker, or it's not a "line") // // https://stackoverflow.com/a/729795 // // This routine does not require it. // // !!! CR support is likely to be removed...and CR will be handled as a normal // character, with special code needed to process it. // REBARR *Split_Lines(const REBVAL *str) { REBDSP dsp_orig = DSP; REBCNT len = VAL_LEN_AT(str); REBCNT i = VAL_INDEX(str); if (i == len) return Make_Array(0); DECLARE_MOLD (mo); Push_Mold(mo); REBCHR(const*) cp = VAL_STRING_AT(str); REBUNI c; cp = NEXT_CHR(&c, cp); for (; i < len; ++i, cp = NEXT_CHR(&c, cp)) { if (c != LF && c != CR) { Append_Codepoint(mo->series, c); continue; } Init_Text(DS_PUSH(), Pop_Molded_String(mo)); SET_CELL_FLAG(DS_TOP, NEWLINE_BEFORE); Push_Mold(mo); if (c == CR) { REBCHR(const*) tp = NEXT_CHR(&c, cp); if (c == LF) { ++i; cp = tp; // treat CR LF as LF, lone CR as LF } } } // If there's any remainder we pushed in the buffer, consider the end of // string to be an implicit line-break if (STR_SIZE(mo->series) == mo->offset) Drop_Mold(mo); else { Init_Text(DS_PUSH(), Pop_Molded_String(mo)); SET_CELL_FLAG(DS_TOP, NEWLINE_BEFORE); } return Pop_Stack_Values_Core(dsp_orig, ARRAY_FLAG_NEWLINE_AT_TAIL); }
// // Entab_Unicode: C // // Entab a string and return a new series. // REBSER *Entab_Unicode(REBUNI *bp, REBCNT index, REBCNT len, REBINT tabsize) { REBINT n = 0; REBUNI *dp; REBUNI c; REB_MOLD mo; CLEARS(&mo); mo.opts = MOPT_RESERVE; mo.reserve = len; Push_Mold(&mo); dp = UNI_AT(mo.series, mo.start); for (; index < len; index++) { c = bp[index]; // Count leading spaces, insert TAB for each tabsize: if (c == ' ') { if (++n >= tabsize) { *dp++ = '\t'; n = 0; } continue; } // Hitting a leading TAB resets space counter: if (c == '\t') { *dp++ = (REBYTE)c; n = 0; } else { // Incomplete tab space, pad with spaces: for (; n > 0; n--) *dp++ = ' '; // Copy chars thru end-of-line (or end of buffer): while (index < len) { if ((*dp++ = bp[index++]) == '\n') break; } } } SET_SERIES_LEN(mo.series, mo.start + cast(REBCNT, dp - UNI_AT(mo.series, mo.start))); UNI_TERM(mo.series); return Pop_Molded_String(&mo); }
// // Detab_Unicode: C // // Detab a unicode string and return a new series. // REBSER *Detab_Unicode(REBUNI *bp, REBCNT index, REBCNT len, REBINT tabsize) { REBCNT cnt = 0; REBCNT n; REBUNI *dp; REBUNI c; REB_MOLD mo; CLEARS(&mo); // Estimate new length based on tab expansion: for (n = index; n < len; n++) if (bp[n] == TAB) cnt++; mo.opts = MOPT_RESERVE; mo.reserve = len + (cnt * (tabsize - 1)); Push_Mold(&mo); dp = UNI_AT(mo.series, mo.start); n = 0; while (index < len) { c = bp[index++]; if (c == '\t') { *dp++ = ' '; n++; for (; n % tabsize != 0; n++) *dp++ = ' '; continue; } if (c == '\n') n = 0; else n++; *dp++ = c; } SET_SERIES_LEN(mo.series, mo.start + cast(REBCNT, dp - UNI_AT(mo.series, mo.start))); UNI_TERM(mo.series); return Pop_Molded_String(&mo); }
//
//  Make_Set_Operation_Series: C
//
// Do set operations on a series.  Case-sensitive if `cased` is TRUE.
// `skip` is the record size.
//
// val1 must be ANY_SERIES; val2 may be null, but when given it must be a
// compatible kind (array with array, string with string, binary with
// binary) or the routine fails with an unexpected-type error.
//
// `flags` selects the operation, as used by the code below:
//
//   SOP_FLAG_CHECK  - only keep an item if it is found in the *other* series
//   SOP_FLAG_INVERT - invert that membership test (keep if NOT found)
//   SOP_FLAG_BOTH   - after scanning val1, swap val1/val2 and scan again
//
// In every mode an item is only added to the result if it is not already in
// the result, so the output holds no duplicates.
//
// Returns a newly made series: a shallow copy of the build buffer for
// arrays, or the popped mold string for strings and binaries.
//
static REBSER *Make_Set_Operation_Series(
    const REBVAL *val1,
    const REBVAL *val2,
    REBFLGS flags,
    REBOOL cased,
    REBCNT skip
) {
    REBCNT i; // reused: result size estimate, series index, and loop flag
    REBINT h = 1; // used for both logic true/false and hash check
    REBOOL first_pass = TRUE; // are we in the first pass over the series?
    REBSER *out_ser;

    assert(ANY_SERIES(val1));

    if (val2) {
        assert(ANY_SERIES(val2));

        if (ANY_ARRAY(val1)) {
            if (!ANY_ARRAY(val2))
                fail (Error_Unexpected_Type(VAL_TYPE(val1), VAL_TYPE(val2)));

            // As long as they're both arrays, we're willing to do:
            //
            //     >> union quote (a b c) 'b/d/e
            //     (a b c d e)
            //
            // The type of the result will match the first value.
        }
        else if (!IS_BINARY(val1)) {
            // We will similarly do any two ANY-STRING! types:
            //
            //      >> union <abc> "bde"
            //      <abcde>

            if (IS_BINARY(val2))
                fail (Error_Unexpected_Type(VAL_TYPE(val1), VAL_TYPE(val2)));
        }
        else {
            // Binaries only operate with other binaries

            if (!IS_BINARY(val2))
                fail (Error_Unexpected_Type(VAL_TYPE(val1), VAL_TYPE(val2)));
        }
    }

    // Calculate `i` as maximum length of result block.  The temporary buffer
    // will be allocated at this size, but copied out at the exact size of
    // the actual result.
    //
    // NOTE(review): val2 is dereferenced here whenever SOP_FLAG_BOTH is set;
    // presumably callers never pass a null val2 with that flag -- confirm.
    //
    i = VAL_LEN_AT(val1);
    if (flags & SOP_FLAG_BOTH) i += VAL_LEN_AT(val2);

    if (ANY_ARRAY(val1)) {
        REBSER *hser = 0; // hash table for series
        REBSER *hret; // hash table for return series

        // The buffer used for building the return series.  Currently it
        // reuses BUF_EMIT, because that buffer is not likely to be in
        // use (emit doesn't call set operations, nor vice versa).  However,
        // other routines may get the same idea and start recursing so it
        // may be better to use something more similar to the mold stack
        // approach of marking off successive ranges in the array.
        //
        REBSER *buffer = ARR_SERIES(BUF_EMIT);

        Resize_Series(buffer, i);
        hret = Make_Hash_Sequence(i); // allocated

        // Optimization note: !!
        // This code could be optimized for small blocks by not hashing them
        // and extending Find_Key to FIND on the value itself w/o the hash.

        do {
            REBARR *array1 = VAL_ARRAY(val1); // val1 and val2 swapped 2nd pass!

            // Check what is in series1 but not in series2
            //
            // (A fresh hash of the *other* series is built on each pass and
            // freed at the bottom of the pass.)
            //
            if (flags & SOP_FLAG_CHECK)
                hser = Hash_Block(val2, skip, cased);

            // Iterate over first series
            //
            i = VAL_INDEX(val1);
            for (; i < ARR_LEN(array1); i += skip) {
                RELVAL *item = ARR_AT(array1, i);
                if (flags & SOP_FLAG_CHECK) {
                    //
                    // A non-negative result is treated as "found in val2".
                    // (The trailing `1` is a mode argument; presumably
                    // lookup-only -- confirm against Find_Key_Hashed.)
                    //
                    h = Find_Key_Hashed(
                        VAL_ARRAY(val2),
                        hser,
                        item,
                        VAL_SPECIFIER(val1),
                        skip,
                        cased,
                        1
                    );
                    h = (h >= 0);
                    if (flags & SOP_FLAG_INVERT) h = !h;
                }
                if (h) {
                    //
                    // Result is ignored; the call is made for its effect on
                    // the build buffer and `hret`.  (Mode `2` presumably
                    // appends the record when it is not already present --
                    // confirm against Find_Key_Hashed.)
                    //
                    Find_Key_Hashed(
                        AS_ARRAY(buffer),
                        hret,
                        item,
                        VAL_SPECIFIER(val1),
                        skip,
                        cased,
                        2
                    );
                }
            }

            if (i != ARR_LEN(array1)) {
                //
                // In the current philosophy, the semantics of what to do
                // with things like `intersect/skip [1 2 3] [7] 2` is too
                // shaky to deal with, so an error is reported if it does
                // not work out evenly to the skip size.
                //
                fail (Error(RE_BLOCK_SKIP_WRONG));
            }

            if (flags & SOP_FLAG_CHECK)
                Free_Series(hser);

            if (!first_pass) break;
            first_pass = FALSE;

            // Iterate over second series?
            //
            // `i` doubles as the loop condition here: nonzero means a second
            // pass is wanted, with the roles of val1 and val2 exchanged.
            //
            if ((i = ((flags & SOP_FLAG_BOTH) != 0))) {
                const REBVAL *temp = val1;
                val1 = val2;
                val2 = temp;
            }
        } while (i);

        if (hret)
            Free_Series(hret);

        out_ser = ARR_SERIES(Copy_Array_Shallow(AS_ARRAY(buffer), SPECIFIED));
        SET_SERIES_LEN(buffer, 0); // required - allow reuse
    }
    else {
        REB_MOLD mo;
        CLEARS(&mo);

        if (IS_BINARY(val1)) {
            //
            // All binaries use "case-sensitive" comparison (e.g. each byte
            // is treated distinctly)
            //
            cased = TRUE;
        }

        // ask mo.series to have at least `i` capacity beyond mo.start
        //
        mo.opts = MOPT_RESERVE;
        mo.reserve = i;
        Push_Mold(&mo);

        do {
            REBSER *ser = VAL_SERIES(val1); // val1 and val2 swapped 2nd pass!
            REBUNI uc;

            // Iterate over first series
            //
            // NOTE(review): unlike the array branch, there is no check after
            // this loop that the length divides evenly by `skip`.
            //
            i = VAL_INDEX(val1);
            for (; i < SER_LEN(ser); i += skip) {
                uc = GET_ANY_CHAR(ser, i);
                if (flags & SOP_FLAG_CHECK) {
                    //
                    // Membership test against the other series, searching
                    // from its index to its full length.
                    //
                    h = (NOT_FOUND != Find_Str_Char(
                        uc,
                        VAL_SERIES(val2),
                        0,
                        VAL_INDEX(val2),
                        VAL_LEN_HEAD(val2),
                        skip,
                        cased ? AM_FIND_CASE : 0
                    ));

                    if (flags & SOP_FLAG_INVERT) h = !h;
                }

                if (!h) continue;

                // Only append if the character is not already in the part of
                // the mold buffer built so far (from mo.start onward).
                //
                if (
                    NOT_FOUND == Find_Str_Char(
                        uc, // c2 (the character to find)
                        mo.series, // ser
                        mo.start, // head
                        mo.start, // index
                        SER_LEN(mo.series), // tail
                        skip, // skip
                        cased ? AM_FIND_CASE : 0 // flags
                    )
                ) {
                    Append_String(mo.series, ser, i, skip);
                }
            }

            if (!first_pass) break;
            first_pass = FALSE;

            // Iterate over second series?
            //
            // Same trick as the array branch: `i` becomes the loop flag.
            //
            if ((i = ((flags & SOP_FLAG_BOTH) != 0))) {
                const REBVAL *temp = val1;
                val1 = val2;
                val2 = temp;
            }
        } while (i);

        out_ser = Pop_Molded_String(&mo);
    }

    return out_ser;
}