*/ REBINT Compare_String_Vals(REBVAL *v1, REBVAL *v2, REBOOL uncase) /* ** Compare two string values. Either can be byte or unicode wide. ** ** Uncase: compare is case-insensitive. ** ** Used for: general string comparions (various places) ** ***********************************************************************/ { REBCNT l1 = VAL_LEN(v1); REBCNT l2 = VAL_LEN(v2); REBCNT len = MIN(l1, l2); REBINT n; if (IS_BINARY(v1) || IS_BINARY(v2)) uncase = FALSE; if (VAL_BYTE_SIZE(v1)) { // v1 is 8 if (VAL_BYTE_SIZE(v2)) n = Compare_Bytes(VAL_BIN_DATA(v1), VAL_BIN_DATA(v2), len, uncase); else n = -Compare_Uni_Byte(VAL_UNI_DATA(v2), VAL_BIN_DATA(v1), len, uncase); } else { // v1 is 16 if (VAL_BYTE_SIZE(v2)) n = Compare_Uni_Byte(VAL_UNI_DATA(v1), VAL_BIN_DATA(v2), len, uncase); else n = Compare_Uni_Str(VAL_UNI_DATA(v1), VAL_UNI_DATA(v2), len, uncase); } if (n != 0) return n; return l1 - l2; }
static int Check_Char_Range(REBVAL *val, REBINT limit) { REBCNT len; if (IS_CHAR(val)) { if (VAL_CHAR(val) > limit) return R_FALSE; return R_TRUE; } if (IS_INTEGER(val)) { if (VAL_INT64(val) > limit) return R_FALSE; return R_TRUE; } len = VAL_LEN_AT(val); if (VAL_BYTE_SIZE(val)) { REBYTE *bp = VAL_BIN_AT(val); if (limit == 0xff) return R_TRUE; // by definition for (; len > 0; len--, bp++) if (*bp > limit) return R_FALSE; } else { REBUNI *up = VAL_UNI_AT(val); for (; len > 0; len--, up++) if (*up > limit) return R_FALSE; } return R_TRUE; }
static void reverse_string(REBVAL *value, REBCNT len) { REBCNT n; REBCNT m; REBUNI c; if (VAL_BYTE_SIZE(value)) { REBYTE *bp = VAL_BIN_DATA(value); for (n = 0, m = len-1; n < len / 2; n++, m--) { c = bp[n]; bp[n] = bp[m]; bp[m] = (REBYTE)c; } } else { REBUNI *up = VAL_UNI_DATA(value); for (n = 0, m = len-1; n < len / 2; n++, m--) { c = up[n]; up[n] = up[m]; up[m] = c; } } }
static REBCNT find_string(REBSER *series, REBCNT index, REBCNT end, REBVAL *target, REBCNT len, REBCNT flags, REBINT skip) { REBCNT start = index; if (flags & (AM_FIND_REVERSE | AM_FIND_LAST)) { skip = -1; start = 0; if (flags & AM_FIND_LAST) index = end - len; else index--; } if (ANY_BINSTR(target)) { // Do the optimal search or the general search? if (BYTE_SIZE(series) && VAL_BYTE_SIZE(target) && !(flags & ~(AM_FIND_CASE|AM_FIND_MATCH))) return Find_Byte_Str(series, start, VAL_BIN_DATA(target), len, !GET_FLAG(flags, ARG_FIND_CASE-1), GET_FLAG(flags, ARG_FIND_MATCH-1)); else return Find_Str_Str(series, start, index, end, skip, VAL_SERIES(target), VAL_INDEX(target), len, flags & (AM_FIND_MATCH|AM_FIND_CASE)); } else if (IS_BINARY(target)) { return Find_Byte_Str(series, start, VAL_BIN_DATA(target), len, 0, GET_FLAG(flags, ARG_FIND_MATCH-1)); } else if (IS_CHAR(target)) { return Find_Str_Char(series, start, index, end, skip, VAL_CHAR(target), flags); } else if (IS_INTEGER(target)) { return Find_Str_Char(series, start, index, end, skip, (REBUNI)VAL_INT32(target), flags); } else if (IS_BITSET(target)) { return Find_Str_Bitset(series, start, index, end, skip, VAL_SERIES(target), flags); } return NOT_FOUND; }
// // Find_Max_Bit: C // // Return integer number for the maximum bit number defined by // the value. Used to determine how much space to allocate. // REBINT Find_Max_Bit(REBVAL *val) { REBINT maxi = 0; REBINT n; switch (VAL_TYPE(val)) { case REB_CHAR: maxi = VAL_CHAR(val)+1; break; case REB_INTEGER: maxi = Int32s(val, 0); break; case REB_STRING: case REB_FILE: case REB_EMAIL: case REB_URL: case REB_TAG: // case REB_ISSUE: n = VAL_INDEX(val); if (VAL_BYTE_SIZE(val)) { REBYTE *bp = VAL_BIN(val); for (; n < cast(REBINT, VAL_LEN_HEAD(val)); n++) if (bp[n] > maxi) maxi = bp[n]; } else { REBUNI *up = VAL_UNI(val); for (; n < cast(REBINT, VAL_LEN_HEAD(val)); n++) if (up[n] > maxi) maxi = up[n]; } maxi++; break; case REB_BINARY: maxi = VAL_LEN_AT(val) * 8 - 1; if (maxi < 0) maxi = 0; break; case REB_BLOCK: for (val = VAL_ARRAY_AT(val); NOT_END(val); val++) { n = Find_Max_Bit(val); if (n > maxi) maxi = n; } //maxi++; break; case REB_NONE: maxi = 0; break; default: return -1; } return maxi; }
*/ static void replace_with(REBSER *ser, REBCNT index, REBCNT tail, REBVAL *with) /* ** Replace whitespace chars that match WITH string. ** ** Resulting string is always smaller than it was to start. ** ***********************************************************************/ { #define MAX_WITH 32 REBCNT wlen; REBUNI with_chars[MAX_WITH]; // chars to be trimmed REBUNI *up = with_chars; REBYTE *bp; REBCNT n; REBUNI uc; // Setup WITH array from arg or the default: n = 0; if (IS_NONE(with)) { bp = "\n \r\t"; wlen = n = 4; } else if (IS_CHAR(with)) { wlen = 1; *up++ = VAL_CHAR(with); } else if (IS_INTEGER(with)) { wlen = 1; *up++ = Int32s(with, 0); } else if (ANY_BINSTR(with)) { n = VAL_LEN(with); if (n >= MAX_WITH) n = MAX_WITH-1; wlen = n; if (VAL_BYTE_SIZE(with)) { bp = VAL_BIN_DATA(with); } else { memcpy(up, VAL_UNI_DATA(with), n * sizeof(REBUNI)); n = 0; } } for (; n > 0; n--) *up++ = (REBUNI)*bp++; // Remove all occurances of chars found in WITH string: for (n = index; index < tail; index++) { uc = GET_ANY_CHAR(ser, index); if (!find_in_uni(with_chars, wlen, uc)) { SET_ANY_CHAR(ser, n, uc); n++; } } SET_ANY_CHAR(ser, n, 0); SERIES_TAIL(ser) = n; }
*/ void Change_Case(REBVAL *out, REBVAL *val, REBVAL *part, REBOOL upper) /* ** Common code for string case handling. ** ***********************************************************************/ { REBCNT len; REBCNT n; *out = *val; if (IS_CHAR(val)) { REBUNI c = VAL_CHAR(val); if (c < UNICODE_CASES) { c = upper ? UP_CASE(c) : LO_CASE(c); } VAL_CHAR(out) = c; return; } // String series: if (IS_PROTECT_SERIES(VAL_SERIES(val))) raise Error_0(RE_PROTECTED); len = Partial(val, 0, part, 0); n = VAL_INDEX(val); len += n; if (VAL_BYTE_SIZE(val)) { REBYTE *bp = VAL_BIN(val); if (upper) for (; n < len; n++) bp[n] = (REBYTE)UP_CASE(bp[n]); else { for (; n < len; n++) bp[n] = (REBYTE)LO_CASE(bp[n]); } } else { REBUNI *up = VAL_UNI(val); if (upper) { for (; n < len; n++) { if (up[n] < UNICODE_CASES) up[n] = UP_CASE(up[n]); } } else { for (; n < len; n++) { if (up[n] < UNICODE_CASES) up[n] = LO_CASE(up[n]); } } } }
// // Check_Bit_Str: C // // If uncased is TRUE, try to match either upper or lower case. // REBOOL Check_Bit_Str(REBSER *bset, const REBVAL *val, REBOOL uncased) { REBCNT n = VAL_INDEX(val); if (VAL_BYTE_SIZE(val)) { REBYTE *bp = VAL_BIN(val); for (; n < VAL_LEN_HEAD(val); n++) if (Check_Bit(bset, bp[n], uncased)) return TRUE; } else { REBUNI *up = VAL_UNI(val); for (; n < VAL_LEN_HEAD(val); n++) if (Check_Bit(bset, up[n], uncased)) return TRUE; } return FALSE; }
*/ REBSER *Encode_UTF8_Value(REBVAL *arg, REBCNT len, REBFLG opts) /* ** Do all the details to encode a string as UTF8. ** No_copy means do not make a copy. ** Result can be a shared buffer! ** ***********************************************************************/ { REBSER * ser; if (VAL_BYTE_SIZE(arg)) { ser = Encode_UTF8_String(VAL_BIN_DATA(arg), len, FALSE, opts); } else { ser = Encode_UTF8_String(VAL_UNI_DATA(arg), len, TRUE, opts); } return ser; }
// // Change_Case: C // // Common code for string case handling. // void Change_Case(REBVAL *out, REBVAL *val, REBVAL *part, REBOOL upper) { REBCNT len; REBCNT n; *out = *val; if (IS_CHAR(val)) { REBUNI c = VAL_CHAR(val); if (c < UNICODE_CASES) { c = upper ? UP_CASE(c) : LO_CASE(c); } VAL_CHAR(out) = c; return; } // String series: FAIL_IF_LOCKED_SERIES(VAL_SERIES(val)); len = Partial(val, 0, part); n = VAL_INDEX(val); len += n; if (VAL_BYTE_SIZE(val)) { REBYTE *bp = VAL_BIN(val); if (upper) for (; n < len; n++) bp[n] = (REBYTE)UP_CASE(bp[n]); else { for (; n < len; n++) bp[n] = (REBYTE)LO_CASE(bp[n]); } } else { REBUNI *up = VAL_UNI(val); if (upper) { for (; n < len; n++) { if (up[n] < UNICODE_CASES) up[n] = UP_CASE(up[n]); } } else { for (; n < len; n++) { if (up[n] < UNICODE_CASES) up[n] = LO_CASE(up[n]); } } } }
*/ REBFLG Check_Bit_Str(REBSER *bset, REBVAL *val, REBFLG uncased) /* ** If uncased is TRUE, try to match either upper or lower case. ** ***********************************************************************/ { REBCNT n = VAL_INDEX(val); if (VAL_BYTE_SIZE(val)) { REBYTE *bp = VAL_BIN(val); for (; n < VAL_TAIL(val); n++) if (Check_Bit(bset, bp[n], uncased)) return TRUE; } else { REBUNI *up = VAL_UNI(val); for (; n < VAL_TAIL(val); n++) if (Check_Bit(bset, up[n], uncased)) return TRUE; } return FALSE; }
*/ REBINT Find_Max_Bit(REBVAL *val) /* ** Return integer number for the maximum bit number defined by ** the value. Used to determine how much space to allocate. ** ***********************************************************************/ { REBINT maxi = 0; REBINT n; switch (VAL_TYPE(val)) { case REB_CHAR: maxi = VAL_CHAR(val)+1; break; case REB_INTEGER: maxi = Int32s(val, 0); break; case REB_STRING: case REB_FILE: case REB_EMAIL: case REB_URL: case REB_TAG: // case REB_ISSUE: n = VAL_INDEX(val); if (VAL_BYTE_SIZE(val)) { REBYTE *bp = VAL_BIN(val); for (; n < (REBINT)VAL_TAIL(val); n++) if (bp[n] > maxi) maxi = bp[n]; } else { REBUNI *up = VAL_UNI(val); for (; n < (REBINT)VAL_TAIL(val); n++) if (up[n] > maxi) maxi = up[n]; } maxi++; break; case REB_BINARY: maxi = VAL_LEN(val) * 8 - 1; if (maxi < 0) maxi = 0; break; case REB_BLOCK: for (val = VAL_BLK_DATA(val); NOT_END(val); val++) { n = Find_Max_Bit(val); if (n > maxi) maxi = n; } //maxi++; break; case REB_NONE: maxi = 0; break; default: return -1; } return maxi; }
// // Temp_Bin_Str_Managed: C // // Determines if UTF8 conversion is needed for a series before it // is used with a byte-oriented function. // // If conversion is needed, a UTF8 series will be created. Otherwise, // the source series is returned as-is. // // Note: This routine should only be used to generate a value used // for temporary purposes, because it has a "surprising variance" // regarding its input. If the value's series can be reused, it is-- // and this depends on an implementation detail of internal encoding // that the user should not be aware of (they need not know if the // internal representation of an ASCII string uses 1, 2, or however // many bytes). But copying vs. non-copying means the resulting // data might or might not have previous values available to step // back into from the originating series! // // !!! Should performance dictate it, the callsites could be // adapted to know whether this produced a new series or not, and // instead of managing a created result they could be responsible // for freeing it if so. // REBSER *Temp_Bin_Str_Managed(const REBVAL *val, REBCNT *index, REBCNT *length) { REBCNT len = (length && *length) ? *length : VAL_LEN_AT(val); REBSER *series; assert(IS_BINARY(val) || ANY_STRING(val)); // !!! This used to check `len == 0` and reuse a zero length string. // However, the zero length string could have the wrong width. We are // expected to be returning a BYTE_SIZE() string, and that confused // things. It's not a good idea to mutate the source string (e.g. // reallocate under a new width) so consider having an EMPTY_BYTE_STRING // like EMPTY_ARRAY which is protected to hand back. // if ( IS_BINARY(val) || ( VAL_BYTE_SIZE(val) && All_Bytes_ASCII(VAL_BIN_AT(val), VAL_LEN_AT(val)) ) ){ // // It's BINARY!, or an ANY-STRING! whose codepoints are all values in // ASCII (0x00 => 0x7F), hence not needing any UTF-8 encoding. // series = VAL_SERIES(val); ASSERT_SERIES_MANAGED(series); if (index) *index = VAL_INDEX(val); if (length) *length = len; } else { // UTF-8 conversion is required, and we manage the result. series = Make_UTF8_From_Any_String(val, len, OPT_ENC_CRLF_MAYBE); MANAGE_SERIES(series); #if !defined(NDEBUG) // // Also, PROTECT the result in the debug build...because since the // caller doesn't know if a new series was created or if the initial // data is being used, they should not be modifying it! (We don't // want to protect the original data, because we wouldn't know when // we were allowed to unlock it...there's no later call in this // model to clean up the series.) { REBVAL protect; Val_Init_String(&protect, series); Protect_Value(&protect, FLAGIT(PROT_SET)); // just a string...not /DEEP...shouldn't need to Unmark() } #endif if (index) *index = 0; if (length) *length = SER_LEN(series); } assert(BYTE_SIZE(series)); return series; }
static REBCNT find_string( REBSER *series, REBCNT index, REBCNT end, REBVAL *target, REBCNT target_len, REBCNT flags, REBINT skip ) { assert(end >= index); if (target_len > end - index) // series not long enough to have target return NOT_FOUND; REBCNT start = index; if (flags & (AM_FIND_REVERSE | AM_FIND_LAST)) { skip = -1; start = 0; if (flags & AM_FIND_LAST) index = end - target_len; else index--; } if (ANY_BINSTR(target)) { // Do the optimal search or the general search? if ( BYTE_SIZE(series) && VAL_BYTE_SIZE(target) && !(flags & ~(AM_FIND_CASE|AM_FIND_MATCH)) ) { return Find_Byte_Str( series, start, VAL_BIN_AT(target), target_len, NOT(GET_FLAG(flags, ARG_FIND_CASE - 1)), GET_FLAG(flags, ARG_FIND_MATCH - 1) ); } else { return Find_Str_Str( series, start, index, end, skip, VAL_SERIES(target), VAL_INDEX(target), target_len, flags & (AM_FIND_MATCH|AM_FIND_CASE) ); } } else if (IS_BINARY(target)) { const REBOOL uncase = FALSE; return Find_Byte_Str( series, start, VAL_BIN_AT(target), target_len, uncase, // "don't treat case insensitively" GET_FLAG(flags, ARG_FIND_MATCH - 1) ); } else if (IS_CHAR(target)) { return Find_Str_Char( VAL_CHAR(target), series, start, index, end, skip, flags ); } else if (IS_INTEGER(target)) { return Find_Str_Char( cast(REBUNI, VAL_INT32(target)), series, start, index, end, skip, flags ); } else if (IS_BITSET(target)) { return Find_Str_Bitset( series, start, index, end, skip, VAL_SERIES(target), flags ); } return NOT_FOUND; }