*/ int Decode_UTF8(REBUNI *dst, const REBYTE *src, REBCNT len, REBFLG ccr) /* ** Decode UTF8 byte string into a 16 bit preallocated array. ** ** dst: the desination array, must always be large enough! ** src: source binary data ** len: byte-length of source (not number of chars) ** ccr: convert CRLF/CR to LF ** ** Returns length in chars (negative if all chars are latin-1). ** No terminator is added. ** ***********************************************************************/ { int flag = -1; UTF32 ch; REBUNI *start = dst; for (; len > 0; len--, src++) { if ((ch = *src) >= 0x80) { ch = Decode_UTF8_Char(&src, &len); if (ch == 0) ch = UNI_REPLACEMENT_CHAR; // temporary! if (ch > 0xff) flag = 1; } if (ch == CR && ccr) { if (src[1] == LF) continue; ch = LF; } *dst++ = (REBUNI)ch; } return (dst - start) * flag; }
*/ REBINT Compare_UTF8(REBYTE *s1, REBYTE *s2, REBCNT l2) /* ** Compare two UTF8 strings. ** ** It is necessary to decode the strings to check if the match ** case-insensitively. ** ** Returns: ** -3: no match, s2 > s1 ** -1: no match, s1 > s2 ** 0: exact match ** 1: non-case match, s2 > s1 ** 3: non-case match, s1 > s2 ** ** So, result + 2 for no-match gives proper sort order. ** And, result - 2 for non-case match gives sort order. ** ** Used for: WORD comparison. ** ***********************************************************************/ { REBINT c1, c2; REBCNT l1 = LEN_BYTES(s1); REBINT result = 0; for (; l1 > 0 && l2 > 0; s1++, s2++, l1--, l2--) { c1 = (REBYTE)*s1; c2 = (REBYTE)*s2; if (c1 > 127) c1 = Decode_UTF8_Char(&s1, &l1); //!!! can return 0 on error! if (c2 > 127) c2 = Decode_UTF8_Char(&s2, &l2); if (c1 != c2) { if (c1 >= UNICODE_CASES || c2 >= UNICODE_CASES || LO_CASE(c1) != LO_CASE(c2)) { return (c1 > c2) ? -1 : -3; } if (!result) result = (c1 > c2) ? 3 : 1; } } if (l1 != l2) result = (l1 > l2) ? -1 : -3; return result; }
*/ REBYTE *Scan_Item(REBYTE *src, REBYTE *end, REBUNI term, REBYTE *invalid) /* ** Scan as UTF8 an item like a file or URL. ** ** Returns continuation point or zero for error. ** ** Put result into the MOLD_BUF as uni-chars. ** ***********************************************************************/ { REBUNI c; REBSER *buf; buf = BUF_MOLD; RESET_TAIL(buf); while (src < end && *src != term) { c = *src; // End of stream? if (c == 0) break; // If no term, then any white will terminate: if (!term && IS_WHITE(c)) break; // Ctrl chars are invalid: if (c < ' ') return 0; // invalid char if (c == '\\') c = '/'; // Accept %xx encoded char: else if (c == '%') { if (!Scan_Hex2(src+1, &c, FALSE)) return 0; src += 2; } // Accept ^X encoded char: else if (c == '^') { if (src+1 == end) return 0; // nothing follows ^ c = Scan_Char(&src); if (!term && IS_WHITE(c)) break; src--; } // Accept UTF8 encoded char: else if (c >= 0x80) { c = Decode_UTF8_Char(&src, 0); // zero on error if (c == 0) return 0; } // Is char as literal valid? (e.g. () [] etc.) else if (invalid && strchr(invalid, c)) return 0; src++; *UNI_SKIP(buf, buf->tail) = c; // not affected by Extend_Series if (++(buf->tail) >= SERIES_REST(buf)) Extend_Series(buf, 1); } if (*src && *src == term) src++; UNI_TERM(buf); return src; }
*/ REBYTE *Scan_Quote(REBYTE *src, SCAN_STATE *scan_state) /* ** Scan a quoted string, handling all the escape characters. ** ** The result will be put into the temporary MOLD_BUF unistring. ** ***********************************************************************/ { REBINT nest = 0; REBUNI term; REBINT chr; REBCNT lines = 0; REBSER *buf = BUF_MOLD; RESET_TAIL(buf); term = (*src++ == '{') ? '}' : '"'; // pick termination while (*src != term || nest > 0) { chr = *src; switch (chr) { case 0: return 0; // Scan_state shows error location. case '^': chr = Scan_Char(&src); if (chr == -1) return 0; src--; break; case '{': if (term != '"') nest++; break; case '}': if (term != '"' && nest > 0) nest--; break; case CR: if (src[1] == LF) src++; // fall thru case LF: if (term == '"') return 0; lines++; chr = LF; break; default: if (chr >= 0x80) { chr = Decode_UTF8_Char(&src, 0); // zero on error if (chr == 0) return 0; } } src++; *UNI_SKIP(buf, buf->tail) = chr; if (++(buf->tail) >= SERIES_REST(buf)) Extend_Series(buf, 1); } src++; // Skip ending quote or brace. if (scan_state) scan_state->line_count += lines; UNI_TERM(buf); return src; }
*/ static REBINT Scan_Char(REBYTE **bp) /* ** Scan a char, handling ^A, ^/, ^(null), ^(1234) ** ** Returns the numeric value for char, or -1 for errors. ** ** Advances the cp to just past the last position. ** ** test: to-integer load to-binary mold to-char 1234 ** ***********************************************************************/ { REBINT n; REBYTE *cp; REBYTE c; REBYTE lex; c = **bp; // Handle unicoded char: if (c >= 0x80) { n = Decode_UTF8_Char(bp, 0); // zero on error (*bp)++; // skip char return n; } (*bp)++; if (c != '^') return c; // Must be ^ escaped char: c = **bp; (*bp)++; switch (c) { case 0: n = 0; break; case '/': n = LF; break; case '^': n = c; break; case '-': n = TAB; break; case '!': n = '\036'; // record separator break; case '(': // ^(tab) ^(1234) // Check for hex integers ^(1234): cp = *bp; // restart location n = 0; while ((lex = Lex_Map[*cp]) > LEX_WORD) { c = lex & LEX_VALUE; if (!c && lex < LEX_NUMBER) break; n = (n << 4) + c; cp++; } if ((cp - *bp) > 4) return -1; if (*cp == ')') { cp++; *bp = cp; return n; } // Check for identifiers: for (n = 0; n < ESC_MAX; n++) { if (NZ(cp = Match_Bytes(*bp, (REBYTE*)(Esc_Names[n])))) { if (cp && *cp == ')') { *bp = cp + 1; return Esc_Codes[n]; } } } return -1; default: n = UP_CASE(c); if (n >= '@' && n <= '_') n -= '@'; else if (n == '~') n = 0x7f; // special for DEL else n = c; // includes: ^{ ^} ^" } return n; }