Ejemplo n.º 1
0
Archivo: s-unicode.c Proyecto: Oldes/r3
*/	int Decode_UTF8(REBUNI *dst, const REBYTE *src, REBCNT len, REBFLG ccr)
/*
**		Decode UTF8 byte string into a 16 bit preallocated array.
**
**		dst: the desination array, must always be large enough!
**		src: source binary data
**		len: byte-length of source (not number of chars)
**		ccr: convert CRLF/CR to LF
**
**		Returns length in chars (negative if all chars are latin-1).
**		No terminator is added.
**
***********************************************************************/
{
	int flag = -1;
	UTF32 ch;
	REBUNI *start = dst;

	for (; len > 0; len--, src++) {
		if ((ch = *src) >= 0x80) {
			ch = Decode_UTF8_Char(&src, &len);
			if (ch == 0) ch = UNI_REPLACEMENT_CHAR; // temporary!
			if (ch > 0xff) flag = 1;
		} if (ch == CR && ccr) {
			if (src[1] == LF) continue;
			ch = LF;
		}
		*dst++ = (REBUNI)ch;
	}

	return (dst - start) * flag;
}
Ejemplo n.º 2
0
*/	REBINT Compare_UTF8(REBYTE *s1, REBYTE *s2, REBCNT l2)
/*
**		Compare two UTF8 strings.
**
**		It is necessary to decode the strings to check if the match
**		case-insensitively.
**
**		Returns:
**			-3: no match, s2 > s1
**			-1: no match, s1 > s2
**			 0: exact match
**			 1: non-case match, s2 > s1
**			 3: non-case match, s1 > s2
**
**		So, result + 2 for no-match gives proper sort order.
**		And, result - 2 for non-case match gives sort order.
**
**		Used for: WORD comparison.
**
***********************************************************************/
{
	REBINT c1, c2;
	REBCNT l1 = LEN_BYTES(s1);
	REBINT result = 0;

	for (; l1 > 0 && l2 > 0; s1++, s2++, l1--, l2--) {
		c1 = (REBYTE)*s1;
		c2 = (REBYTE)*s2;
		if (c1 > 127) c1 = Decode_UTF8_Char(&s1, &l1); //!!! can return 0 on error!
		if (c2 > 127) c2 = Decode_UTF8_Char(&s2, &l2);
		if (c1 != c2) {
			if (c1 >= UNICODE_CASES || c2 >= UNICODE_CASES ||
				LO_CASE(c1) != LO_CASE(c2)) {
				return (c1 > c2) ? -1 : -3;
			}
			if (!result) result = (c1 > c2) ? 3 : 1;
		}
	}
	if (l1 != l2) result = (l1 > l2) ? -1 : -3;

	return result;
}
Ejemplo n.º 3
0
*/  REBYTE *Scan_Item(REBYTE *src, REBYTE *end, REBUNI term, REBYTE *invalid)
/*
**      Scan as UTF8 an item like a file or URL.
**
**		Returns continuation point or zero for error.
**
**		Put result into the MOLD_BUF as uni-chars.
**
***********************************************************************/
{
	REBUNI c;
	REBSER *buf;

	buf = BUF_MOLD;
	RESET_TAIL(buf);

	while (src < end && *src != term) {

		c = *src;

		// End of stream?
		if (c == 0) break;

		// If no term, then any white will terminate:
		if (!term && IS_WHITE(c)) break;

		// Ctrl chars are invalid:
		if (c < ' ') return 0;	// invalid char

		if (c == '\\') c = '/';

		// Accept %xx encoded char:
		else if (c == '%') {
			if (!Scan_Hex2(src+1, &c, FALSE)) return 0;
			src += 2;
		}

		// Accept ^X encoded char:
		else if (c == '^') {
			if (src+1 == end) return 0; // nothing follows ^
			c = Scan_Char(&src);
			if (!term && IS_WHITE(c)) break;
			src--;
		}

		// Accept UTF8 encoded char:
		else if (c >= 0x80) {
			c = Decode_UTF8_Char(&src, 0); // zero on error
			if (c == 0) return 0;
		}

		// Is char as literal valid? (e.g. () [] etc.)
		else if (invalid && strchr(invalid, c)) return 0;

		src++;

		*UNI_SKIP(buf, buf->tail) = c; // not affected by Extend_Series

		if (++(buf->tail) >= SERIES_REST(buf)) Extend_Series(buf, 1);
    }

	if (*src && *src == term) src++;

	UNI_TERM(buf);

	return src;
}
Ejemplo n.º 4
0
*/  REBYTE *Scan_Quote(REBYTE *src, SCAN_STATE *scan_state)
/*
**      Scan a quoted string, handling all the escape characters.
**
**		The result will be put into the temporary MOLD_BUF unistring.
**
***********************************************************************/
{
    REBINT nest = 0;
	REBUNI term;
	REBINT chr;
	REBCNT lines = 0;
	REBSER *buf = BUF_MOLD;

	RESET_TAIL(buf);

	term = (*src++ == '{') ? '}' : '"';	// pick termination

	while (*src != term || nest > 0) {

		chr = *src;

        switch (chr) {

		case 0:
			return 0; // Scan_state shows error location.
        
		case '^':
			chr = Scan_Char(&src);
			if (chr == -1) return 0;
			src--;
            break;

		case '{':
			if (term != '"') nest++;
			break;

		case '}':
			if (term != '"' && nest > 0) nest--;
			break;

		case CR:
			if (src[1] == LF) src++;
			// fall thru
        case LF:
			if (term == '"') return 0;
			lines++;
			chr = LF;
			break;

		default:
			if (chr >= 0x80) {
				chr = Decode_UTF8_Char(&src, 0); // zero on error
				if (chr == 0) return 0;
			}
		}

		src++;

		*UNI_SKIP(buf, buf->tail) = chr;

		if (++(buf->tail) >= SERIES_REST(buf)) Extend_Series(buf, 1);
    }

	src++; // Skip ending quote or brace.

	if (scan_state) scan_state->line_count += lines;

	UNI_TERM(buf);

	return src;
}
Ejemplo n.º 5
0
*/  static REBINT Scan_Char(REBYTE **bp)
/*
**      Scan a char, handling ^A, ^/, ^(null), ^(1234)
**
**		Returns the numeric value for char, or -1 for errors.
**
**		Advances the cp to just past the last position.
**
**		test: to-integer load to-binary mold to-char 1234
**
***********************************************************************/
{
	REBINT n;
	REBYTE *cp;
	REBYTE c;
	REBYTE lex;

	c = **bp;

	// Handle unicoded char:
	if (c >= 0x80) {
		n = Decode_UTF8_Char(bp, 0); // zero on error
		(*bp)++; // skip char
		return n;
	}

	(*bp)++;

	if (c != '^') return c;

	// Must be ^ escaped char:
	c = **bp;
	(*bp)++;

    switch (c) {

	case 0:
		n = 0;
		break;

	case '/':
		n = LF;
		break;

	case '^':
		n = c;
		break;

    case '-':
		n = TAB;
		break;

	case '!':
		n = '\036'; // record separator
		break;

	case '(':	// ^(tab) ^(1234)
		// Check for hex integers ^(1234):
		cp = *bp; // restart location
		n = 0;
		while ((lex = Lex_Map[*cp]) > LEX_WORD) {
			c = lex & LEX_VALUE;
			if (!c && lex < LEX_NUMBER) break;
			n = (n << 4) + c;
			cp++;
		}
		if ((cp - *bp) > 4) return -1;
		if (*cp == ')') {
			cp++;
			*bp = cp;
			return n;
		}

		// Check for identifiers:
		for (n = 0; n < ESC_MAX; n++) {
			if (NZ(cp = Match_Bytes(*bp, (REBYTE*)(Esc_Names[n])))) {
				if (cp && *cp == ')') {
					*bp = cp + 1;
					return Esc_Codes[n];
				}
			}
		}
		return -1;

    default:
		n = UP_CASE(c);
		if (n >= '@' && n <= '_') n -= '@';
		else if (n == '~') n = 0x7f; // special for DEL
		else n = c;  // includes: ^{ ^} ^"
    }

	return n;
}