Ejemplo n.º 1
0
/*
 * Lex a quoted string literal.
 *
 * On entry J->lexchar holds the opening quote character; the same character
 * closes the literal.  Characters are accumulated through textinit() /
 * textpush() / textend(), with backslash sequences delegated to lexescape().
 * The finished text is interned and stored in J->text.
 *
 * A NUL character or a newline before the closing quote is reported with
 * jsY_error(), as is a non-zero result from lexescape().
 * NOTE(review): jsY_error() is presumably non-returning -- confirm.
 *
 * Returns TK_STRING.
 */
static int lexstring(js_State *J)
{
	const char *literal;
	int quote = J->lexchar;

	/* step past the opening quote and start a fresh text buffer */
	jsY_next(J);
	textinit(J);

	for (;;) {
		if (J->lexchar == quote)
			break;

		if (J->lexchar == 0 || J->lexchar == '\n')
			jsY_error(J, "string not terminated");

		if (!jsY_accept(J, '\\')) {
			/* ordinary character: copy it and advance */
			textpush(J, J->lexchar);
			jsY_next(J);
			continue;
		}

		/* the backslash has been consumed; decode what follows it */
		if (lexescape(J))
			jsY_error(J, "malformed escape sequence");
	}

	/* consume the closing quote */
	jsY_expect(J, quote);

	literal = textend(J);
	J->text = js_intern(J, literal);

	return TK_STRING;
}
Ejemplo n.º 2
0
/*
 * Lex a regular expression for the lexical analyzer.
 *
 * `endc' is handed to lexescape() (with its second argument set and its
 * third clear); the value lexescape() returns is deliberately ignored.
 * The contents of the global linebuf are then turned into a node with
 * renode() and stored in yylval.node.
 * NOTE(review): presumably lexescape() leaves the scanned text in linebuf
 * and `endc' is the terminating delimiter -- confirm against lexescape().
 *
 * Returns the token code URE.
 */
static int
lexregexp(wint_t endc)
{
	/* called only for its side effects; the result is not needed here */
	(void) lexescape(endc, 1, 0);

	yylval.node = renode(linebuf);

	return (URE);
}
Ejemplo n.º 3
0
/*
 * Run a wide-character string through lexescape() so that its escape
 * characters are converted as required by 1003.2.  The processed string
 * ends up in the global linebuf[].
 *
 * Side effects visible here: the globals progptr and proglen are pointed
 * at `str' (length includes the terminating \0) for the duration of the
 * call; progptr is reset to NULL afterwards, proglen is left as set.
 * NOTE(review): the earlier comment on this routine warned that 'progfd'
 * (the program file descriptor) is also changed; that is not visible in
 * this function and presumably happens inside lexescape() -- confirm.
 * Either way, use with some care.
 *
 * Presently used to process -v (awk1.c) and var=str type arguments
 * (awk2.c, nextrecord()).
 */
void
strescape(wchar_t *str)
{
	/* Describe the input to the lexer: length first, then the pointer. */
	proglen = wcslen(str) + 1;	/* count the terminating \0 too */
	progptr = str;

	/* Return value is not needed. */
	(void) lexescape('\0', 0, 1);

	/* Detach the lexer from the caller's string again. */
	progptr = NULL;
}
Ejemplo n.º 4
0
/*
 * Lex a string constant for the lexical analyzer.
 *
 * `endc' is the character that terminates the string.  lexescape() does
 * the scanning and its return value is used as the string length; the
 * global linebuf together with that length is passed to stringnode()
 * (with FALLOC) and the resulting node is stored in yylval.node.
 *
 * Returns the token code CONSTANT.
 */
static int
lexstring(wint_t endc)
{
	size_t len;

	len = lexescape(endc, 0, 0);
	yylval.node = stringnode(linebuf, FALLOC, len);

	return (CONSTANT);
}
Ejemplo n.º 5
0
/*
 * next - get next token
 *
 * Scans one lexical token starting at v->now.  How the next character is
 * interpreted depends on the current lexical context v->lexcon (L_ERE,
 * L_BRE, L_Q, L_EBND, L_BBND, L_BRACK, L_CEL, L_ECL, L_CCL) and on the
 * compile flags in v->cflags (REG_ADVF, REG_EXPANDED, REG_EXTENDED,
 * REG_NOSUB, REG_BOSONLY).
 *
 * The token itself is delivered through the RET / RETV / FAILW macros,
 * which are defined elsewhere.
 * NOTE(review): the control flow below (statements placed after these
 * macros that would otherwise be wrong, plus the assert(NOTREACHED)
 * markers) implies that RET, RETV and FAILW return from this function and
 * presumably set v->nexttype / v->nextvalue -- confirm against their
 * definitions.
 *
 * BRE contexts are handed off to brenext(); backslash escapes are handed
 * off to lexescape().  The function calls itself recursively after
 * skipping a (?#...) comment and after lexnest() has been invoked for a
 * class escape.
 *
 * Returns 1 normally and 0 on failure (see the annotation on the return
 * type).  Once ISERR() is true, every call returns 0 immediately.
 */
static int						/* 1 normal, 0 failure */
next(struct vars * v)
{
	chr			c;

	/* errors yield an infinite sequence of failures */
	if (ISERR())
		return 0;				/* the error has set nexttype to EOS */

	/* remember flavor of last token */
	v->lasttype = v->nexttype;

	/* REG_BOSONLY */
	if (v->nexttype == EMPTY && (v->cflags & REG_BOSONLY))
	{
		/* at start of a REG_BOSONLY RE */
		RETV(SBEGIN, 0);		/* same as \A */
	}

	/* if we're nested and we've hit end, return to outer level */
	if (v->savenow != NULL && ATEOS())
	{
		/* restore the saved scan position and clear the saved slots */
		v->now = v->savenow;
		v->stop = v->savestop;
		v->savenow = v->savestop = NULL;
	}

	/* skip white space etc. if appropriate (not in literal or []) */
	if (v->cflags & REG_EXPANDED)
		switch (v->lexcon)
		{
			case L_ERE:
			case L_BRE:
			case L_EBND:
			case L_BBND:
				skip(v);
				break;
		}

	/* handle EOS, depending on context */
	if (ATEOS())
	{
		switch (v->lexcon)
		{
			case L_ERE:
			case L_BRE:
			case L_Q:
				RET(EOS);
				break;
			case L_EBND:
			case L_BBND:
				/* input ended inside a bound */
				FAILW(REG_EBRACE);
				break;
			case L_BRACK:
			case L_CEL:
			case L_ECL:
			case L_CCL:
				/* input ended inside a bracket expression */
				FAILW(REG_EBRACK);
				break;
		}
		assert(NOTREACHED);
	}

	/* okay, time to actually get a character */
	c = *v->now++;

	/* deal with the easy contexts, punt EREs to code below */
	switch (v->lexcon)
	{
		case L_BRE:				/* punt BREs to separate function */
			return brenext(v, c);
			break;
		case L_ERE:				/* see below */
			break;
		case L_Q:				/* literal strings are easy */
			RETV(PLAIN, c);
			break;
		case L_BBND:			/* bounds are fairly simple */
		case L_EBND:
			switch (c)
			{
				case CHR('0'):
				case CHR('1'):
				case CHR('2'):
				case CHR('3'):
				case CHR('4'):
				case CHR('5'):
				case CHR('6'):
				case CHR('7'):
				case CHR('8'):
				case CHR('9'):
					RETV(DIGIT, (chr) DIGITVAL(c));
					break;
				case CHR(','):
					RET(',');
					break;
				case CHR('}'):	/* ERE bound ends with } */
					if (INCON(L_EBND))
					{
						INTOCON(L_ERE);
						/*
						 * With REG_ADVF, a '?' right after the '}' is
						 * consumed and the token value is 0 instead of 1.
						 * NOTE(review): presumably 0/1 distinguishes
						 * non-greedy from greedy -- confirm with the parser.
						 */
						if ((v->cflags & REG_ADVF) && NEXT1('?'))
						{
							v->now++;
							NOTE(REG_UNONPOSIX);
							RETV('}', 0);
						}
						RETV('}', 1);
					}
					else
						FAILW(REG_BADBR);
					break;
				case CHR('\\'):	/* BRE bound ends with \} */
					if (INCON(L_BBND) && NEXT1('}'))
					{
						v->now++;
						INTOCON(L_BRE);
						RET('}');
					}
					else
						FAILW(REG_BADBR);
					break;
				default:
					FAILW(REG_BADBR);
					break;
			}
			assert(NOTREACHED);
			break;
		case L_BRACK:			/* brackets are not too hard */
			switch (c)
			{
				case CHR(']'):
					/* a ']' directly after the opening '[' is a plain char */
					if (LASTTYPE('['))
						RETV(PLAIN, c);
					else
					{
						/* otherwise it closes the bracket expression */
						INTOCON((v->cflags & REG_EXTENDED) ?
								L_ERE : L_BRE);
						RET(']');
					}
					break;
				case CHR('\\'):
					NOTE(REG_UBBS);
					/* without REG_ADVF a backslash in brackets is literal */
					if (!(v->cflags & REG_ADVF))
						RETV(PLAIN, c);
					NOTE(REG_UNONPOSIX);
					if (ATEOS())
						FAILW(REG_EESCAPE);
					(DISCARD) lexescape(v);
					switch (v->nexttype)
					{			/* not all escapes okay here */
						case PLAIN:
							return 1;
							break;
						case CCLASS:
							switch (v->nextvalue)
							{
								case 'd':
									lexnest(v, brbackd, ENDOF(brbackd));
									break;
								case 's':
									lexnest(v, brbacks, ENDOF(brbacks));
									break;
								case 'w':
									lexnest(v, brbackw, ENDOF(brbackw));
									break;
								default:
									FAILW(REG_EESCAPE);
									break;
							}
							/* lexnest done, back up and try again */
							v->nexttype = v->lasttype;
							return next(v);
							break;
					}
					/* not one of the acceptable escapes */
					FAILW(REG_EESCAPE);
					break;
				case CHR('-'):
					/* '-' first in the brackets or just before ']' is plain */
					if (LASTTYPE('[') || NEXT1(']'))
						RETV(PLAIN, c);
					else
						RETV(RANGE, c);
					break;
				case CHR('['):
					if (ATEOS())
						FAILW(REG_EBRACK);
					/* look at the character after '[' to spot [. [= [: */
					switch (*v->now++)
					{
						case CHR('.'):
							INTOCON(L_CEL);
							/* might or might not be locale-specific */
							RET(COLLEL);
							break;
						case CHR('='):
							INTOCON(L_ECL);
							NOTE(REG_ULOCALE);
							RET(ECLASS);
							break;
						case CHR(':'):
							INTOCON(L_CCL);
							NOTE(REG_ULOCALE);
							RET(CCLASS);
							break;
						default:		/* oops */
							/* un-consume the lookahead; '[' itself is plain */
							v->now--;
							RETV(PLAIN, c);
							break;
					}
					assert(NOTREACHED);
					break;
				default:
					RETV(PLAIN, c);
					break;
			}
			assert(NOTREACHED);
			break;
		case L_CEL:				/* collating elements are easy */
			if (c == CHR('.') && NEXT1(']'))
			{
				v->now++;
				INTOCON(L_BRACK);
				RETV(END, '.');
			}
			else
				RETV(PLAIN, c);
			break;
		case L_ECL:				/* ditto equivalence classes */
			if (c == CHR('=') && NEXT1(']'))
			{
				v->now++;
				INTOCON(L_BRACK);
				RETV(END, '=');
			}
			else
				RETV(PLAIN, c);
			break;
		case L_CCL:				/* ditto character classes */
			if (c == CHR(':') && NEXT1(']'))
			{
				v->now++;
				INTOCON(L_BRACK);
				RETV(END, ':');
			}
			else
				RETV(PLAIN, c);
			break;
		default:
			assert(NOTREACHED);
			break;
	}

	/* that got rid of everything except EREs and AREs */
	assert(INCON(L_ERE));

	/* deal with EREs and AREs, except for backslashes */
	switch (c)
	{
		case CHR('|'):
			RET('|');
			break;
		/*
		 * For the three quantifiers below: with REG_ADVF, a directly
		 * following '?' is consumed and the token value is 0 rather than 1.
		 */
		case CHR('*'):
			if ((v->cflags & REG_ADVF) && NEXT1('?'))
			{
				v->now++;
				NOTE(REG_UNONPOSIX);
				RETV('*', 0);
			}
			RETV('*', 1);
			break;
		case CHR('+'):
			if ((v->cflags & REG_ADVF) && NEXT1('?'))
			{
				v->now++;
				NOTE(REG_UNONPOSIX);
				RETV('+', 0);
			}
			RETV('+', 1);
			break;
		case CHR('?'):
			if ((v->cflags & REG_ADVF) && NEXT1('?'))
			{
				v->now++;
				NOTE(REG_UNONPOSIX);
				RETV('?', 0);
			}
			RETV('?', 1);
			break;
		case CHR('{'):			/* bounds start or plain character */
			if (v->cflags & REG_EXPANDED)
				skip(v);
			/* '{' only starts a bound when a digit follows it */
			if (ATEOS() || !iscdigit(*v->now))
			{
				NOTE(REG_UBRACES);
				NOTE(REG_UUNSPEC);
				RETV(PLAIN, c);
			}
			else
			{
				NOTE(REG_UBOUNDS);
				INTOCON(L_EBND);
				RET('{');
			}
			assert(NOTREACHED);
			break;
		case CHR('('):			/* parenthesis, or advanced extension */
			if ((v->cflags & REG_ADVF) && NEXT1('?'))
			{
				NOTE(REG_UNONPOSIX);
				v->now++;
				if (ATEOS())
					FAILW(REG_BADRPT);
				/* dispatch on the character following "(?" */
				switch (*v->now++)
				{
					case CHR(':'):		/* non-capturing paren */
						RETV('(', 0);
						break;
					case CHR('#'):		/* comment */
						/* skip to the closing ')' (or end of input) */
						while (!ATEOS() && *v->now != CHR(')'))
							v->now++;
						if (!ATEOS())
							v->now++;
						assert(v->nexttype == v->lasttype);
						return next(v);
						break;
					case CHR('='):		/* positive lookahead */
						NOTE(REG_ULOOKAROUND);
						RETV(LACON, LATYPE_AHEAD_POS);
						break;
					case CHR('!'):		/* negative lookahead */
						NOTE(REG_ULOOKAROUND);
						RETV(LACON, LATYPE_AHEAD_NEG);
						break;
					case CHR('<'):
						if (ATEOS())
							FAILW(REG_BADRPT);
						switch (*v->now++)
						{
							case CHR('='):		/* positive lookbehind */
								NOTE(REG_ULOOKAROUND);
								RETV(LACON, LATYPE_BEHIND_POS);
								break;
							case CHR('!'):		/* negative lookbehind */
								NOTE(REG_ULOOKAROUND);
								RETV(LACON, LATYPE_BEHIND_NEG);
								break;
							default:
								FAILW(REG_BADRPT);
								break;
						}
						assert(NOTREACHED);
						break;
					default:
						FAILW(REG_BADRPT);
						break;
				}
				assert(NOTREACHED);
			}
			if (v->cflags & REG_NOSUB)
				RETV('(', 0);	/* all parens non-capturing */
			else
				RETV('(', 1);
			break;
		case CHR(')'):
			/* ')' immediately after '(' gets flagged via REG_UUNSPEC */
			if (LASTTYPE('('))
				NOTE(REG_UUNSPEC);
			RETV(')', c);
			break;
		case CHR('['):			/* easy except for [[:<:]] and [[:>:]] */
			/* the leading '[' is already consumed; match the other 6 chars */
			if (HAVE(6) && *(v->now + 0) == CHR('[') &&
				*(v->now + 1) == CHR(':') &&
				(*(v->now + 2) == CHR('<') ||
				 *(v->now + 2) == CHR('>')) &&
				*(v->now + 3) == CHR(':') &&
				*(v->now + 4) == CHR(']') &&
				*(v->now + 5) == CHR(']'))
			{
				c = *(v->now + 2);
				v->now += 6;
				NOTE(REG_UNONPOSIX);
				RET((c == CHR('<')) ? '<' : '>');
			}
			INTOCON(L_BRACK);
			/* a leading '^' is consumed and yields token value 0, else 1 */
			if (NEXT1('^'))
			{
				v->now++;
				RETV('[', 0);
			}
			RETV('[', 1);
			break;
		case CHR('.'):
			RET('.');
			break;
		case CHR('^'):
			RET('^');
			break;
		case CHR('$'):
			RET('$');
			break;
		case CHR('\\'): /* mostly punt backslashes to code below */
			if (ATEOS())
				FAILW(REG_EESCAPE);
			break;
		default:				/* ordinary character */
			RETV(PLAIN, c);
			break;
	}

	/* ERE/ARE backslash handling; backslash already eaten */
	assert(!ATEOS());
	if (!(v->cflags & REG_ADVF))
	{							/* only AREs have non-trivial escapes */
		if (iscalnum(*v->now))
		{
			NOTE(REG_UBSALNUM);
			NOTE(REG_UUNSPEC);
		}
		/* the escaped character is taken literally and consumed */
		RETV(PLAIN, *v->now++);
	}
	(DISCARD) lexescape(v);
	if (ISERR())
		FAILW(REG_EESCAPE);
	if (v->nexttype == CCLASS)
	{							/* fudge at lexical level */
		switch (v->nextvalue)
		{
			case 'd':
				lexnest(v, backd, ENDOF(backd));
				break;
			case 'D':
				lexnest(v, backD, ENDOF(backD));
				break;
			case 's':
				lexnest(v, backs, ENDOF(backs));
				break;
			case 'S':
				lexnest(v, backS, ENDOF(backS));
				break;
			case 'w':
				lexnest(v, backw, ENDOF(backw));
				break;
			case 'W':
				lexnest(v, backW, ENDOF(backW));
				break;
			default:
				assert(NOTREACHED);
				FAILW(REG_ASSERT);
				break;
		}
		/* lexnest done, back up and try again */
		v->nexttype = v->lasttype;
		return next(v);
	}
	/* otherwise, lexescape has already done the work */
	return !ISERR();
}