static int lexstring(js_State *J) { const char *s; int q = J->lexchar; jsY_next(J); textinit(J); while (J->lexchar != q) { if (J->lexchar == 0 || J->lexchar == '\n') jsY_error(J, "string not terminated"); if (jsY_accept(J, '\\')) { if (lexescape(J)) jsY_error(J, "malformed escape sequence"); } else { textpush(J, J->lexchar); jsY_next(J); } } jsY_expect(J, q); s = textend(J); J->text = js_intern(J, s); return TK_STRING; }
/* * Read a regular expression. */ static int lexregexp(wint_t endc) { (void) lexescape(endc, 1, 0); yylval.node = renode(linebuf); return (URE); }
/* * Process a string, converting the escape characters as required by * 1003.2. The processed string ends up in the global linebuf[]. This * routine also changes the value of 'progfd' - the program file * descriptor, so it should be used with some care. It is presently used to * process -v (awk1.c) and var=str type arguments (awk2.c, nextrecord()). */ void strescape(wchar_t *str) { progptr = str; proglen = wcslen(str) + 1; /* Include \0 */ (void) lexescape('\0', 0, 1); progptr = NULL; }
/* * Read a string for the lexical analyzer. * `endc' terminates the string. */ static int lexstring(wint_t endc) { size_t length = lexescape(endc, 0, 0); yylval.node = stringnode(linebuf, FALLOC, length); return (CONSTANT); }
/* * next - get next token */ static int /* 1 normal, 0 failure */ next(struct vars * v) { chr c; /* errors yield an infinite sequence of failures */ if (ISERR()) return 0; /* the error has set nexttype to EOS */ /* remember flavor of last token */ v->lasttype = v->nexttype; /* REG_BOSONLY */ if (v->nexttype == EMPTY && (v->cflags & REG_BOSONLY)) { /* at start of a REG_BOSONLY RE */ RETV(SBEGIN, 0); /* same as \A */ } /* if we're nested and we've hit end, return to outer level */ if (v->savenow != NULL && ATEOS()) { v->now = v->savenow; v->stop = v->savestop; v->savenow = v->savestop = NULL; } /* skip white space etc. if appropriate (not in literal or []) */ if (v->cflags & REG_EXPANDED) switch (v->lexcon) { case L_ERE: case L_BRE: case L_EBND: case L_BBND: skip(v); break; } /* handle EOS, depending on context */ if (ATEOS()) { switch (v->lexcon) { case L_ERE: case L_BRE: case L_Q: RET(EOS); break; case L_EBND: case L_BBND: FAILW(REG_EBRACE); break; case L_BRACK: case L_CEL: case L_ECL: case L_CCL: FAILW(REG_EBRACK); break; } assert(NOTREACHED); } /* okay, time to actually get a character */ c = *v->now++; /* deal with the easy contexts, punt EREs to code below */ switch (v->lexcon) { case L_BRE: /* punt BREs to separate function */ return brenext(v, c); break; case L_ERE: /* see below */ break; case L_Q: /* literal strings are easy */ RETV(PLAIN, c); break; case L_BBND: /* bounds are fairly simple */ case L_EBND: switch (c) { case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'): case CHR('4'): case CHR('5'): case CHR('6'): case CHR('7'): case CHR('8'): case CHR('9'): RETV(DIGIT, (chr) DIGITVAL(c)); break; case CHR(','): RET(','); break; case CHR('}'): /* ERE bound ends with } */ if (INCON(L_EBND)) { INTOCON(L_ERE); if ((v->cflags & REG_ADVF) && NEXT1('?')) { v->now++; NOTE(REG_UNONPOSIX); RETV('}', 0); } RETV('}', 1); } else FAILW(REG_BADBR); break; case CHR('\\'): /* BRE bound ends with \} */ if (INCON(L_BBND) && NEXT1('}')) { v->now++; INTOCON(L_BRE); RET('}'); } else FAILW(REG_BADBR); break; default: FAILW(REG_BADBR); break; } assert(NOTREACHED); break; case L_BRACK: /* brackets are not too hard */ switch (c) { case CHR(']'): if (LASTTYPE('[')) RETV(PLAIN, c); else { INTOCON((v->cflags & REG_EXTENDED) ? L_ERE : L_BRE); RET(']'); } break; case CHR('\\'): NOTE(REG_UBBS); if (!(v->cflags & REG_ADVF)) RETV(PLAIN, c); NOTE(REG_UNONPOSIX); if (ATEOS()) FAILW(REG_EESCAPE); (DISCARD) lexescape(v); switch (v->nexttype) { /* not all escapes okay here */ case PLAIN: return 1; break; case CCLASS: switch (v->nextvalue) { case 'd': lexnest(v, brbackd, ENDOF(brbackd)); break; case 's': lexnest(v, brbacks, ENDOF(brbacks)); break; case 'w': lexnest(v, brbackw, ENDOF(brbackw)); break; default: FAILW(REG_EESCAPE); break; } /* lexnest done, back up and try again */ v->nexttype = v->lasttype; return next(v); break; } /* not one of the acceptable escapes */ FAILW(REG_EESCAPE); break; case CHR('-'): if (LASTTYPE('[') || NEXT1(']')) RETV(PLAIN, c); else RETV(RANGE, c); break; case CHR('['): if (ATEOS()) FAILW(REG_EBRACK); switch (*v->now++) { case CHR('.'): INTOCON(L_CEL); /* might or might not be locale-specific */ RET(COLLEL); break; case CHR('='): INTOCON(L_ECL); NOTE(REG_ULOCALE); RET(ECLASS); break; case CHR(':'): INTOCON(L_CCL); NOTE(REG_ULOCALE); RET(CCLASS); break; default: /* oops */ v->now--; RETV(PLAIN, c); break; } assert(NOTREACHED); break; default: RETV(PLAIN, c); break; } assert(NOTREACHED); break; case L_CEL: /* collating elements are easy */ if (c == CHR('.') && NEXT1(']')) { v->now++; INTOCON(L_BRACK); RETV(END, '.'); } else RETV(PLAIN, c); break; case L_ECL: /* ditto equivalence classes */ if (c == CHR('=') && NEXT1(']')) { v->now++; INTOCON(L_BRACK); RETV(END, '='); } else RETV(PLAIN, c); break; case L_CCL: /* ditto character classes */ if (c == CHR(':') && NEXT1(']')) { v->now++; INTOCON(L_BRACK); RETV(END, ':'); } else RETV(PLAIN, c); break; default: assert(NOTREACHED); break; } /* that got rid of everything except EREs and AREs */ assert(INCON(L_ERE)); /* deal with EREs and AREs, except for backslashes */ switch (c) { case CHR('|'): RET('|'); break; case CHR('*'): if ((v->cflags & REG_ADVF) && NEXT1('?')) { v->now++; NOTE(REG_UNONPOSIX); RETV('*', 0); } RETV('*', 1); break; case CHR('+'): if ((v->cflags & REG_ADVF) && NEXT1('?')) { v->now++; NOTE(REG_UNONPOSIX); RETV('+', 0); } RETV('+', 1); break; case CHR('?'): if ((v->cflags & REG_ADVF) && NEXT1('?')) { v->now++; NOTE(REG_UNONPOSIX); RETV('?', 0); } RETV('?', 1); break; case CHR('{'): /* bounds start or plain character */ if (v->cflags & REG_EXPANDED) skip(v); if (ATEOS() || !iscdigit(*v->now)) { NOTE(REG_UBRACES); NOTE(REG_UUNSPEC); RETV(PLAIN, c); } else { NOTE(REG_UBOUNDS); INTOCON(L_EBND); RET('{'); } assert(NOTREACHED); break; case CHR('('): /* parenthesis, or advanced extension */ if ((v->cflags & REG_ADVF) && NEXT1('?')) { NOTE(REG_UNONPOSIX); v->now++; if (ATEOS()) FAILW(REG_BADRPT); switch (*v->now++) { case CHR(':'): /* non-capturing paren */ RETV('(', 0); break; case CHR('#'): /* comment */ while (!ATEOS() && *v->now != CHR(')')) v->now++; if (!ATEOS()) v->now++; assert(v->nexttype == v->lasttype); return next(v); break; case CHR('='): /* positive lookahead */ NOTE(REG_ULOOKAROUND); RETV(LACON, LATYPE_AHEAD_POS); break; case CHR('!'): /* negative lookahead */ NOTE(REG_ULOOKAROUND); RETV(LACON, LATYPE_AHEAD_NEG); break; case CHR('<'): if (ATEOS()) FAILW(REG_BADRPT); switch (*v->now++) { case CHR('='): /* positive lookbehind */ NOTE(REG_ULOOKAROUND); RETV(LACON, LATYPE_BEHIND_POS); break; case CHR('!'): /* negative lookbehind */ NOTE(REG_ULOOKAROUND); RETV(LACON, LATYPE_BEHIND_NEG); break; default: FAILW(REG_BADRPT); break; } assert(NOTREACHED); break; default: FAILW(REG_BADRPT); break; } assert(NOTREACHED); } if (v->cflags & REG_NOSUB) RETV('(', 0); /* all parens non-capturing */ else RETV('(', 1); break; case CHR(')'): if (LASTTYPE('(')) NOTE(REG_UUNSPEC); RETV(')', c); break; case CHR('['): /* easy except for [[:<:]] and [[:>:]] */ if (HAVE(6) && *(v->now + 0) == CHR('[') && *(v->now + 1) == CHR(':') && (*(v->now + 2) == CHR('<') || *(v->now + 2) == CHR('>')) && *(v->now + 3) == CHR(':') && *(v->now + 4) == CHR(']') && *(v->now + 5) == CHR(']')) { c = *(v->now + 2); v->now += 6; NOTE(REG_UNONPOSIX); RET((c == CHR('<')) ? '<' : '>'); } INTOCON(L_BRACK); if (NEXT1('^')) { v->now++; RETV('[', 0); } RETV('[', 1); break; case CHR('.'): RET('.'); break; case CHR('^'): RET('^'); break; case CHR('$'): RET('$'); break; case CHR('\\'): /* mostly punt backslashes to code below */ if (ATEOS()) FAILW(REG_EESCAPE); break; default: /* ordinary character */ RETV(PLAIN, c); break; } /* ERE/ARE backslash handling; backslash already eaten */ assert(!ATEOS()); if (!(v->cflags & REG_ADVF)) { /* only AREs have non-trivial escapes */ if (iscalnum(*v->now)) { NOTE(REG_UBSALNUM); NOTE(REG_UUNSPEC); } RETV(PLAIN, *v->now++); } (DISCARD) lexescape(v); if (ISERR()) FAILW(REG_EESCAPE); if (v->nexttype == CCLASS) { /* fudge at lexical level */ switch (v->nextvalue) { case 'd': lexnest(v, backd, ENDOF(backd)); break; case 'D': lexnest(v, backD, ENDOF(backD)); break; case 's': lexnest(v, backs, ENDOF(backs)); break; case 'S': lexnest(v, backS, ENDOF(backS)); break; case 'w': lexnest(v, backw, ENDOF(backw)); break; case 'W': lexnest(v, backW, ENDOF(backW)); break; default: assert(NOTREACHED); FAILW(REG_ASSERT); break; } /* lexnest done, back up and try again */ v->nexttype = v->lasttype; return next(v); } /* otherwise, lexescape has already done the work */ return !ISERR(); }