/*
 * Compile the regular expression `re` into the bytecode buffer of `prog`.
 *
 * prog->len, prog->bytelen and prog->sub are reset first.  The emitted
 * program is, in order:
 *   - a 5-byte prefix (RSplit 3; Any; Jmp -5) implementing non-anchored
 *     operation ("search"),
 *   - Save 0,
 *   - the code generated by _compilecode() for the pattern itself,
 *   - Save 1,
 *   - Match.
 *
 * Returns 0 on success.  Returns 1 when _compilecode() reports failure
 * (null result) or stops before the end of the pattern, which happens on
 * an unmatched ')'.  In that case the contents of `prog` are incomplete
 * and must not be executed.
 */
int re1_5_compilecode(ByteProg *prog, const char *re)
{
    prog->len = 0;
    prog->bytelen = 0;
    prog->sub = 0;

    // Add code to implement non-anchored operation ("search"),
    // for anchored operation ("match"), this code will be just skipped.
    // TODO: Implement search in much more efficient manner
    prog->insts[prog->bytelen++] = RSplit;
    prog->insts[prog->bytelen++] = 3;
    prog->insts[prog->bytelen++] = Any;
    prog->insts[prog->bytelen++] = Jmp;
    prog->insts[prog->bytelen++] = -5;
    prog->len += 3;

    prog->insts[prog->bytelen++] = Save;
    prog->insts[prog->bytelen++] = 0;
    prog->len++;

    // _compilecode() hands back the position where it stopped parsing.
    // Anything other than the terminating NUL means the pattern was not
    // fully compiled, so do not append the epilogue and report the error.
    re = _compilecode(re, prog);
    if (!re || *re) {
        return 1;
    }

    prog->insts[prog->bytelen++] = Save;
    prog->insts[prog->bytelen++] = 1;
    prog->len++;

    prog->insts[prog->bytelen++] = Match;
    prog->len++;

    return 0;
}
/*
 * Compile one nesting level of a regular expression into bytecode.
 *
 * Parsing starts at *re_loc and stops at the terminating NUL or at a ')'
 * (which is left unconsumed for the caller).  Instructions are written
 * through the EMIT/INSERT_CODE macros at offsets tracked by PC; these
 * macros, and REL, are defined elsewhere in the file.
 *
 * re_loc   - in: start of the (sub)pattern; out: position where parsing
 *            stopped, or the position of the offending character on error.
 * prog     - program being built; prog->len and prog->sub are updated.
 * sizecode - when non-zero, `code` is NULL (presumably so that EMIT only
 *            advances offsets without storing; confirm against the macro).
 *
 * Returns RE1_5_SUCCESS, or RE1_5_SYNTAX_ERROR, RE1_5_UNSUPPORTED_ESCAPE
 * or RE1_5_UNSUPPORTED_SYNTAX (callers test the result with `< 0`).
 */
static int _compilecode(const char **re_loc, ByteProg *prog, int sizecode)
{
    const char *re = *re_loc;
    char *code = sizecode ? NULL : prog->insts;
    int start = PC;     // first byte of this nesting level, target for '|'
    int term = PC;      // first byte of the most recent repeatable term
    int alt_label = 0;  // offset of the pending Jmp operand of the last '|'

    for (; *re && *re != ')'; re++) {
        switch (*re) {
        case '\\': {
            re++;
            if (!*re) goto syntax_error; // Trailing backslash
            char c = *re | 0x20;         // fold ASCII letters to lower case
            if (c == 'd' || c == 's' || c == 'w') {
                term = PC;
                EMIT(PC++, NamedClass);
                EMIT(PC++, *re);
                prog->len++;
                break;
            }
            // Digits are tested on the raw byte: OR-ing 0x20 into control
            // bytes 0x10..0x19 would otherwise make them look like digits.
            if ((*re >= '0' && *re <= '9') || (c >= 'a' && c <= 'z')) {
                goto unsupported_escape;
            }
        }
        /* fall through: any other escaped character is a literal */
        default:
            term = PC;
            EMIT(PC++, Char);
            EMIT(PC++, *re);
            prog->len++;
            break;
        case '.':
            term = PC;
            EMIT(PC++, Any);
            prog->len++;
            break;
        case '[': {
            int cnt;
            term = PC;
            re++;
            if (*re == '^') {
                EMIT(PC++, ClassNot);
                re++;
            } else {
                EMIT(PC++, Class);
            }
            PC++; // Skip "# of pairs" byte
            prog->len++;
            for (cnt = 0; *re != ']'; re++, cnt++) {
                if (!*re) goto syntax_error;
                if (*re == '\\') {
                    re++;
                    if (!*re) goto syntax_error;
                    if (*re != '\\' && *re != ']') goto unsupported_escape;
                }
                EMIT(PC++, *re);
                if (re[1] == '-' && re[2] != ']') {
                    re += 2;
                    // "[a-" with nothing after the dash: stop here, otherwise
                    // the loop increment would step past the terminating NUL.
                    if (!*re) goto syntax_error;
                }
                EMIT(PC++, *re);
            }
            EMIT(term + 1, cnt); // back-patch the pair count
            break;
        }
        case '(': {
            term = PC;
            int sub = 0;
            int capture = 1;
            re++;
            if (*re == '?') {
                re++;
                if (*re == ':') {
                    capture = 0;
                    re++;
                } else {
                    *re_loc = re;
                    return RE1_5_UNSUPPORTED_SYNTAX;
                }
            }

            if (capture) {
                sub = ++prog->sub;
                EMIT(PC++, Save);
                EMIT(PC++, 2 * sub);
                prog->len++;
            }

            int res = _compilecode(&re, prog, sizecode);
            *re_loc = re;
            if (res < 0) return res;
            if (*re != ')') return RE1_5_SYNTAX_ERROR;

            if (capture) {
                EMIT(PC++, Save);
                EMIT(PC++, 2 * sub + 1);
                prog->len++;
            }

            break;
        }
        case '{':
            *re_loc = re;
            return RE1_5_UNSUPPORTED_SYNTAX;
        case '?':
            if (PC == term) goto syntax_error; // nothing to repeat
            INSERT_CODE(term, 2, PC);
            if (re[1] == '?') {
                EMIT(term, RSplit);
                re++;
            } else {
                EMIT(term, Split);
            }
            EMIT(term + 1, REL(term, PC));
            prog->len++;
            term = PC;
            break;
        case '*':
            if (PC == term) goto syntax_error; // nothing to repeat
            INSERT_CODE(term, 2, PC);
            EMIT(PC, Jmp);
            EMIT(PC + 1, REL(PC, term));
            PC += 2;
            if (re[1] == '?') {
                EMIT(term, RSplit);
                re++;
            } else {
                EMIT(term, Split);
            }
            EMIT(term + 1, REL(term, PC));
            prog->len += 2;
            term = PC;
            break;
        case '+':
            if (PC == term) goto syntax_error; // nothing to repeat
            if (re[1] == '?') {
                EMIT(PC, Split);
                re++;
            } else {
                EMIT(PC, RSplit);
            }
            EMIT(PC + 1, REL(PC, term));
            PC += 2;
            prog->len++;
            term = PC;
            break;
        case '|':
            // Resolve the Jmp of the previous alternative before opening
            // a new one.
            if (alt_label) {
                EMIT(alt_label, REL(alt_label, PC) + 1);
            }
            INSERT_CODE(start, 2, PC);
            EMIT(PC++, Jmp);
            alt_label = PC++;
            EMIT(start, Split);
            EMIT(start + 1, REL(start, PC));
            prog->len += 2;
            term = PC;
            break;
        case '^':
            EMIT(PC++, Bol);
            prog->len++;
            term = PC;
            break;
        case '$':
            EMIT(PC++, Eol);
            prog->len++;
            term = PC;
            break;
        }
    }

    if (alt_label) {
        EMIT(alt_label, REL(alt_label, PC) + 1);
    }

    *re_loc = re;
    return RE1_5_SUCCESS;

syntax_error:
    *re_loc = re;
    return RE1_5_SYNTAX_ERROR;

unsupported_escape:
    *re_loc = re;
    return RE1_5_UNSUPPORTED_ESCAPE;
}
*re_loc = re; return RE1_5_SYNTAX_ERROR; unsupported_escape: *re_loc = re; return RE1_5_UNSUPPORTED_ESCAPE; }
/*
 * re1_5_sizecode: report how many bytecode bytes compiling `re` requires.
 *
 * Runs _compilecode() with its sizecode argument set to 1 against a
 * throw-away ByteProg whose bytelen starts at 5 + NON_ANCHORED_PREFIX
 * (room for Save 0, Save 1 and Match, plus the "search" prefix code).
 *
 * Returns the negative result of _compilecode() on failure,
 * RE1_5_SYNTAX_ERROR if the pattern was not consumed up to its terminating
 * NUL, and otherwise dummyprog.bytelen.
 *
 * NOTE(review): the statements preceding this function look like the tail
 * of a definition that starts outside this line, and the re1_5_compilecode
 * definition that follows is cut off; both are left untouched.
 */
int re1_5_sizecode(const char *re) { ByteProg dummyprog = { // Save 0, Save 1, Match; more bytes for "search" (vs "match") prefix code .bytelen = 5 + NON_ANCHORED_PREFIX }; int res = _compilecode(&re, &dummyprog, /*sizecode*/1); if (res < 0) return res; // If unparsed chars left if (*re) return RE1_5_SYNTAX_ERROR; return dummyprog.bytelen; } int re1_5_compilecode(ByteProg *prog, const char *re) { prog->len = 0; prog->bytelen = 0; prog->sub = 0; // Add code to implement non-anchored operation ("search"). // For anchored operation ("match"), this code will be just skipped.
/*
 * Compile one nesting level of a regular expression into prog->insts.
 *
 * Parsing starts at `re` and stops at the terminating NUL or at a ')',
 * which is left unconsumed for the caller.  Code is appended starting at
 * prog->bytelen; on success prog->bytelen is advanced past the emitted
 * bytes, and prog->len and prog->sub are updated along the way.
 * EMIT, REL and insert_code() are defined elsewhere in the file.
 *
 * Returns the position where parsing stopped, or NULL on a malformed
 * pattern: trailing backslash, unterminated character class, or a '('
 * without a matching ')'.  On NULL, prog->bytelen is not updated for this
 * level.
 */
const char *_compilecode(const char *re, ByteProg *prog)
{
    char *code = prog->insts;
    int pc = prog->bytelen;
    int start = pc;     // first byte of this nesting level, target for '|'
    int term = pc;      // first byte of the most recent repeatable term
    int alt_label = 0;  // offset of the pending Jmp operand of the last '|'

    for (; *re && *re != ')'; re++) {
        switch (*re) {
        case '\\':
            re++;
            // A backslash at the very end has nothing to escape; without
            // this check the loop increment would step past the NUL.
            if (!*re) return NULL;
            /* fall through: the escaped character is a literal */
        default:
            term = pc;
            EMIT(pc++, Char);
            EMIT(pc++, *re);
            prog->len++;
            break;
        case '.':
            term = pc;
            EMIT(pc++, Any);
            prog->len++;
            break;
        case '[': {
            int cnt;
            term = pc;
            EMIT(pc++, Class);
            pc++; // Skip # of pair byte
            prog->len++;
            re++;
            for (cnt = 0; *re != ']'; re++, cnt++) {
                if (!*re) return NULL;
                EMIT(pc++, *re);
                // A '-' directly before the closing ']' is a literal dash,
                // not the start of a range.
                if (re[1] == '-' && re[2] != ']') {
                    re += 2;
                    // "[a-" with nothing after the dash: unterminated.
                    if (!*re) return NULL;
                }
                EMIT(pc++, *re);
            }
            EMIT(term + 1, cnt); // back-patch the pair count
            break;
        }
        case '(': {
            // Remember this group's own number: nested groups bump
            // prog->sub before the closing Save is emitted.
            int sub = ++prog->sub;
            term = pc;

            EMIT(pc++, Save);
            EMIT(pc++, 2 * sub);
            prog->len++;

            prog->bytelen = pc;
            re = _compilecode(re + 1, prog);
            // Propagate nested failures and reject a missing ')'.
            if (re == NULL || *re != ')') return NULL;
            pc = prog->bytelen;

            EMIT(pc++, Save);
            EMIT(pc++, 2 * sub + 1);
            prog->len++;

            break;
        }
        case '?':
            insert_code(code, term, 2, &pc);
            EMIT(term, Split);
            EMIT(term + 1, REL(term, pc));
            prog->len++;
            break;
        case '*':
            insert_code(code, term, 2, &pc);
            EMIT(pc, Jmp);
            EMIT(pc + 1, REL(pc, term));
            pc += 2;
            if (re[1] == '?') {
                EMIT(term, RSplit);
                re++;
            } else {
                EMIT(term, Split);
            }
            EMIT(term + 1, REL(term, pc));
            prog->len += 2;
            break;
        case '+':
            if (re[1] == '?') {
                EMIT(pc, Split);
                re++;
            } else {
                EMIT(pc, RSplit);
            }
            EMIT(pc + 1, REL(pc, term));
            pc += 2;
            prog->len++;
            break;
        case '|':
            // Resolve the Jmp of the previous alternative before opening
            // a new one.
            if (alt_label) {
                EMIT(alt_label, REL(alt_label, pc) + 1);
            }
            insert_code(code, start, 2, &pc);
            EMIT(pc++, Jmp);
            alt_label = pc++;
            EMIT(start, Split);
            EMIT(start + 1, REL(start, pc));
            prog->len += 2;
            break;
        case '^':
            EMIT(pc++, Bol);
            prog->len++;
            break;
        case '$':
            EMIT(pc++, Eol);
            prog->len++;
            break;
        }
    }

    if (alt_label) {
        EMIT(alt_label, REL(alt_label, pc) + 1);
    }

    prog->bytelen = pc;
    return re;
}
/*
 * Compile one nesting level of a regular expression into bytecode.
 *
 * Parsing starts at `re` and stops at the terminating NUL or at a ')',
 * which is left unconsumed for the caller.  Instructions are written
 * through the EMIT/INSERT_CODE macros at offsets tracked by PC; these
 * macros, and REL, are defined elsewhere in the file.
 *
 * re       - start of the (sub)pattern.
 * prog     - program being built; prog->len and prog->sub are updated.
 * sizecode - when non-zero, `code` is NULL (presumably so that EMIT only
 *            advances offsets without storing; confirm against the macro).
 *
 * Returns the position where parsing stopped, or NULL on a malformed
 * pattern: trailing backslash, unterminated character class, unmatched
 * '(' or a repetition operator with nothing to repeat.
 */
static const char *_compilecode(const char *re, ByteProg *prog, int sizecode)
{
    char *code = sizecode ? NULL : prog->insts;
    int start = PC;     // first byte of this nesting level, target for '|'
    int term = PC;      // first byte of the most recent repeatable term
    int alt_label = 0;  // offset of the pending Jmp operand of the last '|'

    for (; *re && *re != ')'; re++) {
        switch (*re) {
        case '\\':
            re++;
            if (!*re) return NULL; // Trailing backslash
            if ((*re | 0x20) == 'd' || (*re | 0x20) == 's' || (*re | 0x20) == 'w') {
                term = PC;
                EMIT(PC++, NamedClass);
                EMIT(PC++, *re);
                prog->len++;
                break;
            }
            /* fall through: any other escaped character is a literal */
        default:
            term = PC;
            EMIT(PC++, Char);
            EMIT(PC++, *re);
            prog->len++;
            break;
        case '.':
            term = PC;
            EMIT(PC++, Any);
            prog->len++;
            break;
        case '[': {
            int cnt;
            term = PC;
            re++;
            if (*re == '^') {
                EMIT(PC++, ClassNot);
                re++;
            } else {
                EMIT(PC++, Class);
            }
            PC++; // Skip # of pair byte
            prog->len++;
            for (cnt = 0; *re != ']'; re++, cnt++) {
                if (!*re) return NULL;
                EMIT(PC++, *re);
                // A '-' directly before the closing ']' is a literal dash,
                // not the start of a range.
                if (re[1] == '-' && re[2] != ']') {
                    re += 2;
                    // "[a-" with nothing after the dash: stop here, otherwise
                    // the loop increment would step past the terminating NUL.
                    if (!*re) return NULL;
                }
                EMIT(PC++, *re);
            }
            EMIT(term + 1, cnt); // back-patch the pair count
            break;
        }
        case '(': {
            term = PC;
            int sub = 0;
            // "(?:" opens a non-capturing group: no Save instructions.
            int capture = re[1] != '?' || re[2] != ':';

            if (capture) {
                sub = ++prog->sub;
                EMIT(PC++, Save);
                EMIT(PC++, 2 * sub);
                prog->len++;
            } else {
                re += 2;
            }

            re = _compilecode(re + 1, prog, sizecode);
            if (re == NULL || *re != ')') return NULL; // error, or no matching paren

            if (capture) {
                EMIT(PC++, Save);
                EMIT(PC++, 2 * sub + 1);
                prog->len++;
            }

            break;
        }
        case '?':
            if (PC == term) return NULL; // nothing to repeat
            INSERT_CODE(term, 2, PC);
            if (re[1] == '?') {
                EMIT(term, RSplit);
                re++;
            } else {
                EMIT(term, Split);
            }
            EMIT(term + 1, REL(term, PC));
            prog->len++;
            term = PC;
            break;
        case '*':
            if (PC == term) return NULL; // nothing to repeat
            INSERT_CODE(term, 2, PC);
            EMIT(PC, Jmp);
            EMIT(PC + 1, REL(PC, term));
            PC += 2;
            if (re[1] == '?') {
                EMIT(term, RSplit);
                re++;
            } else {
                EMIT(term, Split);
            }
            EMIT(term + 1, REL(term, PC));
            prog->len += 2;
            term = PC;
            break;
        case '+':
            if (PC == term) return NULL; // nothing to repeat
            if (re[1] == '?') {
                EMIT(PC, Split);
                re++;
            } else {
                EMIT(PC, RSplit);
            }
            EMIT(PC + 1, REL(PC, term));
            PC += 2;
            prog->len++;
            term = PC;
            break;
        case '|':
            // Resolve the Jmp of the previous alternative before opening
            // a new one.
            if (alt_label) {
                EMIT(alt_label, REL(alt_label, PC) + 1);
            }
            INSERT_CODE(start, 2, PC);
            EMIT(PC++, Jmp);
            alt_label = PC++;
            EMIT(start, Split);
            EMIT(start + 1, REL(start, PC));
            prog->len += 2;
            term = PC;
            break;
        case '^':
            EMIT(PC++, Bol);
            prog->len++;
            term = PC;
            break;
        case '$':
            EMIT(PC++, Eol);
            prog->len++;
            term = PC;
            break;
        }
    }

    if (alt_label) {
        EMIT(alt_label, REL(alt_label, PC) + 1);
    }
    return re;
}