Exemplo n.º 1
0
/*
 * Compile one regex (sub)expression into the bytecode program `prog`.
 *
 * Parsing starts at *re_loc and stops at the end of the string or at an
 * unmatched ')' (the closing paren of the group being compiled by the
 * caller, when invoked recursively).
 *
 * Parameters:
 *   re_loc   - in/out: on entry, the position to start parsing at; on
 *              return, the position where parsing stopped (the terminating
 *              NUL or ')' on success, or the position reached when an
 *              error was detected).
 *   prog     - program being built; `len` is bumped per instruction and
 *              `sub` per capturing group.
 *   sizecode - when non-zero, `code` is NULL instead of prog->insts.
 *              NOTE(review): EMIT/INSERT_CODE are defined elsewhere;
 *              presumably they skip the actual stores when `code` is NULL
 *              so this pass only measures the program size - confirm.
 *
 * Returns RE1_5_SUCCESS, or one of RE1_5_SYNTAX_ERROR,
 * RE1_5_UNSUPPORTED_ESCAPE, RE1_5_UNSUPPORTED_SYNTAX.
 */
static int _compilecode(const char **re_loc, ByteProg *prog, int sizecode)
{
    const char *re = *re_loc;
    char *code = sizecode ? NULL : prog->insts;
    int start = PC;     // first instruction of this (sub)expression
    int term = PC;      // start of the most recent term (operand of ?, *, +)
    int alt_label = 0;  // offset byte of the pending Jmp from the last '|', if any

    for (; *re && *re != ')'; re++) {
        switch (*re) {
        case '\\': {
            re++;
            if (!*re) goto syntax_error; // Trailing backslash
            char c = *re | 0x20; // fold ASCII letters to lower case
            if (c == 'd' || c == 's' || c == 'w') {
                term = PC;
                EMIT(PC++, NamedClass);
                EMIT(PC++, *re); // original case is kept (e.g. 'd' vs 'D')
                prog->len++;
                break;
            }
            // Any other alphanumeric escape is rejected; escaped punctuation
            // falls through and is emitted as a literal character.
            if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z')) {
                goto unsupported_escape;
            }
        }
        /* fallthrough */
        default:
            term = PC;
            EMIT(PC++, Char);
            EMIT(PC++, *re);
            prog->len++;
            break;
        case '.':
            term = PC;
            EMIT(PC++, Any);
            prog->len++;
            break;
        case '[': {
            int cnt;
            term = PC;
            re++;
            if (*re == '^') {
                EMIT(PC++, ClassNot);
                re++;
            } else {
                EMIT(PC++, Class);
            }
            PC++; // Skip "# of pairs" byte
            prog->len++;
            // Each class member is stored as a (low, high) pair; a single
            // character is stored as a pair with both ends equal.
            for (cnt = 0; *re != ']'; re++, cnt++) {
                if (!*re) goto syntax_error;
                if (*re == '\\') {
                    re++;
                    if (!*re) goto syntax_error;
                    if (*re != '\\' && *re != ']') goto unsupported_escape;
                }
                EMIT(PC++, *re);
                if (re[1] == '-' && re[2] != ']') {
                    re += 2;
                    // Pattern ended in the middle of a range ("[a-"): bail
                    // out here, otherwise the loop increment would step
                    // past the terminating NUL and read out of bounds.
                    if (!*re) goto syntax_error;
                }
                EMIT(PC++, *re);
            }
            // NOTE(review): the pair count is written into a single byte;
            // no check is made here that cnt fits.
            EMIT(term + 1, cnt);
            break;
        }
        case '(': {
            term = PC;
            int sub = 0;
            int capture = 1;
            re++;
            if (*re == '?') {
                re++;
                if (*re == ':') {
                    // "(?:" - non-capturing group
                    capture = 0;
                    re++;
                } else {
                    // Any other "(?x" extension is not supported
                    *re_loc = re;
                    return RE1_5_UNSUPPORTED_SYNTAX;
                }
            }

            if (capture) {
                sub = ++prog->sub;
                EMIT(PC++, Save);
                EMIT(PC++, 2 * sub);
                prog->len++;
            }

            // Compile the group body; the recursive call stops at ')' or NUL.
            int res = _compilecode(&re, prog, sizecode);
            *re_loc = re;
            if (res < 0) return res;
            if (*re != ')') return RE1_5_SYNTAX_ERROR;

            if (capture) {
                EMIT(PC++, Save);
                EMIT(PC++, 2 * sub + 1);
                prog->len++;
            }

            break;
        }
        case '{':
            // Counted repetition is not supported
            *re_loc = re;
            return RE1_5_UNSUPPORTED_SYNTAX;
        case '?':
            if (PC == term) goto syntax_error; // nothing to repeat
            // Place a split in front of the term that can skip over it.
            // NOTE(review): INSERT_CODE is defined elsewhere; presumably it
            // opens a 2-byte gap at `term` and advances PC - confirm.
            INSERT_CODE(term, 2, PC);
            if (re[1] == '?') {
                // "??" selects the other split opcode
                EMIT(term, RSplit);
                re++;
            } else {
                EMIT(term, Split);
            }
            EMIT(term + 1, REL(term, PC));
            prog->len++;
            term = PC;
            break;
        case '*':
            if (PC == term) goto syntax_error; // nothing to repeat
            // Layout: split before the term, jump back to it after the term.
            INSERT_CODE(term, 2, PC);
            EMIT(PC, Jmp);
            EMIT(PC + 1, REL(PC, term));
            PC += 2;
            if (re[1] == '?') {
                // "*?" selects the other split opcode
                EMIT(term, RSplit);
                re++;
            } else {
                EMIT(term, Split);
            }
            EMIT(term + 1, REL(term, PC));
            prog->len += 2;
            term = PC;
            break;
        case '+':
            if (PC == term) goto syntax_error; // nothing to repeat
            // A split after the term that targets the term's start.
            if (re[1] == '?') {
                // "+?" selects the other split opcode
                EMIT(PC, Split);
                re++;
            } else {
                EMIT(PC, RSplit);
            }
            EMIT(PC + 1, REL(PC, term));
            PC += 2;
            prog->len++;
            term = PC;
            break;
        case '|':
            // Patch the Jmp left by the previous alternative, if any, now
            // that the end of that alternative is known.
            if (alt_label) {
                EMIT(alt_label, REL(alt_label, PC) + 1);
            }
            // Put a Split at the start of the whole expression and a Jmp at
            // the end of the left-hand side; the Jmp offset is filled in
            // later through alt_label.
            INSERT_CODE(start, 2, PC);
            EMIT(PC++, Jmp);
            alt_label = PC++;
            EMIT(start, Split);
            EMIT(start + 1, REL(start, PC));
            prog->len += 2;
            term = PC;
            break;
        case '^':
            EMIT(PC++, Bol);
            prog->len++;
            term = PC; // anchors cannot be repeated
            break;
        case '$':
            EMIT(PC++, Eol);
            prog->len++;
            term = PC; // anchors cannot be repeated
            break;
        }
    }

    // Resolve the Jmp of the last alternative to the end of the expression.
    if (alt_label) {
        EMIT(alt_label, REL(alt_label, PC) + 1);
    }

    *re_loc = re;
    return RE1_5_SUCCESS;

syntax_error:
    *re_loc = re;
    return RE1_5_SYNTAX_ERROR;

unsupported_escape:
    *re_loc = re;
    return RE1_5_UNSUPPORTED_ESCAPE;
}
Exemplo n.º 2
0
/*
 * Compile one regex (sub)expression into the bytecode program `prog`.
 *
 * Parsing starts at `re` and stops at the end of the string or at an
 * unmatched ')' (the closing paren of the group being compiled by the
 * caller, when invoked recursively).
 *
 * Parameters:
 *   re       - position in the pattern to start parsing at.
 *   prog     - program being built; `len` is bumped per instruction and
 *              `sub` per capturing group.
 *   sizecode - when non-zero, `code` is NULL instead of prog->insts.
 *              NOTE(review): EMIT/INSERT_CODE are defined elsewhere;
 *              presumably they skip the actual stores when `code` is NULL
 *              so this pass only measures the program size - confirm.
 *
 * Returns the position where parsing stopped (pointing at the terminating
 * NUL or at ')'), or NULL on a syntax error.
 */
static const char *_compilecode(const char *re, ByteProg *prog, int sizecode)
{
    char *code = sizecode ? NULL : prog->insts;
    int start = PC;     // first instruction of this (sub)expression
    int term = PC;      // start of the most recent term (operand of ?, *, +)
    int alt_label = 0;  // offset byte of the pending Jmp from the last '|', if any

    for (; *re && *re != ')'; re++) {
        switch (*re) {
        case '\\':
            re++;
            if (!*re) return NULL; // Trailing backslash
            // `| 0x20` folds ASCII letters to lower case: \d \D \s \S \w \W
            if ((*re | 0x20) == 'd' || (*re | 0x20) == 's' || (*re | 0x20) == 'w') {
                term = PC;
                EMIT(PC++, NamedClass);
                EMIT(PC++, *re); // original case is kept (e.g. 'd' vs 'D')
                prog->len++;
                break;
            }
            // Any other escaped character is emitted as a literal.
            /* fallthrough */
        default:
            term = PC;
            EMIT(PC++, Char);
            EMIT(PC++, *re);
            prog->len++;
            break;
        case '.':
            term = PC;
            EMIT(PC++, Any);
            prog->len++;
            break;
        case '[': {
            int cnt;
            term = PC;
            re++;
            if (*re == '^') {
                EMIT(PC++, ClassNot);
                re++;
            } else {
                EMIT(PC++, Class);
            }
            PC++; // Skip # of pair byte
            prog->len++;
            // Each class member is stored as a (low, high) pair; a single
            // character is stored as a pair with both ends equal.
            for (cnt = 0; *re != ']'; re++, cnt++) {
                if (!*re) return NULL;
                EMIT(PC++, *re);
                if (re[1] == '-') {
                    re += 2;
                    // Pattern ended in the middle of a range ("[a-"): bail
                    // out here, otherwise the loop increment would step
                    // past the terminating NUL and read out of bounds.
                    if (!*re) return NULL;
                }
                EMIT(PC++, *re);
            }
            // NOTE(review): the pair count is written into a single byte;
            // no check is made here that cnt fits.
            EMIT(term + 1, cnt);
            break;
        }
        case '(': {
            term = PC;
            int sub = 0;
            // "(?:" opens a non-capturing group; anything else captures.
            int capture = re[1] != '?' || re[2] != ':';

            if (capture) {
                sub = ++prog->sub;
                EMIT(PC++, Save);
                EMIT(PC++, 2 * sub);
                prog->len++;
            } else {
                    re += 2; // skip over "?:"
            }

            // Compile the group body; the recursive call stops at ')' or NUL.
            re = _compilecode(re + 1, prog, sizecode);
            if (re == NULL || *re != ')') return NULL; // error, or no matching paren

            if (capture) {
                EMIT(PC++, Save);
                EMIT(PC++, 2 * sub + 1);
                prog->len++;
            }

            break;
        }
        case '?':
            if (PC == term) return NULL; // nothing to repeat
            // Place a split in front of the term that can skip over it.
            // NOTE(review): INSERT_CODE is defined elsewhere; presumably it
            // opens a 2-byte gap at `term` and advances PC - confirm.
            INSERT_CODE(term, 2, PC);
            if (re[1] == '?') {
                // "??" selects the other split opcode
                EMIT(term, RSplit);
                re++;
            } else {
                EMIT(term, Split);
            }
            EMIT(term + 1, REL(term, PC));
            prog->len++;
            term = PC;
            break;
        case '*':
            if (PC == term) return NULL; // nothing to repeat
            // Layout: split before the term, jump back to it after the term.
            INSERT_CODE(term, 2, PC);
            EMIT(PC, Jmp);
            EMIT(PC + 1, REL(PC, term));
            PC += 2;
            if (re[1] == '?') {
                // "*?" selects the other split opcode
                EMIT(term, RSplit);
                re++;
            } else {
                EMIT(term, Split);
            }
            EMIT(term + 1, REL(term, PC));
            prog->len += 2;
            term = PC;
            break;
        case '+':
            if (PC == term) return NULL; // nothing to repeat
            // A split after the term that targets the term's start.
            if (re[1] == '?') {
                // "+?" selects the other split opcode
                EMIT(PC, Split);
                re++;
            } else {
                EMIT(PC, RSplit);
            }
            EMIT(PC + 1, REL(PC, term));
            PC += 2;
            prog->len++;
            term = PC;
            break;
        case '|':
            // Patch the Jmp left by the previous alternative, if any, now
            // that the end of that alternative is known.
            if (alt_label) {
                EMIT(alt_label, REL(alt_label, PC) + 1);
            }
            // Put a Split at the start of the whole expression and a Jmp at
            // the end of the left-hand side; the Jmp offset is filled in
            // later through alt_label.
            INSERT_CODE(start, 2, PC);
            EMIT(PC++, Jmp);
            alt_label = PC++;
            EMIT(start, Split);
            EMIT(start + 1, REL(start, PC));
            prog->len += 2;
            term = PC;
            break;
        case '^':
            EMIT(PC++, Bol);
            prog->len++;
            term = PC; // anchors cannot be repeated
            break;
        case '$':
            EMIT(PC++, Eol);
            prog->len++;
            term = PC; // anchors cannot be repeated
            break;
        }
    }

    // Resolve the Jmp of the last alternative to the end of the expression.
    if (alt_label) {
        EMIT(alt_label, REL(alt_label, PC) + 1);
    }
    return re;
}