示例#1
0
/*
 * Locate the extension part of a file name.
 *
 * Walks file_name until char_eos() reports the end and remembers, for every
 * L'.' seen, the position char_next() yields for it. The position remembered
 * for the last dot is returned.
 *
 * Returns NULL when file_name contains no L'.' at all.
 */
const wchar* get_extension(const wchar* file_name)
{
    const wchar* after_last_dot = NULL;
    const wchar* cursor = file_name;

    for(;;)
    {
        if(char_eos(cursor))
        {
            break;
        }

        /* A later dot overrides an earlier one, so the last dot wins. */
        if(*cursor == L'.')
        {
            after_last_dot = char_next(cursor);
        }

        char_inc(&cursor);
    }

    return after_last_dot;
}
示例#2
0
文件: tlexer.c 项目: fy0/tinyre
/*
 * Read the next token of the pattern into lex->token.
 *
 * lex->state selects the lexing mode: 0 is the normal state, 1 is the
 * inside of a bracket expression "[...]".
 *
 * Returns 0 on success; at end of input token.value is set to TK_END and 0
 * is returned as well. On a malformed construct one of the ERR_LEXER_* codes
 * is returned, or whatever non-zero value token_char_accept() reported.
 */
int tre_lexer_next(tre_Lexer* lex) {
    int len;
    uint32_t code;
    uint32_t* name;
    // Nothing left to read: emit the end token.
    if (lex->scur == lex->slen) {
        lex->token.value = TK_END;
        return 0;
    }
    code = char_next(lex);
    // scur == slen right after the read, i.e. `code` was the final character
    // (assuming char_next advances scur -- confirm against its definition).
    bool is_lastone = (lex->scur == lex->slen);

    switch (lex->state) {
        case 0: // NORMAL STATE
            if (token_check(code)) {
                lex->token.extra.code = 0;
                lex->token.value = code; // the token value is the character's own code

                switch (code) {
                    case '[':
                        // Enter bracket-expression mode for the following calls.
                        lex->state = 1;
                        // extra.code = 1 flags a '^' right after '['; the '^'
                        // is only peeked at here, not consumed.
                        if ((!is_lastone) && char_lookahead(lex) == '^') {
                            lex->token.extra.code = 1;
                        }
                        break;
                    case '{': {
                        // Repetition bounds: {m}, {m,} or {m,n}. On success
                        // extra.code holds the left limit and extra.code2 the
                        // right limit; anything else is re-emitted as a
                        // literal '{' character.
                        int count;
                        int scur_bak = lex->scur; // position to restore on failure
                        int llimit = 0, rlimit = -1;

                        // read left limit a{1
                        llimit = read_int(lex, 0, &count);
                        if (count == 0) goto __bad_token;
                        // presumably skips the digits and yields the character
                        // after them -- confirm char_nextn's return value
                        code = char_nextn(lex, count+1);

                        // read comma a{1,
                        if ((char)code == ',') {
                            //char_next(lex);
                        } else if ((char)code == '}') {
                            // a{1}: both limits are the same
                            rlimit = llimit;
                            goto __write_code;
                        } else {
                            // failed, rollback
                            goto __bad_token;
                        }

                        // read right limit a{1, 2
                        rlimit = read_int(lex, 0, &count);
                        code = char_nextn(lex, count+1);

                        // read right brace a{1,2} or a{1,}
                        if ((char)code == '}') {
                            // ok; rlimit is whatever read_int returned
                            // (presumably -1 for a{1,} -- confirm)
                        } else {
                            // failed, rollback
                            goto __bad_token;
                        }

                    __write_code:
                        lex->token.extra.code = llimit;
                        lex->token.extra.code2 = rlimit;
                        break;

                    __bad_token:
                        // Not a valid repetition: treat '{' as an ordinary
                        // character and rewind to just after it.
                        lex->token.value = TK_CHAR;
                        lex->token.extra.code = '{';
                        lex->scur = scur_bak;
                        break;
                    }
                    case '(': {
                        // NOTE(review): unlike the '[' case, is_lastone is not
                        // checked before peeking -- confirm char_lookahead is
                        // safe at end of input.
                        code = char_lookahead(lex);
                        // if next char is not ?
                        if (code != '?') {
                            // plain, unnamed group
                            lex->token.extra.group_type = GT_NORMAL;
                            lex->token.extra.group_name = NULL;
                            break;
                        } else {
                            // "(?" introduces an extension; dispatch on the
                            // character char_nextn hands back (presumably the
                            // one following '?' -- confirm).
                            code = char_nextn(lex, 2);
                            switch (code) {
                                case '#': { // just comment
                                    // Skip input until a ')' that is not
                                    // flagged as escaped.
                                    // NOTE(review): the character read before
                                    // the loop is never tested for '\\', and
                                    // the second backslash of a "\\\\" pair
                                    // sets is_escape again -- verify this is
                                    // the intended escaping rule.
                                    bool is_escape = false;
                                    code = char_next(lex);
                                    while (!(!is_escape && code == ')')) {
                                        code = char_next(lex);
                                        if (is_escape) is_escape = false;
                                        if (code == '\\') is_escape = true;
                                        // a NUL is taken as end of pattern
                                        if (code == '\0') return ERR_LEXER_UNBALANCED_PARENTHESIS;
                                    }
                                    lex->token.value = TK_COMMENT;
                                    break;
                                }
                                case ':': lex->token.extra.group_type = GT_NONGROUPING; break;
                                case '=': lex->token.extra.group_type = GT_IF_MATCH; break;
                                case '!': lex->token.extra.group_type = GT_IF_NOT_MATCH; break;
                                case '(':
                                    // code for conditional backref
                                    // Try a group name first, then fall back
                                    // to a numeric group index.
                                    name = read_group_name(lex, ')', &len);
                                    if (name) {
                                        code = char_nextn(lex, len);
                                        lex->token.extra.group_type = GT_BACKREF_CONDITIONAL_GROUPNAME;
                                        lex->token.extra.group_name = name;
                                        lex->token.extra.group_name_len = len;
                                    } else {
                                        int i = read_int(lex, ')', &len);
                                        if (i == -1) {
                                            return ERR_LEXER_INVALID_GROUP_NAME_OR_INDEX;
                                        } else {
                                            code = char_nextn(lex, len);
                                            lex->token.extra.group_type = GT_BACKREF_CONDITIONAL_INDEX;
                                            lex->token.extra.index = i;
                                        }
                                    }
                                    // presumably steps over the ')' that closes
                                    // the condition -- confirm
                                    code = char_next(lex);
                                    break;
                                case 'P':
                                    // group name
                                    code = char_lookahead(lex);
                                    if (code == '<') {
                                        // (?P<name> : named group
                                        code = char_next(lex);
                                        name = read_group_name(lex, '>', &len);
                                        if (!name) return ERR_LEXER_BAD_GROUP_NAME;
                                        code = char_nextn(lex, len+1); // name and '>'

                                        lex->token.extra.group_type = GT_NORMAL;
                                        lex->token.extra.group_name = name;
                                        lex->token.extra.group_name_len = len;
                                    } else if (code == '=') {
                                        // code for back reference (?P=)
                                        code = char_next(lex);
                                        name = read_group_name(lex, ')', &len);
                                        if (!name) return ERR_LEXER_BAD_GROUP_NAME_IN_BACKREF;
                                        code = char_nextn(lex, len); // skip name

                                        lex->token.extra.group_type = GT_BACKREF;
                                        lex->token.extra.group_name = name;
                                        lex->token.extra.group_name_len = len;
                                    } else {
                                        return ERR_LEXER_UNKNOW_SPECIFIER;
                                    }
                                    break;
                                case '<':
                                    // (?<= and (?<! : the character after '<'
                                    // selects the group type
                                    code = char_next(lex);
                                    if (code == '=') {
                                        lex->token.extra.group_type = GT_IF_PRECEDED_BY;
                                    } else if (code == '!') {
                                        lex->token.extra.group_type = GT_IF_NOT_PRECEDED_BY;
                                    } else {
                                        return ERR_LEXER_UNKNOW_SPECIFIER;
                                    }
                                    break;
                                default:
                                    // Inline flags: OR every recognised flag
                                    // character into lex->extra_flag and stop
                                    // at the first character char_to_flag
                                    // rejects. The token itself becomes a
                                    // TK_NOP.
                                    if (char_to_flag(code)) {
                                        int flag = 0;
                                        while (true) {
                                            flag = char_to_flag(code);
                                            if (flag) lex->extra_flag |= flag;
                                            else break;
                                            code = char_next(lex);
                                        }
                                    } else {
                                        return ERR_LEXER_UNEXPECTED_END_OF_PATTERN;
                                    }
                                    lex->token.value = TK_NOP;
                                    break;
                            }
                        }
                    }
                };
            } else {
                // Not a token character: let token_char_accept classify it,
                // with back references enabled.
                int ret = token_char_accept(lex, code, true);
                if (ret) return ret;
            }
            break;
        case 1: { // [...]
            // Remember whether this item started with a backslash before
            // token_char_accept rewrites the token.
            bool is_escape = code == '\\';
            int ret = token_char_accept(lex, code, false);
            if (ret) return ret;

            if (!is_escape && lex->token.value == TK_CHAR) {
                // end the state
                if (code == ']') {
                    lex->state = 0;
                    lex->token.value = ']';
                    break;
                }
            }

            // [a-z] grammar
            code = char_lookahead(lex);
            if (code == '-') {
                uint32_t code2 = char_lookaheadn(lex, 2);
                // [a-]
                // a '-' directly before ']' is not a range; leave it for the
                // next call
                if (code2 == ']') break;

                // [\s-1] -> error
                if (lex->token.value == TK_CHAR_SPE) {
                    return ERR_LEXER_BAD_CHARACTER_RANGE;
                }

                // [a-z]
                // keep the range start, then read the range end
                code2 = lex->token.extra.code;
                code = char_nextn(lex, 2);
                ret = token_char_accept(lex, code, false);
                if (ret) return ret;

                // [1-\s] -> error
                if (lex->token.value == TK_CHAR_SPE) {
                    return ERR_LEXER_BAD_CHARACTER_RANGE;
                }

                // [z-a] -> error
                if (lex->token.extra.code < code2) {
                    return ERR_LEXER_BAD_CHARACTER_RANGE;
                }

                // everything is ok
                // emit a '-' token: extra.code = range start,
                // extra.code2 = range end
                lex->token.value = '-';
                lex->token.extra.code2 = lex->token.extra.code;
                lex->token.extra.code = code2;
            }
            break;
        }
    }
    return 0;
}
示例#3
0
文件: tlexer.c 项目: fy0/tinyre
/*
 * Turn one pattern character (plus, for a backslash, whatever follows it)
 * into a character-like token stored in lex->token.
 *
 *   code         - the character that was just read from the input
 *   use_back_ref - when true, a backslash followed by a non-zero number
 *                  becomes TK_BACK_REF; when false the number is stored as a
 *                  plain TK_CHAR code
 *
 * Returns 0 on success, or ERR_LEXER_HEX_ESCAPE / ERR_LEXER_UNICODE_ESCAPE /
 * ERR_LEXER_UNICODE6_ESCAPE when read_hex rejects the digits of a
 * \x, \u or \U escape.
 */
_INLINE static
int token_char_accept(tre_Lexer *lex, uint32_t code, bool use_back_ref) {
    // Without a backslash things are simple: the character stands for
    // itself, and only '.' is marked as special.
    if (code != '\\') {
        lex->token.extra.code = code;
        if (code == '.') {
            lex->token.value = TK_CHAR_SPE;
        } else {
            lex->token.value = TK_CHAR;
        }
        return 0;
    }

    // A backslash that is the last character of the input is treated as an
    // ordinary character.
    if (lex->scur == lex->slen) {
        lex->token.extra.code = code;
        lex->token.value = TK_CHAR;
        return 0;
    }

    // Otherwise look at the character following the backslash.
    uint32_t escaped = char_lookahead(lex);

    // Recognised as a special matching character: done after consuming it.
    if (is_spe_char(escaped)) {
        lex->token.extra.code = escaped;
        lex->token.value = TK_CHAR_SPE;
        char_next(lex);
        return 0;
    }

    // Hex / unicode escapes: \xHH, \uHHHH, \UHHHHHHHH (unicode 6.0 \U0000000A).
    int hex_digits = 0;
    int hex_error = 0;
    switch (escaped) {
        case 'x':
            hex_digits = 2;
            hex_error = ERR_LEXER_HEX_ESCAPE;
            break;
        case 'u':
            hex_digits = 4;
            hex_error = ERR_LEXER_UNICODE_ESCAPE;
            break;
        case 'U':
            hex_digits = 8;
            hex_error = ERR_LEXER_UNICODE6_ESCAPE;
            break;
        default:
            break;
    }

    if (hex_digits != 0) {
        bool hex_ok = false;
        int hex_value;

        char_next(lex);
        hex_value = read_hex(lex, hex_digits, &hex_ok);
        if (!hex_ok) {
            return hex_error;
        }
        char_nextn(lex, hex_digits);

        lex->token.value = TK_CHAR;
        lex->token.extra.code = hex_value;
        return 0;
    }

    // Not a hex escape: try a number (back reference or character code).
    int digits;
    int number = read_int(lex, 0, &digits);

    if (number == -1) {
        // Neither an escape nor a back reference, just a character.
        lex->token.value = TK_CHAR;
        lex->token.extra.code = escaped;
        char_next(lex);
        return 0;
    }

    if (!use_back_ref) {
        lex->token.value = TK_CHAR;
        lex->token.extra.code = number;
    } else if (number == 0) {
        lex->token.value = TK_CHAR;
        lex->token.extra.code = 0;
    } else {
        lex->token.value = TK_BACK_REF;
        lex->token.extra.index = number;
    }
    char_nextn(lex, digits);
    return 0;
}