/*
 * Locate the extension part of a file name.
 *
 * Scans file_name up to the position where char_eos() reports the end and
 * remembers the value char_next() yields for the most recent '.' seen
 * (presumably the position just after the dot -- confirm against
 * char_next()'s definition).
 *
 * Returns that remembered pointer for the last '.', or NULL when the name
 * contains no '.' at all.  The result points into file_name; nothing is
 * allocated here.
 */
const wchar* get_extension(const wchar* file_name)
{
    const wchar* last_ext = NULL;
    const wchar* cursor;

    for (cursor = file_name; !char_eos(cursor); char_inc(&cursor)) {
        if (*cursor == L'.') {
            /* A later dot overrides any earlier one. */
            last_ext = char_next(cursor);
        }
    }

    return last_ext;
}
/*
 * Read the next token of the pattern into lex->token.
 *
 * The lexer has two states, selected by lex->state:
 *   0 - normal pattern text
 *   1 - inside a character class "[...]"
 *
 * On success the token kind is stored in lex->token.value and its payload
 * in lex->token.extra, and 0 is returned.  When the input is exhausted the
 * token is TK_END and 0 is returned.  On a malformed construct one of the
 * ERR_LEXER_* codes is returned.
 *
 * Flags found in "(?flags" groups are OR-ed into lex->extra_flag.
 */
int tre_lexer_next(tre_Lexer* lex)
{
    int len;
    uint32_t code;
    uint32_t* name;

    // ">=" rather than "==": some paths below (char_nextn, the flag loop)
    // can step past the end on a truncated pattern, and an equality test
    // would then never fire again.
    if (lex->scur >= lex->slen) {
        lex->token.value = TK_END;
        return 0;
    }

    code = char_next(lex);
    bool is_lastone = (lex->scur >= lex->slen);

    switch (lex->state) {
    case 0: // NORMAL STATE
        if (token_check(code)) {
            lex->token.extra.code = 0;
            lex->token.value = code; // token value is its own character code

            switch (code) {
            case '[':
                lex->state = 1;
                // extra.code = 1 marks a class that starts with '^'.
                if ((!is_lastone) && char_lookahead(lex) == '^') {
                    lex->token.extra.code = 1;
                }
                break;

            case '{': {
                int count;
                int scur_bak = lex->scur; // rollback point if this is not a valid {m,n}
                int llimit = 0, rlimit = -1;

                // read left limit: a{1
                llimit = read_int(lex, 0, &count);
                if (count == 0) goto brace_is_literal;
                code = char_nextn(lex, count + 1);

                // read comma: a{1,
                // Compare the full code point; truncating it to char would
                // let unrelated characters (e.g. U+012C, U+017D) match.
                if (code == ',') {
                    // nothing to do, the comma is already consumed
                } else if (code == '}') {
                    // a{1}: both limits are equal
                    rlimit = llimit;
                    goto brace_is_repeat;
                } else {
                    // failed, rollback
                    goto brace_is_literal;
                }

                // read right limit: a{1,2
                // NOTE(review): for "a{1,}" this relies on read_int
                // yielding -1 with count 0 when no digits follow - confirm.
                rlimit = read_int(lex, 0, &count);
                code = char_nextn(lex, count + 1);

                // read right brace: a{1,2} or a{1,}
                if (code != '}') {
                    // failed, rollback
                    goto brace_is_literal;
                }

            brace_is_repeat:
                lex->token.extra.code = llimit;
                lex->token.extra.code2 = rlimit;
                break;

            brace_is_literal:
                // Not a repeat specifier: emit '{' as an ordinary character
                // and rewind so the following text is lexed normally.
                lex->token.value = TK_CHAR;
                lex->token.extra.code = '{';
                lex->scur = scur_bak;
                break;
            }

            case '(': {
                // Plain group unless the next char is '?'.  Do not look
                // ahead when '(' is the final character of the pattern.
                if (is_lastone || char_lookahead(lex) != '?') {
                    lex->token.extra.group_type = GT_NORMAL;
                    lex->token.extra.group_name = NULL;
                    break;
                }

                // Skip '?' and fetch the specifier character after it.
                code = char_nextn(lex, 2);
                switch (code) {
                case '#': {
                    // (?#...) comment: skip up to the closing ')'.
                    // NOTE(review): is_escape is cleared at the top of each
                    // iteration before the loop condition re-tests it, so a
                    // backslash does not actually protect a following ')'.
                    // Behaviour kept as-is; confirm whether escapes were
                    // meant to be supported.
                    bool is_escape = false;
                    code = char_next(lex);
                    while (!(!is_escape && code == ')')) {
                        code = char_next(lex);
                        if (is_escape) is_escape = false;
                        if (code == '\\') is_escape = true;
                        if (code == '\0') return ERR_LEXER_UNBALANCED_PARENTHESIS;
                    }
                    lex->token.value = TK_COMMENT;
                    break;
                }
                case ':':
                    lex->token.extra.group_type = GT_NONGROUPING;
                    break;
                case '=':
                    lex->token.extra.group_type = GT_IF_MATCH;
                    break;
                case '!':
                    lex->token.extra.group_type = GT_IF_NOT_MATCH;
                    break;
                case '(':
                    // (?(name)...) or (?(index)...): conditional backref
                    name = read_group_name(lex, ')', &len);
                    if (name) {
                        code = char_nextn(lex, len);
                        lex->token.extra.group_type = GT_BACKREF_CONDITIONAL_GROUPNAME;
                        lex->token.extra.group_name = name;
                        lex->token.extra.group_name_len = len;
                    } else {
                        int i = read_int(lex, ')', &len);
                        if (i == -1) {
                            return ERR_LEXER_INVALID_GROUP_NAME_OR_INDEX;
                        } else {
                            code = char_nextn(lex, len);
                            lex->token.extra.group_type = GT_BACKREF_CONDITIONAL_INDEX;
                            lex->token.extra.index = i;
                        }
                    }
                    // consume the ')' that closes the condition
                    code = char_next(lex);
                    break;
                case 'P':
                    // (?P<name>...) named group or (?P=name) back reference
                    code = char_lookahead(lex);
                    if (code == '<') {
                        code = char_next(lex);
                        name = read_group_name(lex, '>', &len);
                        if (!name) return ERR_LEXER_BAD_GROUP_NAME;
                        code = char_nextn(lex, len + 1); // name and '>'

                        lex->token.extra.group_type = GT_NORMAL;
                        lex->token.extra.group_name = name;
                        lex->token.extra.group_name_len = len;
                    } else if (code == '=') {
                        code = char_next(lex);
                        name = read_group_name(lex, ')', &len);
                        if (!name) return ERR_LEXER_BAD_GROUP_NAME_IN_BACKREF;
                        code = char_nextn(lex, len); // skip name only; ')' is left in the input

                        lex->token.extra.group_type = GT_BACKREF;
                        lex->token.extra.group_name = name;
                        lex->token.extra.group_name_len = len;
                    } else {
                        return ERR_LEXER_UNKNOW_SPECIFIER;
                    }
                    break;
                case '<':
                    // (?<=...) / (?<!...) look-behind
                    code = char_next(lex);
                    if (code == '=') {
                        lex->token.extra.group_type = GT_IF_PRECEDED_BY;
                    } else if (code == '!') {
                        lex->token.extra.group_type = GT_IF_NOT_PRECEDED_BY;
                    } else {
                        return ERR_LEXER_UNKNOW_SPECIFIER;
                    }
                    break;
                default:
                    // (?flags...: accumulate every recognised flag character.
                    // The first non-flag character ends the run and is
                    // consumed without being checked.
                    // NOTE(review): presumably that character should be
                    // ')'; this is not verified here.
                    if (char_to_flag(code)) {
                        int flag = 0;
                        while (true) {
                            flag = char_to_flag(code);
                            if (flag) lex->extra_flag |= flag;
                            else break;
                            code = char_next(lex);
                        }
                    } else {
                        return ERR_LEXER_UNEXPECTED_END_OF_PATTERN;
                    }
                    lex->token.value = TK_NOP;
                    break;
                }
                break;
            }

            default:
                // Any other token character needs no extra payload.
                break;
            }
        } else {
            // Ordinary character or escape; back references are allowed here.
            int ret = token_char_accept(lex, code, true);
            if (ret) return ret;
        }
        break;

    case 1: { // [...]
        bool is_escape = code == '\\';
        int ret = token_char_accept(lex, code, false);
        if (ret) return ret;

        if (!is_escape && lex->token.value == TK_CHAR) {
            // an unescaped ']' ends the character class
            if (code == ']') {
                lex->state = 0;
                lex->token.value = ']';
                break;
            }
        }

        // [a-z] grammar.  Only look ahead while input remains.
        if (lex->scur < lex->slen && char_lookahead(lex) == '-') {
            uint32_t code2 = char_lookaheadn(lex, 2);
            // [a-] : the '-' is left for the next call
            if (code2 == ']') break;
            // [\s-1] -> error
            if (lex->token.value == TK_CHAR_SPE) {
                return ERR_LEXER_BAD_CHARACTER_RANGE;
            }
            // [a-z] : remember the lower bound, skip '-', read the upper bound
            code2 = lex->token.extra.code;
            code = char_nextn(lex, 2);
            ret = token_char_accept(lex, code, false);
            if (ret) return ret;
            // [1-\s] -> error
            if (lex->token.value == TK_CHAR_SPE) {
                return ERR_LEXER_BAD_CHARACTER_RANGE;
            }
            // [z-a] -> error
            if (lex->token.extra.code < code2) {
                return ERR_LEXER_BAD_CHARACTER_RANGE;
            }
            // everything is ok: emit a '-' token with code = low, code2 = high
            lex->token.value = '-';
            lex->token.extra.code2 = lex->token.extra.code;
            lex->token.extra.code = code2;
        }
        break;
    }

    default:
        break;
    }

    return 0;
}
/*
 * Turn one character, or one backslash escape starting at `code`, into a
 * token stored in lex->token.
 *
 *   code          the character just consumed by the caller
 *   use_back_ref  when true, "\<number>" with a non-zero number becomes
 *                 TK_BACK_REF; when false it becomes a TK_CHAR carrying
 *                 that number
 *
 * Returns 0 on success, or ERR_LEXER_HEX_ESCAPE / ERR_LEXER_UNICODE_ESCAPE /
 * ERR_LEXER_UNICODE6_ESCAPE when read_hex() rejects a \x, \u or \U escape.
 */
_INLINE static int
token_char_accept(tre_Lexer *lex, uint32_t code, bool use_back_ref)
{
    int hex_digits = 0;   /* digits expected after \x, \u, \U; 0 = not a hex escape */
    int hex_error = 0;    /* error to report if those digits are rejected */
    int number;
    int digits;
    bool hex_ok = false;

    /* Not an escape: the character stands for itself ('.' is special). */
    if (code != '\\') {
        lex->token.extra.code = code;
        if (code == '.') {
            lex->token.value = TK_CHAR_SPE;
        } else {
            lex->token.value = TK_CHAR;
        }
        return 0;
    }

    /* A backslash that is the final character is taken literally. */
    if (lex->scur == lex->slen) {
        lex->token.extra.code = code;
        lex->token.value = TK_CHAR;
        return 0;
    }

    /* Otherwise inspect the character that follows the backslash. */
    code = char_lookahead(lex);

    /* Recognised special-match character: emit it and consume it. */
    if (is_spe_char(code)) {
        lex->token.extra.code = code;
        lex->token.value = TK_CHAR_SPE;
        char_next(lex);
        return 0;
    }

    /* Hex / unicode escapes differ only in digit count and error code. */
    switch (code) {
    case 'x':
        hex_digits = 2;
        hex_error = ERR_LEXER_HEX_ESCAPE;
        break;
    case 'u':
        hex_digits = 4;
        hex_error = ERR_LEXER_UNICODE_ESCAPE;
        break;
    case 'U': /* unicode 6.0 \U0000000A */
        hex_digits = 8;
        hex_error = ERR_LEXER_UNICODE6_ESCAPE;
        break;
    default:
        break;
    }

    if (hex_digits != 0) {
        char_next(lex); /* step over the x / u / U marker */
        number = read_hex(lex, hex_digits, &hex_ok);
        if (!hex_ok) {
            return hex_error;
        }
        char_nextn(lex, hex_digits);
        lex->token.value = TK_CHAR;
        lex->token.extra.code = number;
        return 0;
    }

    /* Try a decimal number: back reference or plain character code. */
    number = read_int(lex, 0, &digits);
    if (number == -1) {
        /* Neither an escape nor a back reference: just that character. */
        lex->token.value = TK_CHAR;
        lex->token.extra.code = code;
        char_next(lex);
        return 0;
    }

    if (!use_back_ref) {
        lex->token.value = TK_CHAR;
        lex->token.extra.code = number;
    } else if (number == 0) {
        lex->token.value = TK_CHAR;
        lex->token.extra.code = 0;
    } else {
        lex->token.value = TK_BACK_REF;
        lex->token.extra.index = number;
    }
    char_nextn(lex, digits); /* consume the digits that were read */

    return 0;
}