Re_node parse_re(char **s, short end) { Stack stk = NULL, temp; Tok_node next_token; Re_node re = NULL; if (s == NULL || *s == NULL) return NULL; while (TRUE) { next_token = get_token(s); if (next_token == NULL) return NULL; switch (tok_type(next_token)) { case RPAREN: retract_token(s); case EOS: if (end == tok_type(next_token)) return Top(cat2(&stk)); else return NULL; case LPAREN: re = parse_re(s, RPAREN); if (Push(&stk, re) == NULL) return NULL; if (tok_type(get_token(s)) != RPAREN || re == NULL) return NULL; if (Size(stk) > 2) { temp = stk->next; stk->next = cat2(&temp); /* condense CAT nodes */ if (stk->next == NULL) return NULL; else stk->size = stk->next->size + 1; } break; case OPSTAR: if (wrap(&stk, OPSTAR) == NULL) return NULL; break; case OPOPT: if (wrap(&stk, OPOPT) == NULL) return NULL; break; case OPALT: if (cat2(&stk) == NULL) return NULL; re = parse_re(s, end); if (re == NULL) return NULL; if (mk_alt(&stk, re) == NULL) return NULL; break; case LITERAL: if (Push(&stk, tok_val(next_token)) == NULL) return NULL; if (Size(stk) > 2) { temp = stk->next; stk->next = cat2(&temp); /* condense CAT nodes */ if (stk->next == NULL) return NULL; else stk->size = stk->next->size + 1; } break; default: printf("parse_re: unknown token type %d\n", tok_type(next_token)); break; } } }
/*
 * parse: top-level entry point. Parses the string s with parse_re() (using
 * NUL as the terminator), pushes the resulting tree and an EOS leaf built
 * with mk_leaf() onto a fresh stack, stores the pre-decremented pos_cnt in
 * final_pos and returns Top(cat2()) of that stack.
 *
 * Returns NULL if parsing, leaf creation or either Push fails.
 */
Re_node parse(char *s)
{
    Stack nodes = NULL;
    Re_node body;
    Re_node end_leaf;

    body = parse_re(&s, NUL);
    if (body == NULL)
        return NULL;
    if (Push(&nodes, body) == NULL)
        return NULL;

    end_leaf = mk_leaf(EOS, C_LIT, NUL, NULL);
    if (end_leaf == NULL)
        return NULL;
    if (Push(&nodes, end_leaf) == NULL)
        return NULL;

    final_pos = --pos_cnt;
    return Top(cat2(&nodes));
}
/**
 * Parses a single regular expression and attaches the resulting automaton to
 * one of the first two epsilon successors of `nfa`.
 *
 * @param nfa  automaton whose epsilon set starts with the non-anchored entry
 *             state followed by the anchored one (that is the order in which
 *             parse() and group_regex() create them).
 * @param re   NUL-terminated regular expression; a leading TILDE marks it as
 *             anchored and is skipped before parsing.
 * @return     always NULL. The function is declared to return `void *` but
 *             has nothing meaningful to hand back; the explicit return
 *             avoids flowing off the end of a non-void function.
 */
void *regex_parser::parse_re(NFA* nfa, const char *re){
	int ptr=0;
	bool tilde_re=false;
	NFA *non_anchored = *(nfa->get_epsilon()->begin());
	NFA *anchored = *(++nfa->get_epsilon()->begin());

	//check whether the text must match at the beginning of the regular expression
	if (re[ptr]==TILDE){
		tilde_re=true;
		ptr++;
	}

	NFA *fa=parse_re(re,&ptr,false);
	fa->get_last()->accept();

	// Hook the new automaton under the entry state that matches its anchoring.
	NFA *entry = tilde_re ? anchored : non_anchored;
	entry->link(fa->get_first());

	return NULL;
}
/**
 * Reads regular expressions from `file` (one per line) and compiles the
 * selected ones into a single NFA.
 *
 * Lines are terminated by '\n', '\r' or end of file. Empty lines are skipped
 * and lines starting with '#' are treated as comments. The remaining lines
 * are numbered from 1; a line is compiled when its number j satisfies
 * from <= j and (to == -1 or j <= to).
 *
 * @param file  open stream; it is rewound before reading.
 * @param from  first regex number to process.
 * @param to    last regex number to process, or -1 for no upper bound.
 * @return      nfa->get_first() of the automaton that was built.
 */
NFA *regex_parser::parse(FILE *file, int from, int to){
	rewind(file);

	// Line buffer. It starts at 4000 bytes and is enlarged on demand, so
	// arbitrarily long lines no longer write past its end.
	int capacity=4000;
	char *re=allocate_char_array(capacity);
	int i=0;	// characters stored for the current line
	int j=0;	// number of the current regex

	// NFA
	NFA *nfa=new NFA();
	NFA *non_anchored = nfa->add_epsilon(); // for .* RegEx
	NFA *anchored = nfa->add_epsilon(); // for anchored RegEx (^)

	fprintf(stdout,"\n>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n");

	//parsing the RegEx and putting them in a NFA
	bool done=false;
	while(!done){
		// fgetc() returns int so that EOF can be told apart from every byte.
		const int c=fgetc(file);
		const bool at_eof=(c==EOF);

		if (at_eof || c=='\n' || c=='\r'){
			// End of file terminates a pending line just like a newline does.
			if (i!=0){
				re[i]='\0';
				if (re[0]!='#'){
					j++;
					if (j>=from && (to==-1 || j<=to)){
						fprintf(stdout,"%dth regex: %s\n",j,re);
						if (DEBUG) fprintf(stdout,"\n%d) processing regex:: <%s> ...\n",j,re);
						parse_re(nfa, re);
					}
				}
				// The buffer is reused; it is always terminated before use.
				i=0;
			}
			if (at_eof) done=true;
		}else{
			// Keep one byte free for the terminating '\0'.
			if (i+1>=capacity){
				const int bigger_capacity=capacity*2;
				char *bigger=allocate_char_array(bigger_capacity);
				for (int n=0;n<i;n++) bigger[n]=re[n];
				free(re);
				re=bigger;
				capacity=bigger_capacity;
			}
			re[i++]=static_cast<char>(c);
		}
	}
	free(re);	// released on every path, including files ending in a newline

	if (DEBUG) fprintf(stdout, "\nAll RegEx processed\n");

	//handle -m modifier
	if (m_modifier && (!anchored->get_epsilon()->empty() || !anchored->get_transitions()->empty())){
		non_anchored->add_transition('\n',anchored);
		non_anchored->add_transition('\r',anchored);
	}

	//delete non_anchored, if necessary
	if(non_anchored->get_epsilon()->empty() && non_anchored->get_transitions()->empty()){
		nfa->get_epsilon()->remove(non_anchored);
		delete non_anchored;
	}else{
		non_anchored->add_any(non_anchored);
	}

	return nfa->get_first();
}
/**
 * Recursive worker that turns the regular expression text starting at
 * re[*ptr] into an NFA fragment.
 *
 * The automaton is grown in `fa`. An element that is directly followed by a
 * repetition operator is first built on the side in `to_link`, so that the
 * operator can wrap it with make_rep() before it is linked to `fa`.
 *
 * @param re       NUL-terminated regular expression.
 * @param ptr      in/out index of the next character to read; advanced as
 *                 input is consumed.
 * @param bracket  true when this call parses the inside of a parenthesised
 *                 group, i.e. a CLOSE_RBRACKET ends this call and is consumed
 *                 here.
 * @return  for a group (or on a CLOSE_RBRACKET seen by a non-group call) the
 *          current `fa`, possibly wrapped by make_rep() when the group is
 *          followed by a repetition operator; otherwise fa->get_first() once
 *          the input is exhausted.
 *
 * Malformed input is reported through fatal(), which is assumed not to
 * return (NOTE(review): confirm against its definition).
 */
NFA *regex_parser::parse_re(const char *re, int *ptr, bool bracket){
	NFA *fa=new NFA();
	NFA *to_link=NULL;
	bool open_b=bracket;
	bool close_b=false;
	const size_t len=strlen(re);	// computed once instead of on every test

	while(static_cast<size_t>(*ptr)<len){
		const size_t pos=static_cast<size_t>(*ptr);
		const char c=re[pos];

		// Set by the repetition branches below; applied after the chain.
		bool repeat=false;
		int lb=0;
		int ub=_INFINITY;

		if(c==ESCAPE){
			int_set *chars=new int_set(CSIZE);
			(*ptr)=process_escape(re, (*ptr)+1,chars);
			// ">=" rather than "==": never index re past its terminator.
			if(static_cast<size_t>(*ptr)>=len||!is_repetition(re[(*ptr)])){
				fa=fa->add_transition(chars);
			}else{
				to_link=new NFA();
				to_link=to_link->add_transition(chars);
			}
			delete chars;
		}else if(!is_special(c)){
			// Plain character: append directly unless a repetition follows.
			const bool followed_by_rep=(pos!=len-1)&&is_repetition(re[pos+1]);
			(*ptr)++;
			if (!followed_by_rep){
				fa=fa->add_transition(c);
			}else{
				to_link=new NFA();
				to_link=to_link->add_transition(c);
			}
		}else if(c==ANY){
			const bool followed_by_rep=(pos!=len-1)&&is_repetition(re[pos+1]);
			if (!followed_by_rep){
				fa=fa->add_any();
			}else{
				to_link=new NFA();
				to_link=to_link->add_any();
			}
			(*ptr)++;
		}else if(c==STAR){
			(*ptr)++;
			repeat=true; lb=0; ub=_INFINITY;
		}else if(c==OPT){
			(*ptr)++;
			repeat=true; lb=0; ub=1;
		}else if(c==PLUS){
			(*ptr)++;
			repeat=true; lb=1; ub=_INFINITY;
		}else if(c==OPEN_QBRACKET){
			if (pos==len-1)
				fatal("regex_parser:: parse_re: { in last position.");
			else{
				(*ptr)=process_quantifier(re,(*ptr)+1,&lb,&ub);
				repeat=true;
			}
		}else if(c==OPEN_SBRACKET){
			if (pos==len-1)
				fatal("regex_parser:: parse_re: [ in last position.");
			else
				(*ptr)=process_range(&fa,&to_link,re,(*ptr)+1);
		}else if(c==OR){
			(*ptr)++;
			fa=fa->make_or(parse_re(re,ptr,false));
		}else if(c==OPEN_RBRACKET){
			(*ptr)++;
			fa=fa->get_last()->link(parse_re(re,ptr,true));
		}else if(c==CLOSE_RBRACKET){
			if (open_b){
				close_b=true;
				(*ptr)++;
				// If a repetition operator follows the group, stay in the loop
				// so that it is applied to the whole group.
				if (static_cast<size_t>(*ptr)==len || !is_repetition(re[(*ptr)])) return fa;
			}
			//fatal("parse:: parse_re : close ) without opening it.");
			else{
				return fa;
			}
		}else{
			// No branch consumed this character, so *ptr would never advance
			// and the loop would spin forever.
			fatal("regex_parser:: parse_re: unexpected special character.");
		}

		if (repeat){
			// After a closing bracket the operator applies to the whole group.
			if (close_b) return fa->make_rep(lb,ub);
			// Otherwise it applies to the element parked in to_link; without
			// one (e.g. a pattern starting with '*') there is nothing to repeat.
			if (to_link==NULL)
				fatal("regex_parser:: parse_re: repetition operator without a preceding element.");
			to_link=to_link->make_rep(lb,ub);
			fa=fa->link(to_link);
		}
	}
	return fa->get_first();
}
/**
 * Builds one NFA from a chosen subset of the regular expressions in `file`.
 *
 * Lines are terminated by '\n', '\r' or end of file. Empty lines are skipped
 * and lines starting with '#' are treated as comments. The remaining lines
 * are numbered from 1, and a line is compiled when its number appears in
 * `group`. Reading stops as soon as a number larger than every requested one
 * is reached.
 *
 * @param file   open stream; it is rewound before reading.
 * @param group  group[0] holds the number of entries, group[1..group[0]] the
 *               regex numbers to include.
 * @return       nfa->get_first() of the automaton that was built.
 */
NFA *regex_parser::group_regex(FILE *file, int group[]){
	rewind(file);

	// Line buffer. It starts at 1000 bytes and is enlarged on demand, so long
	// lines no longer write past its end.
	int capacity=1000;
	char *re=allocate_char_array(capacity);
	int i=0, j=0, k=0;
	const int size=group[0];

	// NFA
	NFA *nfa=new NFA();
	NFA *non_anchored = nfa->add_epsilon(); // for .* RegEx
	NFA *anchored = nfa->add_epsilon(); // for anchored RegEx (^)

	// Range of requested regex numbers. group[1] is only read when the group
	// is non-empty; an empty group selects nothing.
	int min_j = 0;
	int max_j = 0;
	if (size>=1){
		min_j = max_j = group[1];
		for (k=2; k<=size; k++){
			if (group[k] > max_j) max_j = group[k];
			if (group[k] < min_j) min_j = group[k];
		}
	}

	if (DEBUG) fprintf(stdout, "@\n");

	// parsing the RegEx and putting them in a NFA
	bool done=false;
	while (!done){
		// fgetc() returns int so that EOF can be told apart from every byte.
		const int c=fgetc(file);
		const bool at_eof=(c==EOF);

		if (at_eof || c=='\n' || c=='\r'){
			// End of file terminates a pending line just like a newline does.
			if (i!=0){
				re[i]='\0';
				if (re[0]!='#'){
					j++;
					if (j>max_j){
						// Past the last requested regex: nothing left to do.
						done=true;
					}else if (j>=min_j){
						for (k=1; k<=size; k++){
							if (j==group[k]){
								// The message for a final line without a newline has
								// always carried an extra leading '\n'.
								if (DEBUG) fprintf(stdout,"%s%d) preprocessing regex:: <%s> ...\n",(at_eof ? "\n" : ""),j,re);
								parse_re(nfa, re);
								break;
							}
						}
					}
				}
				// The buffer is reused; it is always terminated before use.
				i=0;
			}
			if (at_eof) done=true;
		}
		else{
			// Keep one byte free for the terminating '\0'.
			if (i+1>=capacity){
				const int bigger_capacity=capacity*2;
				char *bigger=allocate_char_array(bigger_capacity);
				for (int n=0; n<i; n++) bigger[n]=re[n];
				free(re);
				re=bigger;
				capacity=bigger_capacity;
			}
			re[i++]=static_cast<char>(c);
		}
	}
	free(re);
	re=NULL;

	if (DEBUG) fprintf(stdout, "@\n");
	if (DEBUG) fprintf(stdout, "All RegEx processed\n");

	//handle -m modifier
	if (m_modifier && (!anchored->get_epsilon()->empty() || !anchored->get_transitions()->empty())){
		non_anchored->add_transition('\n',anchored);
		non_anchored->add_transition('\r',anchored);
	}

	//delete non_anchored, if necessary
	if (non_anchored->get_epsilon()->empty() && non_anchored->get_transitions()->empty()){
		nfa->get_epsilon()->remove(non_anchored);
		delete non_anchored;
	}
	else{
		non_anchored->add_any(non_anchored);
	}

	return nfa->get_first();
}