/* * States = 0,1,2, ..., numStates-1 * numStates-2 = Start * numStates-1 = final */ nfa::nfa (string regex) { num_states = 2; // start & final alphabet.insert (EPSILON); // initialize the transitions vector & push start & final states transitions.resize (2); transitions[0].resize (MAX_ALPHABET); transitions[1].resize (MAX_ALPHABET); // base regular expression if (regex.length () == 1) { alphabet.insert (regex[0]); transitions[0][regex[0]].insert (1); return; } // complex regular expression [DEAL WITH ISSUES IF your 0->1 e-transtion creates] // epsilon transition transitions[0][EPSILON].insert (1); // create the nfa for the nfa build_nfa (regex); }
int main(int argc, char** argv) { // read all used flags while (1) { static struct option long_options[] = { {"invert-match", no_argument, 0, 'v'}, {"line-regexp", no_argument, 0, 'x'}, {"count", no_argument, 0, 'c'}, {"help", no_argument, &display_help, 1}, {"version", no_argument, 0, 'V'}, {"cache-limit", required_argument, 0, OPTION_CACHE_LIMIT}, {0, 0, 0, 0} }; int option_index = 0; int c = getopt_long(argc, argv, "vxcV", long_options, &option_index); if (c == -1) break; switch (c) { case 'v': invert_match = true; break; case 'x': whole_lines = true; break; case 'c': count_matches = true; break; case OPTION_CACHE_LIMIT: cache_mem_limit = parse_size_in_kb(optarg); if (cache_mem_limit == -1U) { print_usage(argv[0]); return 2; } break; case 'V': display_version = true; break; case 0: break; default: print_usage(argv[0]); return 2; } } if (display_help) { printf(help_message, argv[0]); return 0; } if (display_version) { printf(version_message); return 0; } // check if there is exactly one regular expression if (optind != argc-1) { print_usage(argv[0]); return 2; } char* regex = argv[optind]; // parse the regular expression to abstract syntax tree struct syntree* tree; tree = parse(regex, strlen(regex)); // build nfa for the regular expression struct nfa* nfa; nfa = build_nfa(tree, whole_lines); free_tree(tree); // initialize simulation state sim_init(&state, nfa, invert_match, cache_mem_limit); // initialize buffer uintptr_t buffer_size = 65536; buffer_init(&buffer, STDIN_FILENO, STDOUT_FILENO, buffer_size); // main matching loop uintmax_t match_count = 0; bool new_line = true; bool some_match = false; intptr_t res = buffer_next(&buffer); if (!count_matches) buffer_mark(&buffer); while (res == 1) { // save next input byte to ch uint_fast8_t ch = buffer_get(&buffer); if (ch == '\n') { // handle the end of line here new_line = true; // simulate the special line end character if (!state.dfa_state->accept) sim_step(&state, CHAR_INPUT_END); if (sim_is_match(&state)) { // the last line matched, either print it or count it some_match = true; if (count_matches) { ++match_count; } else { int_fast8_t res = buffer_print(&buffer, true); if (res != 1) die(2, (res == -1) ? errno : 0, "Error writing to stdout"); } } // reset simulation state state.dfa_state = state.after_begin; // read next character from input res = buffer_next(&buffer); if (!count_matches) buffer_mark(&buffer); } else { new_line = false; // simulate only if there was no match on the current line if (!state.dfa_state->accept) { // if the character is not valid ASCII, no transitions will take place if (ch > MAX_CHAR) state.dfa_state = state.before_begin; // otherwise do the simulation else sim_step(&state, ch); } // read next character from input res = buffer_next(&buffer); } } if (res == -1) die(2, errno, "Error reading from stdin"); // check if there is no \n at the end of the input // if it is the case, behave as if it was there if (!new_line) { if (!state.dfa_state->accept) sim_step(&state, CHAR_INPUT_END); if (sim_is_match(&state)) { some_match = true; if (count_matches) { ++match_count; } else { int_fast8_t res = buffer_print(&buffer, false); puts(""); if (res != 1) die(2, (res == -1) ? errno : 0, "Error writing to stdout"); } } } // if -c is used, print the count of matches if (count_matches) { printf("%" PRIuMAX "\n", match_count); } // clean up buffer_cleanup(&buffer); sim_cleanup(&state); free_nfa(nfa); // return status is based on whether there was a match or not return some_match ? 0 : 1; }