/*
 * Reads the regular-expression description in inFile, builds one NFA per
 * token class, merges them under a shared start state and writes the
 * result to outFile.
 *
 * Input layout (as consumed here):
 *   line 1      : alphabet line, skipped by this function (generateOpStack
 *                 is handed inFile and re-reads it itself)
 *   other lines : "<tokenClassName> <infixRegex>", whitespace separated
 *
 * Output layout:
 *   line 1      : "<tokenClassName> <finalState>," for every marker row
 *                 (first column == -1) of the combined NFA
 *   line 2      : current value of uniqueStateID
 *   other lines : every non-marker row, values separated by tabs
 *
 * Exits the process with status 1 if either file cannot be opened or a
 * token class has no regular expression.
 *
 * NOTE(review): the function returns false (0) on the success path, exactly
 * as before; confirm what callers expect before changing it.
 */
bool fileReadWrite ( string inFile, string outFile )
{
    ifstream fpCal;
    ofstream fpOut;

    /* open() copies nothing; passing c_str() directly avoids the leaked strdup() buffers */
    fpCal.open( inFile.c_str() );
    fpOut.open( outFile.c_str() );

    if (!fpCal.is_open() || !fpOut.is_open()){
        perror("Regular expression or output file doesn't exist or could not be opened\n");
        exit (1);
    }

    string discardAlphabetString;
    getline (fpCal,discardAlphabetString );

    vector < NFAStateSet > allRegexNFA;
    vector <string> tokenClassName;
    string infixExp;

    while ( getline ( fpCal,infixExp ) ){
        stringstream ss(infixExp);
        string className;
        string regEx;

        /* A blank line carries no token class; indexing into it used to be out of bounds */
        if ( !( ss >> className ) )
            continue;

        /* A name without an expression cannot be turned into an NFA */
        if ( !( ss >> regEx ) ){
            cout<<"ERROR: Token class "<<className<<" has no regular expression\n";
            exit (1);
        }

        tokenClassName.push_back( className );
        string escapedSeq = convertStringToEscapedString ( regEx );
        allRegexNFA.push_back( generateOpStack ( infixToPostfix( implicitConcat ( escapedSeq ) ), inFile ) );
    }

    /*
     * allRegexNFA now holds one NFA per expression. Join them: a single
     * start state goes to the start state of each NFA, and each end state
     * is kept separately so it can be paired with its token class name.
     */
    NFAStateSet resultCombined = combineAllNFA ( allRegexNFA );

    /* Marker rows appear in the same order as the token classes were read */
    int tokenPtr = 0;
    for ( unsigned int i=0; i < resultCombined.size(); i++ ){
        const vector <int> &row = resultCombined[i];
        if ( !row.empty() && row[0] == -1 )
            fpOut<<tokenClassName[tokenPtr++]<<" "<<row[2]<<",";
    }
    fpOut<<"\n";
    fpOut<<uniqueStateID<<"\n";

    /* Dump every real transition; marker rows were already reported above */
    for ( unsigned int i=0; i < resultCombined.size(); i++ ){
        const vector <int> &row = resultCombined[i];
        if ( row.empty() || row[0] == -1 )
            continue;
        for ( unsigned int j=0; j < row.size(); j++ )
            fpOut<<row[j]<<"\t";
        fpOut<<"\n";
    }

    fpCal.close();
    fpOut.close();
    return false;
}
/*
 * Orders the given rows with the project's compare predicate and then
 * prepends one extra row { operators[EPSILON], 0, s }, where s is the
 * second column of the row that sorted first.
 *
 * The argument is taken by value, so the caller's copy is left untouched;
 * the reordered, extended copy is returned.
 */
NFAStateSet sortStatesForDFA ( NFAStateSet unsortedStates )
{
    sort ( unsortedStates.begin(), unsortedStates.end(), compare );

    /* Source state of whichever row ended up at the front */
    const int firstSource = unsortedStates.front()[1];

    vector < int > entryEdge ( 3 );
    entryEdge[0] = operators[EPSILON];
    entryEdge[1] = 0;
    entryEdge[2] = firstSource;

    unsortedStates.insert ( unsortedStates.begin(), entryEdge );
    return unsortedStates;
}
/*
 * Returns the third column of the last marker row (first column == -1)
 * found in states.
 *
 * Returns -1 when states holds no marker row. Previously the result was an
 * uninitialised int in that case.
 */
int getFinalState ( NFAStateSet states )
{
    int finalState = -1;

    for ( unsigned int i=0; i<states.size(); i++ ){
        /* A marker row needs all three columns before column 2 may be read */
        if ( states[i].size() >= 3 && states[i][0] == -1 )
            finalState = states[i][2];
    }

    return finalState;
}
/** Sets, in \p pred, the bit at the index of every non-special vertex that
 * has an edge into \p v. Bits already set in \p pred are left alone. */
static
void buildPred(NFAStateSet &pred, const NGHolder &g, NFAVertex v) {
    for (auto parent : inv_adjacent_vertices_range(v, g)) {
        if (is_special(parent, g)) {
            continue;
        }
        pred.set(g[parent].index);
    }
}
/** Sets, in \p succ, the bit at the index of every non-special vertex that
 * \p v has an edge to. Bits already set in \p succ are left alone. */
static
void buildSucc(NFAStateSet &succ, const NGHolder &g, NFAVertex v) {
    for (auto child : adjacent_vertices_range(v, g)) {
        if (is_special(child, g)) {
            continue;
        }
        succ.set(g[child].index);
    }
}
/** Some squash states are clearly not advantageous in the NFA, as they do * incur the cost of an exception: * -# acyclic states * -# squash only a few acyclic states */ void filterSquashers(const NGHolder &g, map<NFAVertex, NFAStateSet> &squash) { DEBUG_PRINTF("filtering\n"); map<u32, NFAVertex> rev; /* vertex_index -> vertex */ for (auto v : vertices_range(g)) { rev[g[v].index] = v; } for (auto v : vertices_range(g)) { if (!contains(squash, v)) { continue; } DEBUG_PRINTF("looking at squash set for vertex %u\n", g[v].index); if (!hasSelfLoop(v, g)) { DEBUG_PRINTF("acyclic\n"); squash.erase(v); continue; } NFAStateSet squashed = squash[v]; squashed.flip(); /* default sense for mask of survivors */ for (NFAStateSet::size_type sq = squashed.find_first(); sq != squashed.npos; sq = squashed.find_next(sq)) { NFAVertex u = rev[sq]; if (hasSelfLoop(u, g)) { DEBUG_PRINTF("squashing a cyclic (%zu) is always good\n", sq); goto next_vertex; } } if (squashed.count() < MIN_PURE_ACYCLIC_SQUASH) { DEBUG_PRINTF("squash set too small\n"); squash.erase(v); continue; } next_vertex:; DEBUG_PRINTF("squash set ok\n"); } }
/*
 * Builds the list of single-symbol state templates from the first line of
 * fileName.
 *
 * The first line is split on whitespace; only the first character of each
 * token is used. The result holds one row { symbol, -1, -1 } per token,
 * preceded by a row for operators[EPSILON]. The two -1 columns are
 * placeholders for the state ids that callers fill in later.
 *
 * Exits the process with status 1 if the file cannot be opened.
 */
NFAStateSet allTermStates ( string fileName )
{
    ifstream fpCal;

    /* c_str() is enough for open(); the former strdup() copy was never freed */
    fpCal.open( fileName.c_str() );

    if (!fpCal.is_open()){
        perror("Regular expression file doesn't exist or could not be opened\n");
        exit (1);
    }

    string terminalString;
    getline (fpCal,terminalString );

    NFAStateSet singletonStates;

    /*
     * The EPSILON template always comes first
     */
    vector <int> epsilonState;
    epsilonState.push_back( static_cast<int>( operators[EPSILON] ) );
    epsilonState.push_back( -1 );
    epsilonState.push_back( -1 );
    singletonStates.push_back(epsilonState);

    /*
     * Now do the same for all other terminals. Testing the extraction itself
     * stops the loop before a failed read can add a bogus entry, so no
     * trailing pop_back() is needed.
     */
    stringstream ss(terminalString);
    string sub;
    while ( ss >> sub ){
        vector <int> inputState;
        inputState.push_back( static_cast<int>( sub[0] ) );
        inputState.push_back( -1 );
        inputState.push_back( -1 );
        singletonStates.push_back(inputState);
    }

    fpCal.close();
    return singletonStates;
}
/*
 * Flattens the per-expression NFAs into a single row list that shares
 * state 0 as its start.
 *
 * Ordinary rows are copied unchanged. Each marker row { -1, start, end }
 * is replaced by two rows:
 *   { operators[EPSILON], 0, start }   - edge from state 0 into that NFA
 *   { -1, 0, end }                     - marker keeping that NFA's end state
 */
NFAStateSet combineAllNFA ( vector < NFAStateSet> allRegexNFA )
{
    NFAStateSet merged;

    for ( const NFAStateSet &nfa : allRegexNFA ){
        for ( const vector <int> &row : nfa ){
            if ( row[0] != -1 ){
                merged.push_back( row );
                continue;
            }

            vector <int> entryEdge ( 3 );
            entryEdge[0] = (int)operators[EPSILON];
            entryEdge[1] = 0;
            entryEdge[2] = row[1];
            merged.push_back( entryEdge );

            vector <int> endMarker ( 3 );
            endMarker[0] = -1;
            endMarker[1] = 0;
            endMarker[2] = row[2];
            merged.push_back( endMarker );
        }
    }

    return merged;
}
static void findDerivedSquashers(const NGHolder &g, const vector<NFAVertex> &vByIndex, const PostDomTree &pdom_tree, const NFAStateSet &init, map<NFAVertex, NFAStateSet> *squash, som_type som, const vector<DepthMinMax> &som_depths, const ue2::unordered_map<NFAVertex, u32> ®ion_map, smgb_cache &cache) { deque<NFAVertex> remaining; for (const auto &m : *squash) { remaining.push_back(m.first); } while (!remaining.empty()) { NFAVertex v = remaining.back(); remaining.pop_back(); for (auto u : inv_adjacent_vertices_range(v, g)) { if (is_special(u, g)) { continue; } if (g[v].char_reach != g[u].char_reach) { continue; } if (out_degree(u, g) != 1) { continue; } NFAStateSet u_squash(init.size()); u32 u_index = g[u].index; buildSquashMask(u_squash, g, u, g[u].char_reach, init, vByIndex, pdom_tree, som, som_depths, region_map, cache); u_squash.set(u_index); /* never clear ourselves */ if ((~u_squash).any()) { // i.e. some bits unset in mask DEBUG_PRINTF("%u is an upstream squasher of %u\n", u_index, g[v].index); (*squash)[u] = u_squash; remaining.push_back(u); } } } }
/*
 * Removes and returns the top fragment of the operand stack. Exits with an
 * error instead of reading from an empty stack, which happens when an
 * operator in the postfix expression lacks an operand.
 */
static NFAStateSet popNFAOperand ( stack < NFAStateSet > &stateStack )
{
    if ( stateStack.empty() ){
        cout<<"ERROR: Malformed regular expression, operator is missing an operand\n";
        exit(1);
    }
    NFAStateSet operand = stateStack.top();
    stateStack.pop();
    return operand;
}

/*
 * Evaluates a postfix regular expression into a single NFA.
 *
 * postfix  : expression made of terminals and the OR / CONCAT / STAR
 *            operators recognised by isOperator()
 * fileName : file whose first line lists the terminals (read through
 *            allTermStates)
 *
 * Every terminal found in the terminal list is pushed as a one-row fragment
 * with two fresh ids taken from uniqueStateID. Operators pop their operands,
 * combine them and push the result. A terminal that is not in the list
 * pushes nothing.
 *
 * If no operator was applied, a marker row { -1, start, end } is appended
 * to the lone terminal fragment. Fragments produced by the operation*
 * functions visible in this file already end with such a row.
 *
 * Exits the process with status 1 if an operand is missing, if the
 * expression yields no fragment at all, or if more than one fragment is
 * left on the stack at the end.
 */
NFAStateSet generateOpStack ( string postfix, string fileName )
{
    stack < NFAStateSet > stateStack;
    NFAStateSet inputState = allTermStates( fileName );
    bool isSingleton = true;

    for ( unsigned int i=0; i<postfix.length(); i++ ){
        /* Classify the symbol once instead of once per comparison */
        const int op = isOperator( postfix[i] );

        if ( op == -1 ){
            /* Terminal: look up its template and give it fresh state ids */
            for ( unsigned int j = 0; j<inputState.size(); j++ ){
                if ( inputState[j][0] == (int)postfix[i] ){
                    vector <int> uidGiven = inputState[j];
                    uidGiven[1]=uniqueStateID;
                    uniqueStateID++;
                    uidGiven[2]=uniqueStateID;
                    uniqueStateID++;

                    NFAStateSet tempToBePushed;
                    tempToBePushed.push_back(uidGiven);
                    stateStack.push ( tempToBePushed );
                    break;
                }
            }
        }
        else if ( op == OR ){
            /* The right operand sits on top of the stack */
            NFAStateSet operand2 = popNFAOperand ( stateStack );
            NFAStateSet operand1 = popNFAOperand ( stateStack );
            stateStack.push( operationOR ( operand1, operand2 ) );
            isSingleton = false;
        }
        else if ( op == CONCAT ){
            NFAStateSet operand2 = popNFAOperand ( stateStack );
            NFAStateSet operand1 = popNFAOperand ( stateStack );
            stateStack.push( operationCONCAT ( operand1, operand2 ) );
            isSingleton = false;
        }
        else if ( op == STAR ){
            NFAStateSet operand = popNFAOperand ( stateStack );
            stateStack.push( operationSTAR ( operand ) );
            isSingleton = false;
        }
    }

    /* Check before top(): an empty expression leaves nothing on the stack */
    if ( stateStack.empty() ){
        cout<<"ERROR: Regular expression did not produce an NFA\n";
        exit(1);
    }

    NFAStateSet finalStateSet = stateStack.top();
    stateStack.pop();

    if ( isSingleton ){
        /* A bare terminal has no marker row yet; add it */
        vector <int> startEndInfo;
        startEndInfo.push_back(-1);
        startEndInfo.push_back(finalStateSet[0][1]);
        startEndInfo.push_back(finalStateSet[0][2]);
        finalStateSet.push_back( startEndInfo );
    }

    if ( !stateStack.empty() ){
        if ( isSingleton )
            cout<<"ERROR: Stack didn't empty itself\n";
        else
            cout<<"ERROR: NFA stack didn't empty itself\n";
        exit(1);
    }

    return finalStateSet;
}
/*
 * Builds the NFA for a* from the NFA a.
 *
 * Rows of a are copied up to (not including) its first marker row. Two new
 * state ids are taken from uniqueStateID and four EPSILON edges are added:
 *   newStart  -> oldStart      newStart  -> newFinal
 *   oldFinish -> oldStart      oldFinish -> newFinal
 * followed by the marker row { -1, newStart, newFinal }.
 *
 * oldStart/oldFinish come from a's last row when that row is a marker,
 * otherwise from columns 1 and 2 of a's first row.
 */
NFAStateSet operationSTAR ( NFAStateSet a )
{
    /* Small local builder for a three-column row */
    auto makeRow = [] ( int symbol, int from, int to ) {
        vector<int> row;
        row.push_back ( symbol );
        row.push_back ( from );
        row.push_back ( to );
        return row;
    };

    NFAStateSet result;
    for ( unsigned int k = 0; k < a.size(); k++ ){
        if ( a[k][0] == -1 )
            break;
        result.push_back ( a[k] );
    }

    /* Row that carries the operand's start and finish states */
    const vector<int> &ends = ( a.back()[0] == -1 ) ? a.back() : a.front();
    const int oldStart = ends[1];
    const int oldFinish = ends[2];

    const int newStart = uniqueStateID++;
    const int newFinal = uniqueStateID++;

    result.push_back ( makeRow ( operators[EPSILON], newStart, oldStart ) );
    result.push_back ( makeRow ( operators[EPSILON], newStart, newFinal ) );
    result.push_back ( makeRow ( operators[EPSILON], oldFinish, oldStart ) );
    result.push_back ( makeRow ( operators[EPSILON], oldFinish, newFinal ) );
    result.push_back ( makeRow ( -1, newStart, newFinal ) );

    return result;
}
/*
 * Builds the NFA for a@b (a followed by b) from the NFAs a and b.
 *
 * Rows of a, then rows of b, are copied up to (not including) each one's
 * first marker row. Two new state ids are taken from uniqueStateID and three
 * EPSILON edges are added:
 *   newStart   -> oldStart1
 *   oldFinish1 -> oldStart2
 *   oldFinish2 -> newFinal
 * followed by the marker row { -1, newStart, newFinal }.
 *
 * For each operand, start/finish come from its last row when that row is a
 * marker, otherwise from columns 1 and 2 of its first row.
 */
NFAStateSet operationCONCAT ( NFAStateSet a, NFAStateSet b )
{
    /* Small local builder for a three-column row */
    auto makeRow = [] ( int symbol, int from, int to ) {
        vector<int> row;
        row.push_back ( symbol );
        row.push_back ( from );
        row.push_back ( to );
        return row;
    };

    NFAStateSet result;
    for ( unsigned int k = 0; k < a.size(); k++ ){
        if ( a[k][0] == -1 )
            break;
        result.push_back ( a[k] );
    }
    for ( unsigned int k = 0; k < b.size(); k++ ){
        if ( b[k][0] == -1 )
            break;
        result.push_back ( b[k] );
    }

    /* Rows that carry each operand's start and finish states */
    const vector<int> &endsA = ( a.back()[0] == -1 ) ? a.back() : a.front();
    const vector<int> &endsB = ( b.back()[0] == -1 ) ? b.back() : b.front();
    const int oldStart1 = endsA[1];
    const int oldFinish1 = endsA[2];
    const int oldStart2 = endsB[1];
    const int oldFinish2 = endsB[2];

    const int newStart = uniqueStateID++;
    const int newFinal = uniqueStateID++;

    result.push_back ( makeRow ( operators[EPSILON], newStart, oldStart1 ) );
    result.push_back ( makeRow ( operators[EPSILON], oldFinish1, oldStart2 ) );
    result.push_back ( makeRow ( operators[EPSILON], oldFinish2, newFinal ) );
    result.push_back ( makeRow ( -1, newStart, newFinal ) );

    return result;
}
/** * Builds a squash mask based on the pdom tree of v and the given char reach. * The built squash mask is a bit conservative for non-dot cases and could * be improved with a bit of thought. */ static void buildSquashMask(NFAStateSet &mask, const NGHolder &g, NFAVertex v, const CharReach &cr, const NFAStateSet &init, const vector<NFAVertex> &vByIndex, const PostDomTree &tree, som_type som, const vector<DepthMinMax> &som_depths, const ue2::unordered_map<NFAVertex, u32> ®ion_map, smgb_cache &cache) { DEBUG_PRINTF("build base squash mask for vertex %u)\n", g[v].index); vector<NFAVertex> q; PostDomTree::const_iterator it = tree.find(v); if (it != tree.end()) { q.insert(q.end(), it->second.begin(), it->second.end()); } const u32 v_index = g[v].index; while (!q.empty()) { NFAVertex u = q.back(); q.pop_back(); const CharReach &cru = g[u].char_reach; if ((cru & ~cr).any()) { /* bail: bad cr on vertex u */ /* TODO: this could be better * * we still need to ensure that we record any paths leading to u. * Hence all vertices R which can reach u must be excluded from the * squash mask. Note: R != pdom(u) and there may exist an x in (R - * pdom(u)) which is in pdom(y) where y is in q. Clear ? */ mask.set(); return; } const u32 u_index = g[u].index; if (som) { /* We cannot add a state u to the squash mask of v if it may have an * earlier start of match offset. 
ie for us to add a state u to v * maxSomDist(u) <= minSomDist(v) */ const depth &max_som_dist_u = som_depths[u_index].max; const depth &min_som_dist_v = som_depths[v_index].min; if (max_som_dist_u.is_infinite()) { /* it is hard to tell due to the INF if u can actually store an * earlier SOM than w (state we are building the squash mask * for) - need to think more deeply */ if (mustBeSetBefore(u, v, g, cache) && !somMayGoBackwards(u, g, region_map, cache)) { DEBUG_PRINTF("u %u v %u\n", u_index, v_index); goto squash_ok; } } if (max_som_dist_u > min_som_dist_v) { /* u can't be squashed as it may be storing an earlier SOM */ goto add_children_to_queue; } } squash_ok: mask.set(u_index); DEBUG_PRINTF("pdom'ed %u\n", u_index); add_children_to_queue: it = tree.find(u); if (it != tree.end()) { q.insert(q.end(), it->second.begin(), it->second.end()); } } if (cr.all()) { /* the init states aren't in the pdom tree. If all their succ states * are set (or v), we can consider them post dominated */ /* Note: init states will always result in a later som */ for (size_t i = init.find_first(); i != init.npos; i = init.find_next(i)) { /* Yes vacuous patterns do exist */ NFAVertex iv = vByIndex[i]; for (auto w : adjacent_vertices_range(iv, g)) { if (w == g.accept || w == g.acceptEod) { DEBUG_PRINTF("skipping %zu due to vacuous accept\n", i); goto next_init_state; } u32 vert_id = g[w].index; if (w != iv && w != v && !mask.test(vert_id)) { DEBUG_PRINTF("skipping %zu due to %u\n", i, vert_id); goto next_init_state; } } DEBUG_PRINTF("pdom'ed %zu\n", i); mask.set(i); next_init_state:; } } mask.flip(); }