예제 #1
0
/*
 * Reads the regular-expression specification in inFile, builds one NFA per
 * token class, joins them with combineAllNFA() and writes the combined NFA
 * to outFile.
 *
 * Input, as consumed here:
 *   - the first line is read and discarded;
 *   - every following line holds "<token class name> <regex>" separated by
 *     whitespace. Lines containing no tokens at all are skipped.
 *
 * Output, as produced here:
 *   - line 1: "<token class name> <third column>," for every marker row
 *     (row whose first column is -1), in order of appearance;
 *   - line 2: the current value of uniqueStateID;
 *   - then one tab-separated line per non-marker row.
 *
 * The process is terminated with exit(1) when either file cannot be opened
 * or when a line names a token class but carries no regular expression.
 *
 * Returns false (the original code returned 0 unconditionally); the value is
 * kept as-is so that existing callers observe no change.
 */
bool fileReadWrite ( string inFile, string outFile )
{
	ifstream fpCal;
	ofstream fpOut;
	/* c_str() is sufficient for open(); the former strdup() copies leaked. */
	fpCal.open( inFile.c_str() );
	fpOut.open( outFile.c_str() );
	if (!fpCal.is_open() || !fpOut.is_open()){
		perror("Regular expression or output file doesn't exist or could not be opened\n");
		exit (1);
	}
	string discardAlphabetString;
	getline (fpCal,discardAlphabetString );
	vector < NFAStateSet > allRegexNFA;
	vector <string> tokenClassName;

	string infixExp;
	while (	getline ( fpCal,infixExp ) ){
		stringstream ss(infixExp);
		string className;
		string regEx;
		/* Nothing on the line: skip it instead of indexing past the end. */
		if ( !(ss >> className) )
			continue;
		/* A name without an expression cannot be turned into an NFA. */
		if ( !(ss >> regEx) ){
			fprintf ( stderr, "ERROR: token class '%s' has no regular expression\n", className.c_str() );
			exit (1);
		}
		tokenClassName.push_back( className );
		string escapedSeq = convertStringToEscapedString ( regEx );
		allRegexNFA.push_back( generateOpStack ( infixToPostfix( implicitConcat ( escapedSeq ) ), inFile ) );
	}
	/*
	 * Now allRegexNFA contains all NFAs for each expression as a vector
	 * of vectors. Now call function to join these together, single start
	 * state goes to each start state of these NFAs, and then each end
	 * is stored separately with TOKEN_IDs (contained in tokenClassName)
	 */
	NFAStateSet resultCombined = combineAllNFA ( allRegexNFA );

	/* First output line: one "<name> <state>," entry per marker row. */
	vector <string>::size_type tokenPtr = 0;
	for ( unsigned int i=0; i < resultCombined.size(); i++ ){
		const vector <int> &row = resultCombined[i];
		if ( row.empty() || row[0] != -1 )
			continue;
		/* Never read past the collected names if the counts disagree. */
		if ( tokenPtr >= tokenClassName.size() )
			break;
		fpOut<<tokenClassName[tokenPtr++]<<" "<<row[2]<<",";
	}
	fpOut<<"\n";
	fpOut<<uniqueStateID<<"\n";

	/* Remaining lines: every non-marker row, columns separated by tabs. */
	for ( unsigned int i=0; i < resultCombined.size(); i++ ){
		const vector <int> &row = resultCombined[i];
		if ( row.empty() || row[0] == -1 )
			continue;
		for ( unsigned int j=0; j<row.size(); j++ )
			fpOut<<row[j]<<"\t";
		fpOut<<"\n";
	}

	fpCal.close();
	fpOut.close();
	return false;
}
예제 #2
0
/*
 * Sorts the given rows with the file-level `compare` predicate and then
 * places one extra row in front of them: an EPSILON transition from state 0
 * to the state found in column 1 of the first row after sorting.
 *
 * The argument is taken by value, so the caller's set is left untouched and
 * the sorted, extended copy is returned.
 *
 * Precondition: unsortedStates must hold at least one row, because the
 * first sorted row is read unconditionally.
 */
NFAStateSet sortStatesForDFA ( NFAStateSet unsortedStates )
{
	sort ( unsortedStates.begin(), unsortedStates.end(), compare );

	/* Row layout: [symbol, from, to]. */
	vector < int > entryRow ( 3 );
	entryRow[0] = operators[EPSILON];
	entryRow[1] = 0;
	entryRow[2] = unsortedStates[0][1];

	unsortedStates.insert ( unsortedStates.begin(), entryRow );
	return unsortedStates;
}
예제 #3
0
/*
 * Returns the third column of the last marker row (a row whose first column
 * is -1) found in states.
 *
 * Returns -1 when states holds no marker row. The previous code returned an
 * uninitialized value in that case.
 */
int getFinalState ( NFAStateSet states )
{
	int finalState = -1;
	for ( unsigned int i=0; i<states.size(); i++)
		if ( states[i][0] == -1 )
			finalState = states[i][2];
	return finalState;
}
예제 #4
0
/**
 * Sets, in \p pred, the bit at g[u].index for every vertex u produced by
 * inv_adjacent_vertices_range(v, g) for which is_special(u, g) is false.
 * Bits already set in \p pred are left as they are.
 */
static
void buildPred(NFAStateSet &pred, const NGHolder &g, NFAVertex v) {
    for (auto u : inv_adjacent_vertices_range(v, g)) {
        if (is_special(u, g)) {
            continue; /* special vertices are never recorded */
        }
        pred.set(g[u].index);
    }
}
예제 #5
0
/**
 * Sets, in \p succ, the bit at g[w].index for every vertex w produced by
 * adjacent_vertices_range(v, g) for which is_special(w, g) is false.
 * Bits already set in \p succ are left as they are.
 */
static
void buildSucc(NFAStateSet &succ, const NGHolder &g, NFAVertex v) {
    for (auto w : adjacent_vertices_range(v, g)) {
        if (is_special(w, g)) {
            continue; /* special vertices are never recorded */
        }
        succ.set(g[w].index);
    }
}
예제 #6
0
/** Some squash states are clearly not advantageous in the NFA, as they do
 * incur the cost of an exception:
 * -# acyclic states
 * -# squash only a few acyclic states
 */
void filterSquashers(const NGHolder &g,
                     map<NFAVertex, NFAStateSet> &squash) {
    DEBUG_PRINTF("filtering\n");
    map<u32, NFAVertex> rev; /* vertex_index -> vertex */
    for (auto v : vertices_range(g)) {
        rev[g[v].index] = v;
    }

    for (auto v : vertices_range(g)) {
        if (!contains(squash, v)) {
            continue;
        }
        DEBUG_PRINTF("looking at squash set for vertex %u\n",
                     g[v].index);

        if (!hasSelfLoop(v, g)) {
            DEBUG_PRINTF("acyclic\n");
            squash.erase(v);
            continue;
        }

        NFAStateSet squashed = squash[v];
        squashed.flip(); /* default sense for mask of survivors */
        for (NFAStateSet::size_type sq = squashed.find_first();
             sq != squashed.npos; sq = squashed.find_next(sq)) {
            NFAVertex u = rev[sq];
            if (hasSelfLoop(u, g)) {
                DEBUG_PRINTF("squashing a cyclic (%zu) is always good\n", sq);
                goto next_vertex;
            }
        }

        if (squashed.count() < MIN_PURE_ACYCLIC_SQUASH) {
            DEBUG_PRINTF("squash set too small\n");
            squash.erase(v);
            continue;
        }

    next_vertex:;
        DEBUG_PRINTF("squash set ok\n");
    }
}
예제 #7
0
/*
 * Builds the list of single-symbol transition templates from the first line
 * of fileName.
 *
 * The first line is split on whitespace and the first character of every
 * token is taken as an input symbol. The returned set holds one row per
 * symbol in the layout [symbol, -1, -1], preceded by a row for
 * operators[EPSILON]. The two -1 columns are placeholders; generateOpStack()
 * overwrites them with freshly allocated state IDs on a copy of the row.
 *
 * Terminates the process with exit(1) if the file cannot be opened.
 */
NFAStateSet allTermStates ( string fileName )
{
	ifstream fpCal;
	/* c_str() is sufficient for open(); the former strdup() copy leaked. */
	fpCal.open( fileName.c_str() );
	if (!fpCal.is_open()){
		perror("Regular expression file doesn't exist or could not be opened\n");
		exit (1);
	}

	string terminalString;
	NFAStateSet singletonStates;

	getline (fpCal,terminalString );
	stringstream ss(terminalString); // Insert the string into a stream
	/*
	 * Push the EPSILON template first, with placeholder state IDs
	 */
	vector <int> epsilonState;
	epsilonState.push_back( (int)operators[EPSILON] );
	epsilonState.push_back( -1 );
	epsilonState.push_back( -1 );
	singletonStates.push_back(epsilonState);
	/*
	 * Now do the same for all other terminals. Testing the extraction
	 * itself stops the loop exactly after the last token, so no spurious
	 * trailing row is created that would have to be popped afterwards.
	 */
	string sub;
	while ( ss >> sub ){
		vector <int> inputState;
		inputState.push_back( (int)sub[0] );
		inputState.push_back(-1);
		inputState.push_back(-1);
		singletonStates.push_back(inputState);
	}

	fpCal.close();
	return singletonStates;
}
예제 #8
0
/*
 * Flattens the per-expression NFAs into one row list.
 *
 * Rows whose first column is not -1 are copied unchanged. Every marker row
 * [-1, start, end] is replaced by two rows:
 *   [operators[EPSILON], 0, start]  - EPSILON transition from state 0
 *   [-1, 0, end]                    - marker row carrying the end state
 * Rows keep the order in which they appear in the input.
 */
NFAStateSet combineAllNFA ( vector < NFAStateSet> allRegexNFA )
{
	NFAStateSet merged;
	for ( unsigned int n=0; n<allRegexNFA.size(); n++ ){
		const NFAStateSet &rows = allRegexNFA[n];
		for ( unsigned int r=0; r<rows.size(); r++ ){
			const vector <int> &row = rows[r];

			/* Ordinary transition: carried over as it is. */
			if ( row[0] != -1 ){
				merged.push_back ( row );
				continue;
			}

			/* Marker row: hook its start state up to state 0 ... */
			vector <int> entry ( 3 );
			entry[0] = (int)operators[EPSILON];
			entry[1] = 0;
			entry[2] = row[1];
			merged.push_back ( entry );

			/* ... and remember its end state in a new marker row. */
			vector <int> startEndInfo ( 3 );
			startEndInfo[0] = -1;
			startEndInfo[1] = 0;
			startEndInfo[2] = row[2];
			merged.push_back ( startEndInfo );
		}
	}
	return merged;
}
예제 #9
0
static
void findDerivedSquashers(const NGHolder &g, const vector<NFAVertex> &vByIndex,
                          const PostDomTree &pdom_tree, const NFAStateSet &init,
                          map<NFAVertex, NFAStateSet> *squash, som_type som,
                          const vector<DepthMinMax> &som_depths,
                          const ue2::unordered_map<NFAVertex, u32> &region_map,
                          smgb_cache &cache) {
    deque<NFAVertex> remaining;
    for (const auto &m : *squash) {
        remaining.push_back(m.first);
    }

    while (!remaining.empty()) {
        NFAVertex v = remaining.back();
        remaining.pop_back();

        for (auto u : inv_adjacent_vertices_range(v, g)) {
            if (is_special(u, g)) {
                continue;
            }

            if (g[v].char_reach != g[u].char_reach) {
                continue;
            }

            if (out_degree(u, g) != 1) {
                continue;
            }

            NFAStateSet u_squash(init.size());
            u32 u_index = g[u].index;

            buildSquashMask(u_squash, g, u, g[u].char_reach, init, vByIndex,
                            pdom_tree, som, som_depths, region_map, cache);

            u_squash.set(u_index); /* never clear ourselves */

            if ((~u_squash).any()) { // i.e. some bits unset in mask
                DEBUG_PRINTF("%u is an upstream squasher of %u\n", u_index,
                             g[v].index);
                (*squash)[u] = u_squash;
                remaining.push_back(u);
            }
        }
    }
}
예제 #10
0
/*
 * Evaluates a postfix regular expression on a stack of partial NFAs and
 * returns the NFA that remains.
 *
 * postfix  - expression in postfix form; every character is classified with
 *            isOperator().
 * fileName - file handed to allTermStates() to obtain the symbol templates.
 *
 * For a character that is not an operator, the matching template row is
 * copied, given two new state IDs from uniqueStateID and pushed as a one-row
 * NFA. A non-operator character without a matching template is ignored, as
 * before. OR and CONCAT pop two operands, STAR pops one; the result of the
 * corresponding operation*() call is pushed back.
 *
 * When no operator was applied, the single row left on the stack has no
 * marker row yet, so [-1, start, end] is appended here before returning.
 *
 * Terminates the process with exit(1) when an operator lacks operands, when
 * nothing was built at all, or when more than one NFA is left at the end.
 * The first two situations previously read from an empty stack.
 */
NFAStateSet generateOpStack ( string postfix, string fileName )
{
	stack < NFAStateSet > stateStack;
	NFAStateSet inputState = allTermStates( fileName );
	bool isSingleton = true;
	for ( unsigned int i=0; i<postfix.length(); i++ ){
		/* Classify once instead of once per branch. */
		const int op = isOperator( postfix[i] );
		if ( op == -1 ){
			//use the nfa for this state
			for ( unsigned int j = 0; j<inputState.size(); j++ ){
				if ( inputState[j][0] == (int)postfix[i] ){
					NFAStateSet tempToBePushed;
					/*
					 * Copy this to uidGiven to assign unique IDs
					 */
					vector <int> uidGiven = inputState[j];
					uidGiven[1]=uniqueStateID;
					uniqueStateID++;
					uidGiven[2]=uniqueStateID;
					uniqueStateID++;

					tempToBePushed.push_back(uidGiven);
					stateStack.push ( tempToBePushed );
					break;
				}
			}
		}
		else if ( op == OR || op == CONCAT ){
			//send the two preceding states to the function to OR / CONCAT
			if ( stateStack.size() < 2 ){
				cout<<"ERROR: Binary operator is missing an operand\n";
				exit(1);
			}
			NFAStateSet operand2 = stateStack.top();
			stateStack.pop();
			NFAStateSet operand1 = stateStack.top();
			stateStack.pop();
			if ( op == OR )
				stateStack.push( operationOR ( operand1, operand2 ) );
			else
				stateStack.push( operationCONCAT ( operand1, operand2 ) );
			isSingleton = false;
		}
		else if ( op == STAR ){
			//send the preceding state to the function to STAR
			if ( stateStack.empty() ){
				cout<<"ERROR: STAR operator is missing its operand\n";
				exit(1);
			}
			NFAStateSet operand = stateStack.top();
			stateStack.pop();
			stateStack.push( operationSTAR ( operand ) );
			isSingleton = false;
		}
	}

	/* top() on an empty stack is undefined, so rule that out first. */
	if ( stateStack.empty() ){
		cout<<"ERROR: No NFA could be built from the expression\n";
		exit(1);
	}
	NFAStateSet finalStateSet = stateStack.top();
	stateStack.pop();

	/* Exactly one NFA must remain once the expression is consumed. */
	if ( !stateStack.empty() ){
		if ( isSingleton )
			cout<<"ERROR: Stack didn't empty itself\n";
		else
			cout<<"ERROR: NFA stack didn't empty itself\n";
		exit(1);
	}

	if ( isSingleton ){
		/* A lone symbol row carries no marker row yet; add it. */
		vector <int> startEndInfo;
		startEndInfo.push_back(-1);
		startEndInfo.push_back(finalStateSet[0][1]);
		startEndInfo.push_back(finalStateSet[0][2]);
		finalStateSet.push_back( startEndInfo );
	}
	return finalStateSet;
}
예제 #11
0
/*
 * Builds the NFA for a* from the NFA a.
 *
 * The rows of a up to (not including) its first marker row are copied. The
 * start and finish of a are taken from its last row when that row is a
 * marker row ([-1, start, finish]); otherwise from columns 1 and 2 of its
 * first row. Two new state IDs are drawn from uniqueStateID and these rows
 * are appended, in this order:
 *   EPSILON: new start  -> old start
 *   EPSILON: new start  -> new final
 *   EPSILON: old finish -> old start
 *   EPSILON: old finish -> new final
 *   marker : [-1, new start, new final]
 *
 * Precondition: a must hold at least one row.
 */
NFAStateSet operationSTAR ( NFAStateSet a )
{
	NFAStateSet result;
	for ( unsigned int k=0; k<a.size(); k++ ){
		if ( a[k][0] == -1 )
			break;
		result.push_back( a[k] );
	}

	const vector<int> &lastRow = a[a.size()-1];
	const bool hasMarker = ( lastRow[0] == -1 );
	const int oldStart = hasMarker ? lastRow[1] : a[0][1];
	const int oldFinish = hasMarker ? lastRow[2] : a[0][2];

	const int newStart = uniqueStateID++;
	const int newFinal = uniqueStateID++;

	const int eps = operators[EPSILON];
	const int appended[5][3] = {
		{ eps, newStart,  oldStart },
		{ eps, newStart,  newFinal },
		{ eps, oldFinish, oldStart },
		{ eps, oldFinish, newFinal },
		{ -1,  newStart,  newFinal }
	};
	for ( unsigned int r=0; r<5; r++ )
		result.push_back( vector<int>( appended[r], appended[r]+3 ) );

	return result;
}
예제 #12
0
/*
 * Builds the NFA for a followed by b (a@b).
 *
 * The rows of a and then of b are copied, each up to (not including) its
 * first marker row. For either operand, start and finish are taken from its
 * last row when that row is a marker row ([-1, start, finish]); otherwise
 * from columns 1 and 2 of its first row. Two new state IDs are drawn from
 * uniqueStateID and these rows are appended, in this order:
 *   EPSILON: new start   -> start of a
 *   EPSILON: finish of a -> start of b
 *   EPSILON: finish of b -> new final
 *   marker : [-1, new start, new final]
 *
 * Precondition: a and b must each hold at least one row.
 */
NFAStateSet operationCONCAT ( NFAStateSet a, NFAStateSet b )
{
	NFAStateSet result;
	for ( unsigned int k=0; k<a.size(); k++ ){
		if ( a[k][0] == -1 )
			break;
		result.push_back( a[k] );
	}
	for ( unsigned int k=0; k<b.size(); k++ ){
		if ( b[k][0] == -1 )
			break;
		result.push_back( b[k] );
	}

	const vector<int> &lastA = a[a.size()-1];
	const bool markerA = ( lastA[0] == -1 );
	const int startA = markerA ? lastA[1] : a[0][1];
	const int finishA = markerA ? lastA[2] : a[0][2];

	const vector<int> &lastB = b[b.size()-1];
	const bool markerB = ( lastB[0] == -1 );
	const int startB = markerB ? lastB[1] : b[0][1];
	const int finishB = markerB ? lastB[2] : b[0][2];

	const int newStart = uniqueStateID++;
	const int newFinal = uniqueStateID++;

	const int eps = operators[EPSILON];
	const int appended[4][3] = {
		{ eps, newStart, startA },
		{ eps, finishA,  startB },
		{ eps, finishB,  newFinal },
		{ -1,  newStart, newFinal }
	};
	for ( unsigned int r=0; r<4; r++ )
		result.push_back( vector<int>( appended[r], appended[r]+3 ) );

	return result;
}
예제 #13
0
/**
 * Builds a squash mask based on the pdom tree of v and the given char reach.
 * The built squash mask is a bit conservative for non-dot cases and could
 * be improved with a bit of thought.
 *
 * While walking, a bit is set in \p mask for every state selected for
 * squashing; the mask is flipped just before returning, so on a normal
 * return the selected states are the CLEAR bits. If the walk bails out,
 * every bit is set and no flip happens, i.e. nothing is selected.
 *
 * \param mask        in/out bitset indexed by vertex index. NOTE(review):
 *                    the result is only meaningful if the caller passes it
 *                    in with all bits clear - confirm at the call sites.
 * \param g           graph holding the vertices.
 * \param v           vertex the mask is built for.
 * \param cr          char reach compared against the reach of each visited
 *                    vertex.
 * \param init        bitset of init state indices; only consulted when
 *                    cr.all() holds.
 * \param vByIndex    vertex lookup by index, used for the init states.
 * \param tree        maps a vertex to a collection of vertices that is
 *                    pushed onto the work list (presumably the vertices it
 *                    post-dominates - confirm against PostDomTree).
 * \param som         when it evaluates to true, the start-of-match depth
 *                    checks below are applied.
 * \param som_depths  per-vertex-index min/max SOM depths.
 * \param region_map  passed through to somMayGoBackwards().
 * \param cache       passed through to mustBeSetBefore() and
 *                    somMayGoBackwards().
 */
static
void buildSquashMask(NFAStateSet &mask, const NGHolder &g, NFAVertex v,
                     const CharReach &cr, const NFAStateSet &init,
                     const vector<NFAVertex> &vByIndex, const PostDomTree &tree,
                     som_type som, const vector<DepthMinMax> &som_depths,
                     const ue2::unordered_map<NFAVertex, u32> &region_map,
                     smgb_cache &cache) {
    DEBUG_PRINTF("build base squash mask for vertex %u)\n",
                 g[v].index);

    /* Work list of vertices still to examine; consumed from the back. */
    vector<NFAVertex> q;

    /* Seed the work list with the tree entry of v, if there is one. */
    PostDomTree::const_iterator it = tree.find(v);
    if (it != tree.end()) {
        q.insert(q.end(), it->second.begin(), it->second.end());
    }

    const u32 v_index = g[v].index;

    while (!q.empty()) {
        NFAVertex u = q.back();
        q.pop_back();
        const CharReach &cru = g[u].char_reach;

        /* u reaches on some character that is outside cr. */
        if ((cru & ~cr).any()) {
            /* bail: bad cr on vertex u */
            /* TODO: this could be better
             *
             * we still need to ensure that we record any paths leading to u.
             * Hence all vertices R which can reach u must be excluded from the
             * squash mask. Note: R != pdom(u) and there may exist an x in (R -
             * pdom(u)) which is in pdom(y) where y is in q. Clear ?
             */
            /* All bits set and no flip below: nothing ends up selected. */
            mask.set();
            return;
        }

        const u32 u_index = g[u].index;

        if (som) {
            /* We cannot add a state u to the squash mask of v if it may have an
             * earlier start of match offset. ie for us to add a state u to v
             * maxSomDist(u) <= minSomDist(v)
             */
            const depth &max_som_dist_u = som_depths[u_index].max;
            const depth &min_som_dist_v = som_depths[v_index].min;

            if (max_som_dist_u.is_infinite()) {
                /* it is hard to tell due to the INF if u can actually store an
                 * earlier SOM than v (state we are building the squash mask
                 * for) - need to think more deeply
                 */

                /* Both helper checks passing lets u be selected despite the
                 * infinite depth; this skips the depth comparison below. */
                if (mustBeSetBefore(u, v, g, cache)
                    && !somMayGoBackwards(u, g, region_map, cache)) {
                    DEBUG_PRINTF("u %u v %u\n", u_index, v_index);
                    goto squash_ok;
                }
            }

           if (max_som_dist_u > min_som_dist_v) {
                /* u can't be squashed as it may be storing an earlier SOM */
                /* u itself is skipped, but its tree entry is still walked. */
                goto add_children_to_queue;
            }

        }

    squash_ok:
        /* Select u; the bit is inverted by the final flip. */
        mask.set(u_index);
        DEBUG_PRINTF("pdom'ed %u\n", u_index);
    add_children_to_queue:
        /* Continue the walk with the tree entry of u, if there is one. */
        it = tree.find(u);
        if (it != tree.end()) {
            q.insert(q.end(), it->second.begin(), it->second.end());
        }
    }

    if (cr.all()) {
        /* the init states aren't in the pdom tree. If all their succ states
         * are set (or v), we can consider them post dominated */

        /* Note: init states will always result in a later som */
        for (size_t i = init.find_first(); i != init.npos;
             i = init.find_next(i)) {
            /* Yes vacuous patterns do exist */
            NFAVertex iv = vByIndex[i];
            for (auto w : adjacent_vertices_range(iv, g)) {
                /* An init state adjacent to an accept is never selected. */
                if (w == g.accept || w == g.acceptEod) {
                    DEBUG_PRINTF("skipping %zu due to vacuous accept\n", i);
                    goto next_init_state;
                }

                /* Every adjacent vertex other than iv itself and v must
                 * already be selected, otherwise this init state is skipped. */
                u32 vert_id = g[w].index;
                if (w != iv && w != v && !mask.test(vert_id)) {
                    DEBUG_PRINTF("skipping %zu due to %u\n", i, vert_id);
                    goto next_init_state;
                }
            }
            DEBUG_PRINTF("pdom'ed %zu\n", i);
            mask.set(i);
        next_init_state:;
        }
    }

    /* Invert the sense: selected states become the clear bits. */
    mask.flip();
}