//merges dfas to one dfa for traversal lexer_word_repr* lexer_dfa_builder::mergeDfas(const std::vector<lexer_word_repr*>* const words, DfaManager& dfaManager) const { lexer_word_repr* start = dfaManager.createLexerWordRepr(); //So each JOB consists of: // 1) a ptr to dfa node where we left off in mergeToWord // 2) a transition that we suspect is placeable in mergetToWord // 3) a vector of dfa nodes already visited in mergeTo, this prevents folding the fromDfa back "itself" // but in reality there should be at path dinstinguishing mergeFrom from mergeTo that doesn't visit // a node in MergeTo twice. This only makes sense, because the "going back itself" thing is strictly for // kleen closure like behaviour on runtime. The minimal "description" of automata is actually constant // and doesn't need to have two nodes twice. // repeated. auto jobQueue = new std::vector<std::vector<std::tuple<lexer_dfa*, LexerTransition, std::vector<lexer_dfa*>>>*>(); for (int i = 0; i < words->size(); i++) { auto jobVector = new std::vector<std::tuple<lexer_dfa*, LexerTransition, std::vector<lexer_dfa*>>>(); jobQueue->push_back(jobVector); } std::cout << std::endl << "Merging dfas to one, words size: " << words->size() << std::endl; //iterate through words int jobLineIndex = 0; for (int i =0; i < words->size(); i++) { lexer_dfa* word = words->at(i); std::cout << word << std::endl; std::cout << word->getId() << std::endl; lexer_dfa* mergeToDfaPtr = start; lexer_dfa* mergeFromDfaPtr = word; std::vector<LexerTransition> nextTransitions = mergeFromDfaPtr->getTransitions(); DeLOG("Getting transitions for word\n") for (auto transition : nextTransitions) { DeLOG("Adding job to job Queue\n"); std::tuple<lexer_dfa*, LexerTransition, std::vector<lexer_dfa*>> job(mergeToDfaPtr, transition, std::vector<lexer_dfa*>{}); (jobQueue->at(jobLineIndex))->push_back(job); } jobLineIndex++; } DeLOG(std::string("\nMerge Process: ").append(std::to_string(jobQueue->size())).append(" job lines(queues) total\n").c_str()); for (jobLineIndex = 0; jobLineIndex < jobQueue->size(); jobLineIndex++) { DeLOG(std::string("Processing Job line #").append(std::to_string(jobLineIndex+1)).append("\n").c_str()); auto jobVector = jobQueue->at(jobLineIndex); while (jobVector->size() != 0) { DeLOG(std::string("\nThere are ").append(std::to_string(jobVector->size())).append(" in job vector. Processing Job #").append(std::to_string(jobLineIndex + 1)).append("\n").c_str()); auto currJobTuple = jobVector->back(); jobVector->pop_back(); auto currMergeToDfaPtr = std::get<0>(currJobTuple); //here we'll check the currMergeToDfaPtr against the previously visited ptrs in MergeTo //if we've already visited it, we know to ignore it this transition (and NOT to put job back in queue) auto ptrsInMergeToAlreadyVisited = std::get<2>(currJobTuple); auto skipAndContinue = false; for (auto mergeToDfaPtrVisited : ptrsInMergeToAlreadyVisited) { if (currMergeToDfaPtr == mergeToDfaPtrVisited) { skipAndContinue = true; break; } } if (skipAndContinue) { continue; } ptrsInMergeToAlreadyVisited.push_back(currMergeToDfaPtr); auto transitionFromCurrMergeFromDfaPtr = std::get<1>(currJobTuple); auto nextMergeFromDfaPtr = transitionFromCurrMergeFromDfaPtr.getDfaNode(); const auto si = transitionFromCurrMergeFromDfaPtr.getStateAndInput(); DeLOG(std::string{"si = {"}.append(std::to_string(si.getState())).append(1, si.getInput()).append("}\n").c_str()); auto nextMergeToDfaPtrCandidateInfo1 = currMergeToDfaPtr->getNextDfaForInput(si.getInput(), false); auto nextMergeToDfaPtrCandidateProperties1 = nextMergeToDfaPtrCandidateInfo1.second; auto nextMergeToDfaPtrCandidateInfo2 = currMergeToDfaPtr->getNextDfaForInput(si.getInput(), true); auto nextMergeToDfaPtrCandidateProperties2 = nextMergeToDfaPtrCandidateInfo2.second; //aka mergeFromDfaProperties const auto currentMergeFromTransitionProperties = transitionFromCurrMergeFromDfaPtr.getProperties(); if (checkForProperty(currentMergeFromTransitionProperties, Lexer_Dfa_Properties::ISA_PUSH_DOWN_CONTINUANCE) || checkForProperty(currentMergeFromTransitionProperties, Lexer_Dfa_Properties::ISA_PUSH_DOWN_EJECT)) { //If the transition ~to~ current ~mergeFrom~ dfa has the // properties: PUSH_DOWN_EJECT or PUSH_DOWN_CONTINUANCE, // then we assume we are in a state of recursion. If the // recursive pathway in ~mergeTo~ dfa is not taken, // nextMergeaToDfaPtrCandidatePropertes is 0x0, then we // add the current ~mergeFrom~ dfa (noting that it is for a // recursion (stack-count > 0) to the currMergeToDfa. // If the position is filled, properties != 0x0 in dfaInfo // when we query with param indicating stackCount > 0, // then we push the ~nextMergeTo~ dfa into thejobQueue // (along with the properties of this is not already being // done (like w\ the entire transition)) lexer_dfa* nextMergeToDfaPtr = nullptr; if (nextMergeToDfaPtrCandidateProperties2 != 0x0) { nextMergeToDfaPtr = nextMergeToDfaPtrCandidateInfo2.first; } currMergeToDfaPtr->_printTransitions(); std::cout << "is there nextDfaPtr? " << (nextMergeToDfaPtr != nullptr ? "yes" : "no") << nextMergeToDfaPtr << std::endl; if (nextMergeToDfaPtr == nullptr) { LexerStateAndInput aLexerStateAndInput = transitionFromCurrMergeFromDfaPtr.getStateAndInput(); DeLOG(std::string{"::adding transition(["}.append(std::to_string(aLexerStateAndInput.getState())).append(", '").append(1, aLexerStateAndInput.getInput()).append("']->").append(1, nextMergeFromDfaPtr->getId()).append(") to dfa(").append(1, currMergeToDfaPtr->getId()).append(")\n").c_str()); StateAndInput<int,char> aStateAndInput(aLexerStateAndInput.getState(), aLexerStateAndInput.getInput(), transitionFromCurrMergeFromDfaPtr.getIsRanged()); currMergeToDfaPtr->add_next_dfa(aStateAndInput, nextMergeFromDfaPtr, currentMergeFromTransitionProperties); //Sanity check auto nextMergeToDfaPtrCandidateInfoSanity = currMergeToDfaPtr->getNextDfaForInput(si.getInput(), true); auto nextMergeToDfaPtrCandidatePropertiesSanity = nextMergeToDfaPtrCandidateInfoSanity.second; if (nextMergeToDfaPtrCandidateInfoSanity.first == nullptr) { perror("\nCould not find Node. Exiting.\n"); exit(EXIT_FAILURE); } } else { std::vector<LexerTransition> nextTransitions = nextMergeFromDfaPtr->getTransitions(); for (auto transitionFromNextMergeFromDfa : nextTransitions) { const auto si = transitionFromNextMergeFromDfa.getStateAndInput(); DeLOG(std::string{"Couldn't find an opening, pushing back job { to add ("}.append(std::to_string(si.getState())).append(",").append(1, si.getInput()).append(") from dfa-id(").append(std::to_string(nextMergeToDfaPtr->getId())).append(")\n").c_str()); std::tuple<lexer_dfa*, LexerTransition, std::vector<lexer_dfa*>> job(const_cast<lexer_dfa*>(nextMergeToDfaPtr), transitionFromNextMergeFromDfa, ptrsInMergeToAlreadyVisited); jobVector->push_back(job); } } } else if (checkForProperty(currentMergeFromTransitionProperties, Lexer_Dfa_Properties::ISA_NORMAL) || checkForProperty(currentMergeFromTransitionProperties, Lexer_Dfa_Properties::ISA_PUSH_DOWN_ACTIVATOR)) { //Likewise if the transition ~to~ current ~mergeFrom~ dfa // has the properties: NORMAL or PUSH_DOWN_ACTIVATOR, // then we assume then we assume a non-recursive (even if // by the top top level lexer perspective this has yet to // be determined. If the non-recursive pathway in ~mergeTo~ // dfa is not taken, then we add the current ~mergeTo~ dfa // (noting that it is for a recursion (stack-count > 0). // If the position is filled (non-nullptr) dfa result for // query with stackCount > 0 param, then we push the // ~nextMergeTo~ into thejobQueue. lexer_dfa* nextMergeToDfaPtr = nullptr; if (nextMergeToDfaPtrCandidateProperties1 != 0x0) { nextMergeToDfaPtr = nextMergeToDfaPtrCandidateInfo1.first; } currMergeToDfaPtr->_printTransitions(); std::cout << "is there nextDfaPtr? " << (nextMergeToDfaPtr != nullptr ? "yes" : "no") << nextMergeToDfaPtr << std::endl; if (nextMergeToDfaPtr == nullptr) { LexerStateAndInput aLexerStateAndInput = transitionFromCurrMergeFromDfaPtr.getStateAndInput(); DeLOG(std::string{"::adding transition(["}.append(std::to_string(aLexerStateAndInput.getState())).append(", '").append(1, aLexerStateAndInput.getInput()).append("']->").append(1, nextMergeFromDfaPtr->getId()).append(") to dfa(").append(1, currMergeToDfaPtr->getId()).append(")\n").c_str()); StateAndInput<int,char> aStateAndInput(aLexerStateAndInput.getState(), aLexerStateAndInput.getInput(), transitionFromCurrMergeFromDfaPtr.getIsRanged()); currMergeToDfaPtr->add_next_dfa(aStateAndInput, nextMergeFromDfaPtr, currentMergeFromTransitionProperties); //Sanity check auto nextMergeToDfaPtrCandidateInfoSanity = currMergeToDfaPtr->getNextDfaForInput(si.getInput(), false); auto nextMergeToDfaPtrCandidatePropertiesSanity = nextMergeToDfaPtrCandidateInfoSanity; if (nextMergeToDfaPtrCandidateInfoSanity.first == nullptr) { perror("\nYeah, this is bad. After we just added our new transition to merged rep, we can't query for it. The effect of adding a new transition should be immediate (I don't know why it should ever not be...). Exiting.\n"); exit(EXIT_FAILURE); } } else { std::vector<LexerTransition> nextTransitions = nextMergeFromDfaPtr->getTransitions(); for (auto transitionFromNextMergeFromDfa : nextTransitions) { const auto si = transitionFromNextMergeFromDfa.getStateAndInput(); DeLOG(std::string{"Couldn't find an opening, pushing back job { to add ("}.append(std::to_string(si.getState())).append(",").append(1, si.getInput()).append(") from dfa-id(").append(std::to_string(nextMergeToDfaPtr->getId())).append(")\n").c_str()); std::tuple<lexer_dfa*, LexerTransition, std::vector<lexer_dfa*>> job(const_cast<lexer_dfa*>(nextMergeToDfaPtr), transitionFromNextMergeFromDfa, ptrsInMergeToAlreadyVisited); jobVector->push_back(job); } } } else { std::cout << "Undefined language specification: duplicate lexer words?" << std::endl; exit(1); } } delete jobVector; } delete jobQueue; std::cout << "Finished jobs!" << std::endl << std::endl; return start; }
const lexer_dfa* lexer_dfa::getNextDfa(const LexerStateAndInput& lexerStateAndInput) const { const StateAndInput<int,char> stateAndInput(lexerStateAndInput.getState(), lexerStateAndInput.getInput(), false); DONT _printInputHash(stateAndInput, "stateAndInput"); DeLOG(std::string{"\t_nextStates::size = "}.append(std::to_string(_nextStates.size())).append("\n").c_str()); DONT _printTransitions(); lexer_dfa* ret; std::unordered_map<StateAndInput<int,char>, lexer_dfa*, StateAndInputHashFunction, StateAndInputEquals>::const_iterator fetched = _nextStates.find(stateAndInput); //if we can't find anything, it may be possible we enountered //the special "ranged" stateAndInput - which is guaranteed to be mapped //to a unique index in hashmap (its in the formulae) if (fetched == _nextStates.end()) { ret = nullptr; char input = stateAndInput.getInput(); if (input == '0') { StateAndInput<int,char> rangedInput(stateAndInput.getState(), SI_NUMBERS_0, true); DONT std::cout << "\t\t"; DONT _printInputHash(rangedInput, "rangedInput"); std::unordered_map<StateAndInput<int,char>, lexer_dfa*, StateAndInputHashFunction, StateAndInputEquals>::const_iterator fetchedNumbers0 = _nextStates.find(rangedInput); if (fetchedNumbers0 != _nextStates.end()) { DONT std::cout << "\tFound rangedNumber! (0)" << std::endl; ret = fetchedNumbers0->second; } else { StateAndInput<int,char> rangedInput2(stateAndInput.getState(), SI_NUMBERS_0to9, true); DONT std::cout << "\t\t"; DONT _printInputHash(rangedInput2, "SI_NUMBERS_0to9"); std::unordered_map<StateAndInput<int,char>, lexer_dfa*, StateAndInputHashFunction, StateAndInputEquals>::const_iterator fetchedNumbers0to9 = _nextStates.find(rangedInput2); if (fetchedNumbers0to9 != _nextStates.end()) { DONT std::cout << "\trangedNumber:[0-9]" << std::endl; ret = fetchedNumbers0to9->second; } } } else if (input >= '1' && input <= '9') { StateAndInput<int,char> rangedInput0to9(stateAndInput.getState(), SI_NUMBERS_0to9, true); DONT std::cout << "\t\t"; DONT _printInputHash(rangedInput0to9, "rangedInputNumbers0to9"); std::unordered_map<StateAndInput<int,char>, lexer_dfa*, StateAndInputHashFunction, StateAndInputEquals>::const_iterator fetchedNumbers0to9 = _nextStates.find(rangedInput0to9); if (fetchedNumbers0to9 != _nextStates.end()) { DONT std::cout << "\tFound rangedNumber! ([0-9])" << std::endl; ret = fetchedNumbers0to9->second; } else { StateAndInput<int,char> rangedInput1to9(stateAndInput.getState(), SI_NUMBERS_1to9, true); DONT std::cout << "\t\t"; DONT _printInputHash(rangedInput1to9, "rangedInputNumbers1to9"); std::unordered_map<StateAndInput<int,char>, lexer_dfa*, StateAndInputHashFunction, StateAndInputEquals>::const_iterator fetchedNumbers1to9 = _nextStates.find(rangedInput1to9); if (fetchedNumbers1to9 != _nextStates.end()) { DONT std::cout << "\trangedNumber:[1-9]" << std::endl; ret = fetchedNumbers1to9->second; } } } else if (input >= 'a' && input <= 'z') { DONT std::cout << "\tChecking lowerase ranged" << std::endl; StateAndInput<int,char> rangedInput(stateAndInput.getState(), SI_CHARS_LOWER, true); DONT std::cout << "\t\t"; DONT _printInputHash(rangedInput, "rangedInput"); DONT std::cout << "\t\tlexer_dfa::getNextState(...): (state,input) = (" << stateAndInput.getState() << ", SI_CHARS_LOWER)" << std::endl; std::unordered_map<StateAndInput<int,char>, lexer_dfa*, StateAndInputHashFunction, StateAndInputEquals>::const_iterator fetchedCharsLower = _nextStates.find(rangedInput); if (fetchedCharsLower != _nextStates.end()) { DONT std::cout << "\tFound rangedChars:[a-z]" << std::endl; ret = fetchedCharsLower->second; } else { StateAndInput<int,char> rangedInput2(stateAndInput.getState(), SI_CHARS_ANY, true); DONT std::cout << "\t\t"; DONT _printInputHash(rangedInput2, "rangedInput2"); DONT std::cout << "\t\tlexer_dfa::getNextState(...): (state,input) = (" << stateAndInput.getState() << ", SI_CHARS_ANY)" << std::endl; std::unordered_map<StateAndInput<int,char>, lexer_dfa*, StateAndInputHashFunction, StateAndInputEquals>::const_iterator fetchedCharsAny = _nextStates.find(rangedInput2); if (fetchedCharsAny != _nextStates.end()) { DONT std::cout << "rangedChars:([a-z]|[A-Z])" << std::endl; ret = fetchedCharsAny->second; } } } else if (input >= 'A' && input <= 'Z') { DONT std::cout << "\tChecking uppercase ranged" << std::endl; StateAndInput<int,char> rangedInput(stateAndInput.getState(), SI_CHARS_UPPER, true); DONT std::cout << "\t\t"; DONT _printInputHash(rangedInput, "rangedInput"); DONT std::cout << "\t\tlexer_dfa::getNextState(...): (state,input) = (" << stateAndInput.getState() << ", SI_CHARS_UPPER)" << std::endl; std::unordered_map<StateAndInput<int,char>, lexer_dfa*, StateAndInputHashFunction, StateAndInputEquals>::const_iterator fetchedCharsUpper = _nextStates.find(rangedInput); if (fetchedCharsUpper != _nextStates.end()) { //std::cout << "\tFound rangedChars:[A-Z]" << std::endl; //commented in order to benchmark diff between ScanWords ret = fetchedCharsUpper->second; } else { StateAndInput<int,char> rangedInput2(stateAndInput.getState(), SI_CHARS_ANY, true); DONT std::cout << "\t\t"; DONT _printInputHash(rangedInput, "rangedInput"); DONT std::cout << "\t\tlexer_dfa::getNextState(...): (state,input) = (" << stateAndInput.getState() << ", SI_CHARS_ANY)" << std::endl; std::unordered_map<StateAndInput<int,char>, lexer_dfa*, StateAndInputHashFunction, StateAndInputEquals>::const_iterator fetchedCharsAny = _nextStates.find(rangedInput2); if (fetchedCharsAny != _nextStates.end()) { DONT std::cout << "rangedChars:([a-z]|[A-Z])" << std::endl; ret = fetchedCharsAny->second; } } } //if by now ret has not been set to soemething other than nullptr, we have one last restort in the empty char if (ret == nullptr) { //we check for 'anythingBut' before we finally check for empty string -- this is the going protocol for now if (_anythingButTransition != nullptr) { if (_anythingButTransition->getIsRanged()) { //todo: perform range checks for anything buts } else if (input != _anythingButTransition->getStateAndInput().getInput()) { ret = const_cast<lexer_dfa*>(_anythingButTransition->getDfaNode()); } } if (ret == nullptr) { //check case of empty char StateAndInput<int,char> stateAndEmptyCharInput(stateAndInput.getState(), '\0'); DONT std::cout << "\t\t"; DONT _printInputHash(stateAndEmptyCharInput, "stateAndEmptyInput"); std::unordered_map<StateAndInput<int,char>, lexer_dfa*, StateAndInputHashFunction, StateAndInputEquals>::const_iterator fetchedEmptyChar = _nextStates.find(stateAndEmptyCharInput); if (fetchedEmptyChar != _nextStates.end()) { DONT std::cout << "\tfound empty char!!!" << std::endl; ret = fetchedEmptyChar->second; } else { DONT std::cout << "\tkey not found" << std::endl; ret = nullptr; } } } } else { ret = fetched->second; } return ret; }