/*
 * One-symbol extension step over a set of candidate sequences.
 *
 * Per the original note, this works on normalized and localized sequences
 * and patterns only; only the alphabet can be modified.
 *
 * For every sequence in `seq`, each character reported by
 * Normalizer::getInput() (after normalizing that sequence), plus an extra
 * trailing 'z', is tried as the next symbol. A symbol is kept when the
 * pattern, anchored at the start with '^', matches more characters than the
 * unextended sequence contains.
 *
 * seq: candidate sequences to extend.
 * pat: Perl-syntax regular expression; it is anchored here, so callers pass
 *      it without a leading '^'.
 *
 * Returns every accepted extension (original sequence plus the normalized
 * new symbol). The list is empty when nothing could be extended.
 *
 * A boost::regex_error thrown while searching is reported on cerr and that
 * symbol is skipped. An error thrown while compiling the pattern is not
 * caught here.
 */
list<string> extendSequence(const list<string>& seq, string pat) {
    std::list<std::string> extended;

    // Anchor the pattern so that a match always begins at the first symbol.
    pat = "^" + pat;
    boost::regex_constants::syntax_option_type syntax = boost::regex_constants::perl;
    boost::regex matcher;
    matcher.assign(pat, syntax);
    boost::smatch hit;

    for (std::list<std::string>::const_iterator cur = seq.begin(); cur != seq.end(); ++cur) {
        std::string base = *cur;

        // Normalizing the sequence first lets the normalizer report its input
        // symbols; 'z' is always appended as one more candidate.
        Normalizer n;
        n.norm(base);
        const std::string candidates = n.getInput() + "z";

        for (std::string::size_type pos = 0; pos < candidates.length(); ++pos) {
            const char symbol = candidates[pos];

            // The tilde run pads the probe text after the candidate symbol
            // (presumably so the remainder of the pattern has something to
            // consume -- TODO confirm).
            const std::string probe = base + symbol + "~~~~~~~~~~~~~~~~~~";
            DBG cout << "pattern " << pat << " against " << probe << endl;

            try {
                if (!boost::regex_search(probe, hit, matcher)) {
                    continue;
                }

                // The match has to reach past the existing sequence,
                // otherwise the candidate symbol contributed nothing.
                const std::string matched = hit[0];
                if (matched.length() <= base.length()) {
                    continue;
                }

                std::string bit(1, symbol);
                const std::string ext = base + n.norm(bit);
                DBG cout << " got something " << ext << endl;
                extended.push_back(ext);
            } catch (const boost::regex_error& regErr) {
                cerr << "regular expression failed: " << regErr.what() << endl;
            }
        }
    }

    return extended;
}
std::string sequenceToPattern(std::string seq) { checkSequence(seq); Normalizer n; std::string sub = ""; if (seq.length()>10) { sub = seq.substr(0,seq.length()-10); seq = seq.substr(seq.length()-10,seq.length()); } string pat = n.denorm(submain(n.norm(seq))); pat = sub + pat; return pat; }
std::string extendSequence(std::string seq, std::string pattern, int len) { srand (std::time(0)); checkSequence(seq); Normalizer n; int pruneLen = 100; seq = n.norm(seq); pattern = n.norm(pattern); string refPattern = pattern; pattern = localizePattern(pattern); DBG cout << "extend " << seq << " with " << pattern << endl; n.add("ABCDEFGHIJKLMNOPQRSTUVWXYZ"); list<string> lst; lst.push_back(seq); for (int k=0; k<len; k++) { lst = extendSequence(lst,pattern); list<string> lst2; for (list<string>::const_iterator it = lst.begin(); it!=lst.end(); it++) { string nextSeq = *it; string nextPat = refPattern; if (nextSeq.length()<=10) { nextPat = sequenceToPattern(nextSeq); } if (nextPat==refPattern) { lst2.push_back(nextSeq); } else { DBG cout << nextSeq << ": " << "mismatch " << nextPat << " versus " << refPattern << endl; } } if (lst2.size()>0) { lst = lst2; } if (lst.size()>pruneLen) { DBG cout << "NEED TO PRUNE" << endl; vector<string> v(lst.begin(),lst.end()); random_shuffle(v.begin(),v.end()); lst.clear(); lst = list<string>(v.begin(),v.begin()+pruneLen); //lst.erase((++(++(lst.begin()))),lst.end()); } DBG { cout << "possibilities: " << endl; for (list<string>::const_iterator it = lst.begin(); it!=lst.end(); it++) { cout << " -- " << n.denorm(*it) << endl; } } } for (list<string>::const_iterator it = lst.begin(); it!=lst.end(); it++) { DBG cout << " final possibility " << n.denorm(*it) << endl; } vector<string> v(lst.begin(),lst.end()); random_shuffle(v.begin(),v.end()); if (v.size()>0) { string result = n.denorm(v[0]); result = result.substr(seq.length(),result.length()); return result; } return ""; }