/**
 * One recursive step of the greedy inchworm extension.
 *
 * Starting from `kmer`, explores the candidate neighbour kmers in the given
 * direction (depth-first, up to `max_recurse` levels) and returns the best
 * scoring extension path found together with its cumulative kmer count.
 *
 * @param kcounter               kmer catalog; supplies the forward / reverse extension candidates.
 * @param direction              'F' for forward search; any other value searches in reverse.
 * @param kmer                   (kmer, count) pair to extend from; a zero count means "does not exist".
 * @param visitor                kmers on the path currently being explored. `kmer` is added on entry;
 *                               each explored candidate is erased again after its sub-search returns.
 * @param eliminator             kmers consumed during this round; when PACMAN is set, every non-seed
 *                               kmer (depth > 0) that is entered gets added here.
 * @param inchworm_round         round number of the caller; only reported in monitor output and passed down.
 * @param depth                  current recursion depth; 0 denotes the seed kmer.
 * @param MIN_CONNECTIVITY_RATIO threshold forwarded to exceeds_min_connectivity() for each candidate.
 * @param max_recurse            candidates are only expanded while depth < max_recurse.
 *
 * @return (path, score). The path lists kmers deepest-first (each level appends its own kmer after
 *         the path returned by its best child), so element 0 is the terminal kmer of the extension.
 *         The seed kmer (depth 0) is never included. An empty path with score 0 is returned when
 *         `kmer` has a zero count, was already visited, or was eliminated.
 */
Path_n_count_pair IRKE::inchworm_step (KmerCounter& kcounter, char direction, Kmer_Occurence_Pair kmer, Kmer_visitor& visitor,
                                       Kmer_visitor& eliminator, unsigned int inchworm_round, unsigned int depth,
                                       float MIN_CONNECTIVITY_RATIO, unsigned int max_recurse) {

    // cout << "inchworm_step" << endl;

    if (IRKE_COMMON::MONITOR >= 2) {
        cerr << "\rinchworm: " << string(1,direction)
             << " A:" << INCHWORM_ASSEMBLY_COUNTER << " "
             << " rnd:" << inchworm_round << " D:" << depth << " ";
    }

    // check to see if kmer exists.  If not, return empty container
    Path_n_count_pair best_path_n_pair;

    if ( !kmer.second
         || visitor.exists(kmer.first)     // visited
         || eliminator.exists(kmer.first)  // eliminated
         ) {
        // base case, already visited or kmer doesn't exist.
        return(best_path_n_pair);
    }

    visitor.add(kmer.first);

    if (PACMAN && depth > 0) {
        eliminator.add(kmer.first);
    }

    if (depth < max_recurse) {

        vector<Kmer_Occurence_Pair> kmer_candidates;
        if (direction == 'F') {
            // forward search
            kmer_candidates = kcounter.get_forward_kmer_candidates(kmer.first);
        }
        else {
            // reverse search
            kmer_candidates = kcounter.get_reverse_kmer_candidates(kmer.first);
        }

        // Ties between equally scoring paths are attacked by re-running the search
        // with a larger recursion cap, for as long as that yields longer paths.
        bool tie = true;
        unsigned int recurse_cap = max_recurse;
        unsigned int best_path_length = 0;

        while (tie) {

            vector<Path_n_count_pair> paths;

            for (unsigned int i = 0; i < kmer_candidates.size(); i++) {
                Kmer_Occurence_Pair kmer_candidate = kmer_candidates[i];

                if (kmer_candidate.second
                    && !visitor.exists(kmer_candidate.first) // avoid creating already visited kmers since they're unvisited below...
                    && exceeds_min_connectivity(kcounter, kmer, kmer_candidate, MIN_CONNECTIVITY_RATIO) ) {

                    // recursive call here for extension
                    Path_n_count_pair p = inchworm_step(kcounter, direction, kmer_candidate, visitor, eliminator,
                                                        inchworm_round, depth+1, MIN_CONNECTIVITY_RATIO, recurse_cap);

                    // The recursion returns an empty path when the candidate hits the base case
                    // (e.g. it was already eliminated). Such results must not be kept: the tie
                    // test below indexes first[0] of the two top paths, which is undefined
                    // behaviour on an empty vector.
                    if (! p.first.empty()) {
                        paths.push_back(p);
                    }

                    visitor.erase(kmer_candidate.first); // un-visiting
                }
            } // end for kmer

            if (paths.size() > 1) {

                // `compare` is defined elsewhere; paths[0] is treated as the best path after sorting.
                sort(paths.begin(), paths.end(), compare);

                if (paths[0].second == paths[1].second // same cumulative coverage values for both paths.
                    &&
                    // check last kmer to be sure they're different.
                    // Not interested in breaking ties between identically scoring paths that end up at the same kmer.
                    paths[0].first[0] != paths[1].first[0]
                    ) {

                    // got tie, two different paths and two different endpoints:
                    if (IRKE_COMMON::MONITOR >= 3) {
                        cerr << "Got tie! " << ", score: " << paths[0].second << ", recurse at: " << recurse_cap << endl;
                        vector<unsigned int> v;
                        cerr << reconstruct_path_sequence(kcounter, paths[0].first, v) << endl;
                        cerr << reconstruct_path_sequence(kcounter, paths[1].first, v) << endl;
                    }

                    if (paths[0].first.size() > best_path_length) {
                        // deeper search still lengthens the path: retry with a larger cap.
                        recurse_cap++;
                        best_path_length = paths[0].first.size();
                    }
                    else {
                        // not able to delve further into the graph; stop here.
                        // Note: no path is selected in this case, so this level contributes
                        // only its own kmer (or nothing at all when depth == 0).
                        tie = false;
                    }
                }
                else if ((paths[0].second == paths[1].second // same cumulative coverage values for both paths.
                          &&
                          paths[0].first[0] == paths[1].first[0] ) // same endpoint
                         ) {

                    if (IRKE_COMMON::MONITOR >= 3) {
                        cerr << "Tied, but two different paths join to the same kmer.  Choosing first path arbitrarily." << endl;
                    }
                    tie = false;
                    best_path_n_pair = paths[0];
                }
                else {
                    // no tie.
                    tie = false;
                    best_path_n_pair = paths[0];
                }
            }
            else if (paths.size() == 1) {
                tie = false;
                best_path_n_pair = paths[0];
            }
            else {
                // no extensions possible.
                tie = false;
            }

        } // end while tie
    }

    // add current kmer to path, as long as not the original seed kmer!
    if (depth > 0) {
        best_path_n_pair.first.push_back(kmer.first);
        best_path_n_pair.second += kmer.second;
    }

    return(best_path_n_pair);
}
/**
 * Recursive greedy extension step of the inchworm assembler.
 *
 * Marks `kmer` as visited, gathers its neighbour candidates in the requested
 * direction, recursively scores the extension rooted at each acceptable
 * candidate, and keeps the best one. When the two top paths tie on score but
 * end at different kmers, the search is repeated with a larger recursion cap
 * until the tie resolves, stops making progress, or the hard stop is reached.
 *
 * @param kcounter               kmer catalog queried for extension candidates.
 * @param direction              'F' searches forward; anything else searches in reverse.
 * @param kmer                   (kmer, count) pair being extended.
 * @param visitor                kmers on the path under exploration; candidates are un-visited
 *                               again once their sub-search has returned.
 * @param eliminator             receives every non-seed kmer entered while PACMAN is set.
 * @param inchworm_round         caller's round counter (monitor output only, passed through).
 * @param depth                  recursion depth, 0 for the seed.
 * @param MIN_CONNECTIVITY_RATIO forwarded to exceeds_min_connectivity().
 * @param max_recurse            candidates are only expanded while depth < max_recurse.
 *
 * @return (path, cumulative count). Kmers are stored deepest-first, so index 0 is the
 *         endpoint of the extension; the seed kmer is not part of the path. Returns an
 *         empty path with count 0 if `kmer` was already visited or eliminated.
 */
Path_n_count_pair IRKE::inchworm_step(KmerCounter &kcounter, char direction, Kmer_Occurence_Pair kmer,
                                      Kmer_visitor &visitor, Kmer_visitor &eliminator,
                                      unsigned int inchworm_round, unsigned int depth,
                                      float MIN_CONNECTIVITY_RATIO, unsigned int max_recurse) {

    if (IRKE_COMMON::MONITOR >= 2) {
        cerr << "\rinchworm: " << string(1, direction)
             << " A:" << INCHWORM_ASSEMBLY_COUNTER << " "
             << " rnd:" << inchworm_round
             << " D:" << depth << " ";
    }

    Path_n_count_pair chosen;
    chosen.second = 0; // explicit start value for the cumulative count

    // Base case: nothing to do for a kmer that is already on the path or was consumed.
    if (visitor.exists(kmer.first) || eliminator.exists(kmer.first)) {
        if (IRKE_COMMON::MONITOR >= 3) {
            cerr << "base case, already visited or kmer doesn't exist." << endl;
            cerr << kmer.first << " already visited or doesn't exist. ending recursion at depth: " << depth << endl;
        }
        return chosen;
    }

    visitor.add(kmer.first);

    if (PACMAN && depth > 0) {
        eliminator.add(kmer.first);
    }

    if (depth < max_recurse) {

        vector<Kmer_Occurence_Pair> next_kmers;
        if (direction != 'F') {
            next_kmers = kcounter.get_reverse_kmer_candidates(kmer.first);
        }
        else {
            next_kmers = kcounter.get_forward_kmer_candidates(kmer.first);
        }

        if (IRKE_COMMON::MONITOR >= 3) {
            cerr << "Got " << next_kmers.size() << " kmer extension candidates." << endl;
        }

        unsigned int depth_limit = max_recurse;   // raised by one each time a tie forces a deeper look
        unsigned int longest_tied_path = 0;       // length of the best path at the previous tie
        bool unresolved_tie;

        do {
            // Every pass ends the loop unless the tie branch below asks for another one.
            unresolved_tie = false;

            // Collect one scored path per acceptable candidate.
            vector<Path_n_count_pair> paths;

            for (vector<Kmer_Occurence_Pair>::size_type idx = 0; idx != next_kmers.size(); ++idx) {
                Kmer_Occurence_Pair next = next_kmers[idx];

                if (!next.second) {
                    continue; // candidate has no count
                }
                if (visitor.exists(next.first)) {
                    continue; // already on the current path; it would be wrongly un-visited below
                }
                if (!exceeds_min_connectivity(kcounter, kmer, next, MIN_CONNECTIVITY_RATIO)) {
                    continue;
                }

                Path_n_count_pair extension = inchworm_step(kcounter, direction, next, visitor, eliminator,
                                                            inchworm_round, depth + 1,
                                                            MIN_CONNECTIVITY_RATIO, depth_limit);

                // Empty results carry no visited node and are dropped.
                if (!extension.first.empty()) {
                    paths.push_back(extension);
                }

                visitor.erase(next.first); // un-visit so sibling candidates may pass through it
            }

            if (paths.empty()) {
                // no extensions possible; `chosen` stays empty.
            }
            else if (paths.size() == 1) {
                chosen = paths[0];
            }
            else {
                sort(paths.begin(), paths.end(), compare);

                if (IRKE_COMMON::__DEVEL_no_greedy_extend) {
                    // development mode: ignore the ranking and pick any path at random.
                    int rand_index = rand() % paths.size();
                    if (IRKE_COMMON::MONITOR) {
                        cerr << "IRKE_COMMON::__DEVEL_no_greedy_extend -- picking random path index: " << rand_index
                             << " from size(): " << paths.size() << endl;
                    }
                    chosen = paths[rand_index];
                }
                else {
                    const bool equal_scores = (paths[0].second == paths[1].second);

                    if (equal_scores && paths[0].first[0] != paths[1].first[0]) {
                        // Genuine tie: same score, different endpoints.
                        if (IRKE_COMMON::MONITOR >= 3) {
                            cerr << "Got tie! " << ", score: " << paths[0].second
                                 << ", recurse at: " << depth_limit << endl;
                            vector<unsigned int> v;
                            cerr << reconstruct_path_sequence(kcounter, paths[0].first, v) << endl;
                            cerr << reconstruct_path_sequence(kcounter, paths[1].first, v) << endl;
                        }

                        if (IRKE_COMMON::__DEVEL_no_tie_breaking || depth_limit >= MAX_RECURSION_HARD_STOP) {
                            // Tie breaking disabled or recursion budget exhausted: flip a coin.
                            int rand_index = rand() % 2;
                            if (IRKE_COMMON::MONITOR >= 2) {
                                cerr << "IRKE_COMMON::__DEVEL_no_tie_breaking, so picking path: " << rand_index
                                     << " at random." << endl;
                            }
                            chosen = paths[rand_index];
                        }
                        else if (paths[0].first.size() > longest_tied_path) {
                            // Looking deeper still lengthens the path: retry with a larger cap.
                            // This is the only branch that keeps the loop running.
                            depth_limit++;
                            longest_tied_path = paths[0].first.size();
                            unresolved_tie = true;
                        }
                        else {
                            // No further progress possible; settle for the first path.
                            chosen = paths[0];
                        }
                    }
                    else {
                        // Either a clear winner, or equal scores converging on the same endpoint.
                        if (equal_scores && IRKE_COMMON::MONITOR >= 3) {
                            cerr << "Tied, but two different paths join to the same kmer. Choosing first path arbitrarily."
                                 << endl;
                        }
                        chosen = paths[0];
                    }
                }
            }

        } while (unresolved_tie);
    }

    // Append this kmer behind its best extension, unless it is the original seed.
    if (depth > 0) {
        chosen.first.push_back(kmer.first);
        chosen.second += kmer.second;
    }

    return chosen;
}
/**
 * Grows a path from a seed kmer by repeatedly calling inchworm_step().
 *
 * Each round un-visits the current seed, asks inchworm_step() for the best
 * extension, appends that extension (or only its first CRAWL_LENGTH kmers when
 * CRAWL is set) to the overall path, and continues from the last appended kmer.
 * The loop ends when a round yields a path with a zero score.
 *
 * @param kcounter         kmer catalog used for counts and by inchworm_step().
 * @param direction        direction flag handed to inchworm_step().
 * @param kmer             seed kmer; the returned path starts just after it.
 * @param visitor          visited-kmer set; every appended kmer is added to it.
 * @param min_connectivity connectivity ratio handed to inchworm_step().
 *
 * @return the assembled path (in extension order) and the sum of the counts of its kmers.
 * @throws std::string if the number of rounds exceeds kcounter.size().
 */
Path_n_count_pair IRKE::inchworm (KmerCounter& kcounter, char direction, kmer_int_type_t kmer, Kmer_visitor& visitor, float min_connectivity) {

    Path_n_count_pair assembled;

    unsigned int round_no = 0;
    const unsigned long seed_limit = kcounter.size();

    Kmer_visitor eliminator(kcounter.get_kmer_length(), DOUBLE_STRANDED_MODE);

    for (;;) {

        ++round_no;
        eliminator.clear();

        // Guard against running more rounds than there are kmers.
        if (round_no > seed_limit) {
            throw(string ("Error, inchworm rounds have exceeded the number of possible seed kmers"));
        }

        if (IRKE_COMMON::MONITOR >= 3) {
            cerr << endl
                 << "Inchworm round(" << string(1,direction) << "): " << round_no
                 << " searching kmer: " << kmer << endl;
            string kmer_str = kcounter.get_kmer_string(kmer);
            cerr << kcounter.describe_kmer(kmer_str) << endl;
        }

        // The seed has to be unvisited, otherwise inchworm_step() stops immediately.
        visitor.erase(kmer);

        Kmer_Occurence_Pair seed(kmer, kcounter.get_kmer_count(kmer));
        Path_n_count_pair best_path = inchworm_step(kcounter, direction, seed, visitor, eliminator,
                                                    round_no, 0, min_connectivity, MAX_RECURSION);

        if (!(best_path.second > 0)) {
            // no extension possible
            break;
        }

        // best_path is stored deepest-first, so walk it backwards to start just after the seed.
        vector<kmer_int_type_t>& step_kmers = best_path.first;
        unsigned int num_kmers = step_kmers.size();

        int first_index = num_kmers - 1;
        int last_index = 0;

        if (CRAWL) {
            // Only take the first CRAWL_LENGTH kmers of the extension.
            last_index = first_index - CRAWL_LENGTH + 1;
            if (last_index < 0) {
                last_index = 0;
            }
        }

        int idx = first_index;
        while (idx >= last_index) {
            kmer_int_type_t kmer_extend = step_kmers[idx];
            assembled.first.push_back(kmer_extend);
            visitor.add(kmer_extend);
            assembled.second += kcounter.get_kmer_count(kmer_extend);
            --idx;
        }

        // The next round is seeded with the kmer appended last.
        kmer = assembled.first.back();
    }

    if (IRKE_COMMON::MONITOR >= 3) {
        cerr << endl;
    }

    return assembled;
}
/**
 * Grows a path from a seed kmer by repeatedly calling inchworm_step().
 *
 * Each round un-visits the current seed, asks inchworm_step() for the best
 * extension, appends that extension (or only its first CRAWL_LENGTH kmers when
 * CRAWL is set) to the overall path, and continues from the last appended kmer.
 * The loop ends when a round yields no usable extension, or, when
 * IRKE_COMMON::__DEVEL_rand_fracture is set, at random at the start of a round.
 *
 * @param kcounter         kmer catalog used by inchworm_step(); appended kmers are cleared from
 *                         it when IRKE_COMMON::__DEVEL_zero_kmer_on_use is set.
 * @param direction        direction flag handed to inchworm_step().
 * @param kmer             seed kmer; the returned path starts just after it.
 * @param visitor          visited-kmer set; every appended kmer is added to it.
 * @param min_connectivity connectivity ratio handed to inchworm_step().
 *
 * @return the assembled path (in extension order) and the accumulated score of all rounds.
 * @throws std::string if the number of rounds exceeds kcounter.size().
 */
Path_n_count_pair IRKE::inchworm(KmerCounter &kcounter, char direction, kmer_int_type_t kmer, Kmer_visitor &visitor,
                                 float min_connectivity) {

    Path_n_count_pair assembled;
    assembled.second = 0; // cumulative path coverage starts at zero

    unsigned int round_no = 0;
    const unsigned long seed_limit = kcounter.size();

    Kmer_visitor eliminator(kcounter.get_kmer_length(), DOUBLE_STRANDED_MODE);

    for (;;) {

        if (IRKE_COMMON::__DEVEL_rand_fracture) {
            // Development mode: end the extension here with probability __DEVEL_rand_fracture_prob.
            float prob_to_fracture = rand() / static_cast<float>(RAND_MAX);
            if (prob_to_fracture <= IRKE_COMMON::__DEVEL_rand_fracture_prob) {
                return assembled;
            }
        }

        ++round_no;
        eliminator.clear();

        // Guard against running more rounds than there are kmers.
        if (round_no > seed_limit) {
            throw (string("Error, inchworm rounds have exceeded the number of possible seed kmers"));
        }

        if (IRKE_COMMON::MONITOR >= 3) {
            cerr << endl
                 << "Inchworm round(" << string(1, direction) << "): " << round_no
                 << " searching kmer: " << kmer << endl;
            string kmer_str = kcounter.get_kmer_string(kmer);
            cerr << kcounter.describe_kmer(kmer_str) << endl;
        }

        // The seed has to be unvisited, otherwise inchworm_step() stops immediately.
        visitor.erase(kmer);

        Kmer_Occurence_Pair seed(kmer, kcounter.get_kmer_count(kmer));
        Path_n_count_pair best_path = inchworm_step(kcounter, direction, seed, visitor, eliminator,
                                                    round_no, 0, min_connectivity, MAX_RECURSION);

        vector<kmer_int_type_t> &step_kmers = best_path.first;
        unsigned int num_kmers = step_kmers.size();

        // With zero-on-use enabled a non-empty path is accepted regardless of its score;
        // otherwise the path needs a positive score.
        const bool extendable = (IRKE_COMMON::__DEVEL_zero_kmer_on_use && num_kmers >= 1)
                                || best_path.second > 0;
        if (!extendable) {
            // no extension possible
            break;
        }

        // best_path is stored deepest-first, so walk it backwards to start just after the seed.
        int first_index = num_kmers - 1;
        int last_index = 0;

        if (CRAWL) {
            // Only take the first CRAWL_LENGTH kmers of the extension.
            last_index = first_index - CRAWL_LENGTH + 1;
            if (last_index < 0) {
                last_index = 0;
            }
        }

        int idx = first_index;
        while (idx >= last_index) {
            kmer_int_type_t kmer_extend = step_kmers[idx];
            assembled.first.push_back(kmer_extend);
            visitor.add(kmer_extend);

            if (IRKE_COMMON::__DEVEL_zero_kmer_on_use) {
                // selected here, zero out
                kcounter.clear_kmer(kmer_extend);
            }
            --idx;
        }

        // The next round is seeded with the kmer appended last.
        kmer = assembled.first.back();

        // NOTE(review): the whole best_path score is added even when CRAWL limited how many
        // of its kmers were appended above -- confirm this is intended.
        assembled.second += best_path.second;
    }

    if (IRKE_COMMON::MONITOR >= 3) {
        cerr << "No extension possible." << endl << endl;
    }

    return assembled;
}