Beispiel #1
0
/**
 * One recursive look-ahead step of the inchworm extension.
 *
 * Marks `kmer` as visited, fetches the kmers that could extend it in `direction`,
 * recurses into every usable candidate and returns the child path that `compare`
 * ranks first, with `kmer` itself appended when `depth > 0`.
 *
 * Path layout: every level appends its own kmer AFTER the path chosen among its
 * children, so element 0 of the returned vector is the far end of the extension
 * and the last element is the kmer nearest to the seed.
 *
 * Tie handling: when the two top-ranked child paths have the same score but end at
 * different kmers, the candidates are explored again with the recursion cap raised
 * by one, for as long as the top path keeps getting longer.  If that stops helping,
 * NO child path is selected (the result then holds at most `kmer` itself).
 *
 * @param kcounter               kmer catalog queried for extension candidates.
 * @param direction              'F' searches forward; any other value searches in reverse.
 * @param kmer                   (kmer, count) pair to extend from.
 * @param visitor                kmers on the path currently being explored; a candidate is
 *                               un-visited again once its subtree has been explored.
 * @param eliminator             kmers that must not be entered again; this function adds to
 *                               it only when PACMAN is set and depth > 0.
 * @param inchworm_round         round number of the caller, used for progress output only.
 * @param depth                  current recursion depth; 0 denotes the seed kmer.
 * @param MIN_CONNECTIVITY_RATIO threshold forwarded to exceeds_min_connectivity().
 * @param max_recurse            depth from which no further candidates are explored.
 * @return the selected path and its cumulative count; an empty path with count 0 when
 *         `kmer` has a zero count or was already visited / eliminated.
 */
Path_n_count_pair IRKE::inchworm_step (KmerCounter& kcounter, char direction, Kmer_Occurence_Pair kmer, Kmer_visitor& visitor,
									   Kmer_visitor& eliminator, unsigned int inchworm_round, unsigned int depth, 
									   float MIN_CONNECTIVITY_RATIO, unsigned int max_recurse) {
	
	if (IRKE_COMMON::MONITOR >= 2) {
		cerr << "\rinchworm: " << string(1,direction) 
			 << " A:" << INCHWORM_ASSEMBLY_COUNTER << " "
			 << " rnd:" << inchworm_round << " D:" << depth << "         "; 
	}
	
	Path_n_count_pair best_path_n_pair;
	best_path_n_pair.second = 0; // explicit init of the cumulative count
	
	// base case: kmer has no count, is already on the current path, or was eliminated.
	if ( !kmer.second
		 || visitor.exists(kmer.first)
		 || eliminator.exists(kmer.first) ) {
		
		return(best_path_n_pair);
	}
	
	visitor.add(kmer.first);
	
	if (PACMAN && depth > 0) {
		eliminator.add(kmer.first);
	}
	
	if (depth < max_recurse) {
		
		vector<Kmer_Occurence_Pair> kmer_candidates;
		if (direction == 'F') {
			// forward search
			kmer_candidates = kcounter.get_forward_kmer_candidates(kmer.first);
		}
		else {
			// reverse search
			kmer_candidates = kcounter.get_reverse_kmer_candidates(kmer.first);
		}
		
		bool tie = true;
		unsigned int recurse_cap = max_recurse;  // raised by one each time a tie forces a deeper look
		unsigned int best_path_length = 0;       // length of the top path in the previous tie round
		
		while (tie) {
			
			vector<Path_n_count_pair> paths; // non-empty paths rooted at each usable candidate
			
			for (unsigned int i = 0; i < kmer_candidates.size(); i++) {
				Kmer_Occurence_Pair kmer_candidate = kmer_candidates[i];
				
				if (kmer_candidate.second
					&& !visitor.exists(kmer_candidate.first)  // must not un-visit (below) a kmer that belongs to the current path
					&& exceeds_min_connectivity(kcounter, kmer, kmer_candidate, MIN_CONNECTIVITY_RATIO) ) {
					
					// recursive call here for extension
					Path_n_count_pair p = inchworm_step(kcounter, direction, kmer_candidate, visitor, eliminator,
														inchworm_round, depth+1, MIN_CONNECTIVITY_RATIO, recurse_cap);
					
					// The callee returns an empty path when it hits its base case (e.g. the
					// candidate is already in the eliminator).  Such a path must not be ranked:
					// the tie test below reads first[0], which is out of bounds on an empty vector.
					if (! p.first.empty()) {
						paths.push_back(p);
					}
					
					visitor.erase(kmer_candidate.first); // un-visiting
				}
				
			} // end for kmer
			
			
			if (paths.size() > 1) {
				
				sort(paths.begin(), paths.end(), compare);
				
				// same cumulative coverage values for the two top-ranked paths?
				const bool same_score = (paths[0].second == paths[1].second);
				
				// first[0] is the far end of a path.  Not interested in breaking ties between
				// identically scoring paths that end up at the same kmer.
				if (same_score && paths[0].first[0] != paths[1].first[0]) {
					
					// got tie, two different paths and two different endpoints:
					if (IRKE_COMMON::MONITOR >= 3) {
						cerr << "Got tie! " << ", score: " << paths[0].second << ", recurse at: " << recurse_cap << endl;
						vector<unsigned int> v;
						cerr << reconstruct_path_sequence(kcounter, paths[0].first, v) << endl;
						cerr << reconstruct_path_sequence(kcounter, paths[1].first, v) << endl;
					}
					
					if (paths[0].first.size() > best_path_length) {
						// the top path grew since the last round: look one level deeper and retry.
						recurse_cap++;
						best_path_length = paths[0].first.size();
					}
					else {
						// not able to delve further into the graph: stop, selecting no child path.
						tie = false;
					}
				}
				else {
					// either no tie at all, or a tie between paths that join at the same kmer.
					if (same_score && IRKE_COMMON::MONITOR >= 3) {
						cerr << "Tied, but two different paths join to the same kmer.  Choosing first path arbitrarily." << endl;
					}
					tie = false;
					best_path_n_pair = paths[0];
				}
			}
			else if (paths.size() == 1) {
				tie = false;
				best_path_n_pair = paths[0];
			}
			else {
				// no extensions possible.
				tie = false;
			}
			
		} // end while tie
	}
	
	// add current kmer to path, as long as not the original seed kmer!
	if (depth > 0) {
		best_path_n_pair.first.push_back(kmer.first);
		best_path_n_pair.second += kmer.second;
	}
	
	return(best_path_n_pair);
}
Beispiel #2
0
/**
 * Recursive look-ahead step of the inchworm extension.
 *
 * Registers `kmer` with `visitor`, collects the kmers that may extend it in
 * `direction`, explores each usable one recursively and returns the selected child
 * path, with `kmer` appended to it when `depth > 0`.
 *
 * Returned paths are stored far-end first: each level appends its own kmer after the
 * path picked among its children, so element 0 is the endpoint of the extension.
 *
 * Selection among two or more child paths (after sorting with `compare`):
 *   - IRKE_COMMON::__DEVEL_no_greedy_extend set: a path is drawn with rand().
 *   - top two scores equal, endpoints differ: the candidates are re-explored with the
 *     look-ahead limit raised by one while the top path keeps growing; with
 *     __DEVEL_no_tie_breaking set, or once the limit reaches MAX_RECURSION_HARD_STOP,
 *     one of the two is drawn with rand(); when deepening stops helping, the
 *     first-ranked path is taken.
 *   - otherwise the first-ranked path is taken.
 *
 * Unlike the visited / eliminated test, a zero count on `kmer` itself is not checked here.
 *
 * @param kcounter               kmer catalog queried for extension candidates.
 * @param direction              'F' searches forward; any other value searches in reverse.
 * @param kmer                   (kmer, count) pair to extend from.
 * @param visitor                kmers on the path being explored; each candidate is erased
 *                               from it again after its subtree was explored.
 * @param eliminator             kmers that may not be entered again; extended only when
 *                               PACMAN is set and depth > 0.
 * @param inchworm_round         caller's round number, used for progress output only.
 * @param depth                  recursion depth, 0 for the seed kmer.
 * @param MIN_CONNECTIVITY_RATIO threshold passed on to exceeds_min_connectivity().
 * @param max_recurse            depth from which candidates are no longer explored.
 * @return selected path plus cumulative count; empty with count 0 if `kmer` was already
 *         visited or eliminated.
 */
Path_n_count_pair IRKE::inchworm_step(KmerCounter &kcounter,
                                      char direction,
                                      Kmer_Occurence_Pair kmer,
                                      Kmer_visitor &visitor,
                                      Kmer_visitor &eliminator,
                                      unsigned int inchworm_round,
                                      unsigned int depth,
                                      float MIN_CONNECTIVITY_RATIO,
                                      unsigned int max_recurse)
{
    if (IRKE_COMMON::MONITOR >= 2) {
        cerr << "\rinchworm: " << string(1, direction)
             << " A:" << INCHWORM_ASSEMBLY_COUNTER << " "
             << " rnd:" << inchworm_round << " D:" << depth << "         ";
    }

    Path_n_count_pair chosen;
    chosen.second = 0; // cumulative count starts at zero

    // Base case: nothing to do for a kmer that is on the current path or was eliminated.
    const bool already_handled = visitor.exists(kmer.first) || eliminator.exists(kmer.first);
    if (already_handled) {
        if (IRKE_COMMON::MONITOR >= 3) {
            cerr << "base case, already visited or kmer doesn't exist." << endl;
            cerr << kmer.first << " already visited or doesn't exist.  ending recursion at depth: " << depth << endl;
        }
        return chosen;
    }

    visitor.add(kmer.first);
    if (PACMAN && depth > 0) {
        eliminator.add(kmer.first);
    }

    if (depth < max_recurse) {

        vector<Kmer_Occurence_Pair> extensions;
        if (direction != 'F') {
            extensions = kcounter.get_reverse_kmer_candidates(kmer.first);
        }
        else {
            extensions = kcounter.get_forward_kmer_candidates(kmer.first);
        }

        if (IRKE_COMMON::MONITOR >= 3) {
            cerr << "Got " << extensions.size() << " kmer extension candidates." << endl;
        }

        unsigned int lookahead_limit = max_recurse; // depth cap handed to the recursive calls
        unsigned int longest_seen = 0;              // top path length in the previous tie round
        bool keep_deepening;

        // Explore all candidates; repeat with a larger look-ahead limit only while an
        // unresolved tie is still making progress.
        do {
            keep_deepening = false;

            vector<Path_n_count_pair> explored; // non-empty paths rooted at the candidates

            for (vector<Kmer_Occurence_Pair>::const_iterator it = extensions.begin();
                 it != extensions.end(); ++it) {

                Kmer_Occurence_Pair candidate = *it;

                if (!candidate.second) {
                    continue;
                }
                if (visitor.exists(candidate.first)) {
                    // it gets un-visited below, so it must not belong to the current path
                    continue;
                }
                if (!exceeds_min_connectivity(kcounter, kmer, candidate, MIN_CONNECTIVITY_RATIO)) {
                    continue;
                }

                Path_n_count_pair sub_path = inchworm_step(kcounter, direction, candidate,
                                                           visitor, eliminator, inchworm_round,
                                                           depth + 1, MIN_CONNECTIVITY_RATIO,
                                                           lookahead_limit);

                // only retain paths that include visited nodes.
                if (!sub_path.first.empty()) {
                    explored.push_back(sub_path);
                }

                visitor.erase(candidate.first); // un-visiting
            }

            if (explored.size() > 1) {

                sort(explored.begin(), explored.end(), compare);

                if (IRKE_COMMON::__DEVEL_no_greedy_extend) {
                    // pick a path at random
                    int rand_index = rand() % explored.size();
                    if (IRKE_COMMON::MONITOR) {
                        cerr << "IRKE_COMMON::__DEVEL_no_greedy_extend -- picking random path index: " << rand_index
                             << " from size(): " << explored.size() << endl;
                    }
                    chosen = explored[rand_index];
                }
                else {
                    // same cumulative coverage values for the two top-ranked paths?
                    const bool equal_scores = (explored[0].second == explored[1].second);

                    // element 0 is a path's endpoint; identically scoring paths that end
                    // at the same kmer are not worth a tie break.
                    const bool real_tie = equal_scores && explored[0].first[0] != explored[1].first[0];

                    if (real_tie) {

                        if (IRKE_COMMON::MONITOR >= 3) {
                            cerr << "Got tie! " << ", score: " << explored[0].second << ", recurse at: " << lookahead_limit
                                 << endl;
                            vector<unsigned int> v;
                            cerr << reconstruct_path_sequence(kcounter, explored[0].first, v) << endl;
                            cerr << reconstruct_path_sequence(kcounter, explored[1].first, v) << endl;
                        }

                        if (IRKE_COMMON::__DEVEL_no_tie_breaking || lookahead_limit >= MAX_RECURSION_HARD_STOP) {
                            int rand_index = rand() % 2;

                            if (IRKE_COMMON::MONITOR >= 2) {
                                cerr << "IRKE_COMMON::__DEVEL_no_tie_breaking, so picking path: " << rand_index
                                     << " at random." << endl;
                            }

                            chosen = explored[rand_index];
                        }
                        else if (explored[0].first.size() > longest_seen) {
                            // still making progress: look one level deeper and go around again.
                            lookahead_limit++;
                            longest_seen = explored[0].first.size();
                            keep_deepening = true;
                        }
                        else {
                            // deeper look-ahead no longer lengthens the path; pick one.
                            chosen = explored[0];
                        }
                    }
                    else {
                        if (equal_scores && IRKE_COMMON::MONITOR >= 3) {
                            cerr << "Tied, but two different paths join to the same kmer.  Choosing first path arbitrarily."
                                 << endl;
                        }
                        chosen = explored[0];
                    }
                }
            }
            else if (explored.size() == 1) {
                chosen = explored[0];
            }
            // else: no extensions possible, `chosen` stays empty.

        } while (keep_deepening);
    }

    // The seed kmer (depth 0) is never part of the returned path.
    if (depth > 0) {
        chosen.first.push_back(kmer.first);
        chosen.second += kmer.second;
    }

    return chosen;
}
Beispiel #3
0
/**
 * Extends a path from a seed kmer in one direction, one look-ahead round at a time.
 *
 * Each round un-visits the current seed, calls inchworm_step() at depth 0 with
 * MAX_RECURSION as the depth cap, and appends the returned kmers to the result
 * starting with the one nearest the seed (inchworm_step() returns them far-end first,
 * so the list is walked backwards).  With CRAWL set, at most CRAWL_LENGTH kmers are
 * appended per round.  The last appended kmer becomes the next seed.  The loop ends
 * when a round returns a path whose cumulative count is not greater than zero.
 *
 * @param kcounter          kmer catalog; also supplies the count of every appended kmer.
 * @param direction         passed through to inchworm_step().
 * @param kmer              seed kmer to start from.
 * @param visitor           visited set; every appended kmer is added to it.
 * @param min_connectivity  passed through to inchworm_step().
 * @return appended kmers in extension order and the sum of their counts.
 * @throws std::string when the number of rounds exceeds kcounter.size().
 */
Path_n_count_pair IRKE::inchworm (KmerCounter& kcounter, char direction, kmer_int_type_t kmer, Kmer_visitor& visitor, float min_connectivity) {
	
	Path_n_count_pair entire_path;
	
	unsigned int inchworm_round = 0;
	unsigned long num_total_kmers = kcounter.size();
	
	// Scratch set handed to inchworm_step(); emptied at the start of every round.
	Kmer_visitor eliminator(kcounter.get_kmer_length(), DOUBLE_STRANDED_MODE);
	
	for (;;) {
		
		++inchworm_round;
		eliminator.clear();
		
		if (inchworm_round > num_total_kmers) {
			throw(string ("Error, inchworm rounds have exceeded the number of possible seed kmers"));
		}
		
		if (IRKE_COMMON::MONITOR >= 3) {
			cerr << endl << "Inchworm round(" << string(1,direction) << "): " << inchworm_round << " searching kmer: " << kmer << endl;
			string kmer_str = kcounter.get_kmer_string(kmer);
			cerr << kcounter.describe_kmer(kmer_str) << endl;
		}
		
		// inchworm_step() returns immediately for a visited kmer, so the seed must be cleared first.
		visitor.erase(kmer);
		
		Kmer_Occurence_Pair seed_pair(kmer, kcounter.get_kmer_count(kmer));
		Path_n_count_pair lookahead = inchworm_step(kcounter, direction, seed_pair, visitor, eliminator,
													inchworm_round, 0, min_connectivity, MAX_RECURSION);
		
		if (! (lookahead.second > 0)) {
			// no extension possible
			break;
		}
		
		vector<kmer_int_type_t>& lookahead_kmers = lookahead.first;
		unsigned int lookahead_len = lookahead_kmers.size();
		
		// The kmer nearest the seed sits at the back of the list.
		int nearest_idx = lookahead_len - 1;
		int farthest_idx = 0;
		if (CRAWL) {
			farthest_idx = nearest_idx - CRAWL_LENGTH + 1;
			farthest_idx = (farthest_idx < 0) ? 0 : farthest_idx;
		}
		
		int idx = nearest_idx;
		while (idx >= farthest_idx) {
			kmer_int_type_t accepted = lookahead_kmers[idx];
			entire_path.first.push_back(accepted);
			visitor.add(accepted);
			entire_path.second += kcounter.get_kmer_count(accepted);
			--idx;
		}
		
		// continue from the kmer appended last
		kmer = entire_path.first.back();
	}
	
	if (IRKE_COMMON::MONITOR >= 3) {
		cerr << endl;
	}
	
	return(entire_path);
}
Beispiel #4
0
/**
 * Extends a path from a seed kmer in one direction, one look-ahead round at a time.
 *
 * Each round un-visits the current seed, calls inchworm_step() at depth 0 with
 * MAX_RECURSION as the depth cap, and appends the returned kmers to the result
 * starting with the one nearest the seed (the returned list is walked backwards).
 * With CRAWL set, at most CRAWL_LENGTH kmers are appended per round.  The kmer
 * appended last becomes the next seed, and the round's look-ahead score is added to
 * the cumulative score.
 *
 * A round counts as an extension when the look-ahead score is greater than zero, or,
 * with IRKE_COMMON::__DEVEL_zero_kmer_on_use set, when at least one kmer came back;
 * in that mode every appended kmer is also cleared in `kcounter`.  Otherwise the loop
 * ends.  With IRKE_COMMON::__DEVEL_rand_fracture set, each round first draws a random
 * number and returns the path built so far when it does not exceed
 * __DEVEL_rand_fracture_prob.
 *
 * @param kcounter          kmer catalog; modified only in __DEVEL_zero_kmer_on_use mode.
 * @param direction         passed through to inchworm_step().
 * @param kmer              seed kmer to start from.
 * @param visitor           visited set; every appended kmer is added to it.
 * @param min_connectivity  passed through to inchworm_step().
 * @return appended kmers in extension order and the accumulated look-ahead scores.
 * @throws std::string when the number of rounds exceeds kcounter.size().
 */
Path_n_count_pair IRKE::inchworm(KmerCounter &kcounter,
                                 char direction,
                                 kmer_int_type_t kmer,
                                 Kmer_visitor &visitor,
                                 float min_connectivity)
{
    Path_n_count_pair entire_path;
    entire_path.second = 0; // cumulative path coverage

    unsigned int inchworm_round = 0;
    unsigned long num_total_kmers = kcounter.size();

    // Scratch set handed to inchworm_step(); emptied at the start of every round.
    Kmer_visitor eliminator(kcounter.get_kmer_length(), DOUBLE_STRANDED_MODE);

    for (;;) {

        if (IRKE_COMMON::__DEVEL_rand_fracture) {
            // terminate extension with probability of __DEVEL_rand_fracture_prob
            const float prob_to_fracture = rand() / static_cast<float>(RAND_MAX);
            if (prob_to_fracture <= IRKE_COMMON::__DEVEL_rand_fracture_prob) {
                return entire_path;
            }
        }

        ++inchworm_round;
        eliminator.clear();

        if (inchworm_round > num_total_kmers) {
            throw (string("Error, inchworm rounds have exceeded the number of possible seed kmers"));
        }

        if (IRKE_COMMON::MONITOR >= 3) {
            cerr << endl << "Inchworm round(" << string(1, direction) << "): " << inchworm_round << " searching kmer: "
                 << kmer << endl;
            string kmer_str = kcounter.get_kmer_string(kmer);
            cerr << kcounter.describe_kmer(kmer_str) << endl;
        }

        // inchworm_step() returns immediately for a visited kmer, so the seed must be cleared first.
        visitor.erase(kmer);

        Kmer_Occurence_Pair seed_pair(kmer, kcounter.get_kmer_count(kmer));
        Path_n_count_pair lookahead = inchworm_step(kcounter, direction, seed_pair, visitor, eliminator,
                                                    inchworm_round, 0, min_connectivity, MAX_RECURSION);

        vector<kmer_int_type_t> &lookahead_kmers = lookahead.first;
        unsigned int lookahead_len = lookahead_kmers.size();

        const bool extended = (IRKE_COMMON::__DEVEL_zero_kmer_on_use && lookahead_len >= 1)
                              || lookahead.second > 0;
        if (!extended) {
            // no extension possible
            break;
        }

        // The kmer nearest the seed sits at the back of the list.
        int nearest_idx = lookahead_len - 1;
        int farthest_idx = 0;
        if (CRAWL) {
            farthest_idx = nearest_idx - CRAWL_LENGTH + 1;
            farthest_idx = (farthest_idx < 0) ? 0 : farthest_idx;
        }

        int idx = nearest_idx;
        while (idx >= farthest_idx) {
            kmer_int_type_t accepted = lookahead_kmers[idx];
            entire_path.first.push_back(accepted);
            visitor.add(accepted);

            // selected here, zero out:
            if (IRKE_COMMON::__DEVEL_zero_kmer_on_use) {
                kcounter.clear_kmer(accepted);
            }
            --idx;
        }

        // continue from the kmer appended last
        kmer = entire_path.first.back();

        // NOTE(review): the whole look-ahead score is added even when CRAWL appended only
        // part of the look-ahead path -- confirm that this is intended.
        entire_path.second += lookahead.second;
    }

    if (IRKE_COMMON::MONITOR >= 3) {
        cerr << "No extension possible." << endl << endl;
    }

    return entire_path;
}