// Convenience overload: reconstructs the sequence for `path` using this
// object's own kcounter member, by forwarding to the three-argument overload.
// `cov_counter` is handed through unchanged to that overload.
string IRKE::reconstruct_path_sequence(vector<kmer_int_type_t>& path, vector<unsigned int>& cov_counter) {
    string sequence = reconstruct_path_sequence(kcounter, path, cov_counter);
    return sequence;
}
// Recursive extension step of the inchworm path search.
//
// Marks `kmer` as visited, collects its extension candidates in the requested
// direction, recursively scores every admissible candidate and returns the
// selected path together with its cumulative kmer count.
//
// Parameters:
//   kcounter               - supplies the extension candidates; also passed to
//                            exceeds_min_connectivity().
//   direction              - 'F' uses get_forward_kmer_candidates(); any other value
//                            uses get_reverse_kmer_candidates().
//   kmer                   - (kmer, count) pair to extend from.
//   visitor                - kmers on the path currently being explored. Candidates are
//                            erased from it again once their sub-search has returned.
//   eliminator             - when PACMAN is set, every kmer reached at depth > 0 is added.
//   inchworm_round         - only used in the progress message.
//   depth                  - recursion depth; 0 is the seed call.
//   MIN_CONNECTIVITY_RATIO - forwarded to exceeds_min_connectivity().
//   max_recurse            - candidates are explored only while depth < max_recurse.
//
// Returns: (path, cumulative count). Every level appends its own kmer after its
// sub-search has finished, so path[0] is the deepest kmer reached and the last
// element is the kmer closest to the seed. The seed itself (depth 0) is never added.
// An empty path is returned when the kmer has a zero count or is already in
// `visitor` or `eliminator`.
Path_n_count_pair IRKE::inchworm_step (KmerCounter& kcounter, char direction, Kmer_Occurence_Pair kmer, Kmer_visitor& visitor,
                                       Kmer_visitor& eliminator, unsigned int inchworm_round, unsigned int depth,
                                       float MIN_CONNECTIVITY_RATIO, unsigned int max_recurse) {

    // cout << "inchworm_step" << endl;

    if (IRKE_COMMON::MONITOR >= 2) {
        cerr << "\rinchworm: " << string(1,direction)
             << " A:" << INCHWORM_ASSEMBLY_COUNTER << " "
             << " rnd:" << inchworm_round << " D:" << depth << " ";
    }

    // check to see if kmer exists.  If not, return empty container
    Path_n_count_pair best_path_n_pair;

    if ( !kmer.second
         || visitor.exists(kmer.first)      // visited
         || eliminator.exists(kmer.first)   // eliminated
        ) {
        // base case, already visited or kmer doesn't exist.
        return(best_path_n_pair);
    }

    visitor.add(kmer.first);

    if (PACMAN && depth > 0) {
        // cerr << "pacman eliminated kmer: " << kmer << endl;
        eliminator.add(kmer.first);
    }

    if (depth < max_recurse) {

        vector<Kmer_Occurence_Pair> kmer_candidates;
        if (direction == 'F') {
            // forward search
            kmer_candidates = kcounter.get_forward_kmer_candidates(kmer.first);
        }
        else {
            // reverse search
            kmer_candidates = kcounter.get_reverse_kmer_candidates(kmer.first);
        }

        bool tie = true;
        unsigned int recurse_cap = max_recurse;
        unsigned int best_path_length = 0;

        // Keep re-exploring with a larger recursion cap for as long as the two best
        // paths tie and the extra depth still lengthens the best path.
        while (tie) {

            vector<Path_n_count_pair> paths;

            for (unsigned int i = 0; i < kmer_candidates.size(); i++) {
                Kmer_Occurence_Pair kmer_candidate = kmer_candidates[i];

                if (kmer_candidate.second
                    && !visitor.exists(kmer_candidate.first) // avoid creating already visited kmers since they're unvisited below...
                    && exceeds_min_connectivity(kcounter, kmer, kmer_candidate, MIN_CONNECTIVITY_RATIO)
                    ) {

                    // recursive call here for extension
                    Path_n_count_pair p = inchworm_step(kcounter, direction, kmer_candidate, visitor, eliminator,
                                                        inchworm_round, depth+1, MIN_CONNECTIVITY_RATIO, recurse_cap);

                    // The recursion returns an empty path when the candidate is already
                    // in the eliminator.  Such a result carries no extension, and keeping
                    // it would let the tie test below index element 0 of an empty vector.
                    if (!p.first.empty()) {
                        paths.push_back(p);
                    }

                    visitor.erase(kmer_candidate.first); // un-visiting
                }
            } // end for kmer

            if (paths.size() > 1) {

                // NOTE(review): `compare` is defined elsewhere; presumably it orders the
                // paths best-first by cumulative count -- confirm.
                sort(paths.begin(), paths.end(), compare);

                if (paths[0].second == paths[1].second   // same cumulative coverage values for both paths.
                    &&
                    // check last kmer to be sure they're different.
                    // Not interested in breaking ties between identically scoring paths that end up at the same kmer.
                    paths[0].first[0] != paths[1].first[0]
                    ) {

                    // got tie, two different paths and two different endpoints:
                    if (IRKE_COMMON::MONITOR >= 3) {
                        cerr << "Got tie! " << ", score: " << paths[0].second << ", recurse at: " << recurse_cap << endl;
                        vector<unsigned int> v;
                        cerr << reconstruct_path_sequence(kcounter, paths[0].first, v) << endl;
                        cerr << reconstruct_path_sequence(kcounter, paths[1].first, v) << endl;
                    }

                    if (paths[0].first.size() > best_path_length) {
                        // still gaining length: look one level deeper and try again.
                        recurse_cap++;
                        best_path_length = paths[0].first.size();
                    }
                    else {
                        // not able to delve further into the graph; stop here.
                        // No path is selected in this case, so best_path_n_pair stays empty.
                        tie = false;
                    }
                }
                else if ((paths[0].second == paths[1].second // same cumulative coverage values for both paths.
                          && paths[0].first[0] == paths[1].first[0] ) // same endpoint
                         ) {

                    if (IRKE_COMMON::MONITOR >= 3) {
                        cerr << "Tied, but two different paths join to the same kmer. Choosing first path arbitrarily." << endl;
                    }
                    tie = false;
                    best_path_n_pair = paths[0];
                }
                else {
                    // no tie.
                    tie = false;
                    best_path_n_pair = paths[0];
                }
            }
            else if (paths.size() == 1) {
                tie = false;
                best_path_n_pair = paths[0];
            }
            else {
                // no extensions possible.
                tie = false;
            }

        } // end while tie
    }

    // add current kmer to path, as long as not the original seed kmer!
    if (depth > 0) {
        best_path_n_pair.first.push_back(kmer.first);
        best_path_n_pair.second += kmer.second;
    }

    return(best_path_n_pair);
}
// One recursive step of the inchworm path extension.
//
// Registers `kmer` in `visitor` (and, when PACMAN is set and depth > 0, in
// `eliminator`), then -- while depth < max_recurse -- recursively explores every
// candidate that has a non-zero count, is not currently visited and passes
// exceeds_min_connectivity().  Only non-empty sub-paths are kept.
//
// Path selection when more than one sub-path exists (after sorting with `compare`):
//   * IRKE_COMMON::__DEVEL_no_greedy_extend: a random path is taken.
//   * top two paths have equal counts but different first elements (a tie):
//       - with __DEVEL_no_tie_breaking, or once recurse_cap has reached
//         MAX_RECURSION_HARD_STOP, one of the two is taken at random;
//       - otherwise, if the best path grew since the previous pass, the recursion
//         cap is raised by one and the candidates are explored again;
//       - otherwise the first path is taken.
//   * anything else: the first path is taken.
//
// direction: 'F' selects forward candidates, any other value reverse candidates.
// inchworm_round is only used in the progress message.
//
// Returns (path, cumulative count).  Each level appends its own kmer after its
// sub-search, so element 0 is the deepest kmer reached; the depth-0 seed is not
// appended.  An empty path with count 0 is returned if `kmer` is already in
// `visitor` or `eliminator`; the kmer's own count is not tested here.
Path_n_count_pair IRKE::inchworm_step(KmerCounter &kcounter, char direction, Kmer_Occurence_Pair kmer, Kmer_visitor &visitor,
                                      Kmer_visitor &eliminator, unsigned int inchworm_round, unsigned int depth,
                                      float MIN_CONNECTIVITY_RATIO, unsigned int max_recurse) {

    if (IRKE_COMMON::MONITOR >= 2) {
        cerr << "\rinchworm: " << string(1, direction)
             << " A:" << INCHWORM_ASSEMBLY_COUNTER << " "
             << " rnd:" << inchworm_round << " D:" << depth << " ";
    }

    Path_n_count_pair chosen;
    chosen.second = 0;

    // Base case: nothing to extend from a kmer that was already visited or eliminated.
    const bool already_handled = visitor.exists(kmer.first) || eliminator.exists(kmer.first);
    if (already_handled) {
        if (IRKE_COMMON::MONITOR >= 3) {
            cerr << "base case, already visited or kmer doesn't exist." << endl;
            cerr << kmer.first << " already visited or doesn't exist. ending recursion at depth: " << depth << endl;
        }
        return chosen;
    }

    visitor.add(kmer.first);
    if (PACMAN && depth > 0) {
        eliminator.add(kmer.first);
    }

    if (depth < max_recurse) {

        vector<Kmer_Occurence_Pair> candidates;
        if (direction != 'F') {
            candidates = kcounter.get_reverse_kmer_candidates(kmer.first);
        } else {
            candidates = kcounter.get_forward_kmer_candidates(kmer.first);
        }

        if (IRKE_COMMON::MONITOR >= 3) {
            cerr << "Got " << candidates.size() << " kmer extension candidates." << endl;
        }

        unsigned int recurse_cap = max_recurse;   // raised by one for every tie-breaking pass
        unsigned int longest_tied_path = 0;       // best path length seen in the previous tied pass
        bool explore_again;

        do {
            // Every pass is final unless an unresolved, still-improving tie says otherwise.
            explore_again = false;

            // Collect the sub-paths rooted at each admissible candidate.
            vector<Path_n_count_pair> paths;
            for (unsigned int idx = 0; idx < candidates.size(); idx++) {
                Kmer_Occurence_Pair candidate = candidates[idx];

                if (!candidate.second) {
                    continue;
                }
                if (visitor.exists(candidate.first)) {
                    continue; // on the current path already; it gets un-visited below otherwise
                }
                if (!exceeds_min_connectivity(kcounter, kmer, candidate, MIN_CONNECTIVITY_RATIO)) {
                    continue;
                }

                Path_n_count_pair extension = inchworm_step(kcounter, direction, candidate, visitor, eliminator,
                                                            inchworm_round, depth + 1, MIN_CONNECTIVITY_RATIO,
                                                            recurse_cap);
                if (extension.first.size() >= 1) {
                    paths.push_back(extension);
                }
                visitor.erase(candidate.first); // un-visit so sibling candidates may pass through it
            }

            if (paths.empty()) {
                // no extensions possible.
            }
            else if (paths.size() == 1) {
                chosen = paths[0];
            }
            else {
                sort(paths.begin(), paths.end(), compare);

                if (IRKE_COMMON::__DEVEL_no_greedy_extend) {
                    // pick a path at random
                    int rand_index = rand() % paths.size();
                    if (IRKE_COMMON::MONITOR) {
                        cerr << "IRKE_COMMON::__DEVEL_no_greedy_extend -- picking random path index: " << rand_index
                             << " from size(): " << paths.size() << endl;
                    }
                    chosen = paths[rand_index];
                }
                else {
                    // Every retained path is non-empty, so element 0 is always present.
                    const bool same_score = (paths[0].second == paths[1].second);
                    const bool same_endpoint = (paths[0].first[0] == paths[1].first[0]);

                    if (same_score && !same_endpoint) {
                        // genuine tie: two equally scoring paths with different endpoints.
                        if (IRKE_COMMON::MONITOR >= 3) {
                            cerr << "Got tie! " << ", score: " << paths[0].second << ", recurse at: " << recurse_cap << endl;
                            vector<unsigned int> v;
                            cerr << reconstruct_path_sequence(kcounter, paths[0].first, v) << endl;
                            cerr << reconstruct_path_sequence(kcounter, paths[1].first, v) << endl;
                        }

                        if (IRKE_COMMON::__DEVEL_no_tie_breaking || recurse_cap >= MAX_RECURSION_HARD_STOP) {
                            int rand_index = rand() % 2;
                            if (IRKE_COMMON::MONITOR >= 2) {
                                cerr << "IRKE_COMMON::__DEVEL_no_tie_breaking, so picking path: " << rand_index << " at random." << endl;
                            }
                            chosen = paths[rand_index];
                        }
                        else if (paths[0].first.size() > longest_tied_path) {
                            // still gaining length: this is the only case that repeats the loop.
                            recurse_cap++;
                            longest_tied_path = paths[0].first.size();
                            explore_again = true;
                        }
                        else {
                            // deeper search no longer helps; pick one.
                            chosen = paths[0];
                        }
                    }
                    else {
                        if (same_score) {
                            // equal scores and the same endpoint
                            if (IRKE_COMMON::MONITOR >= 3) {
                                cerr << "Tied, but two different paths join to the same kmer. Choosing first path arbitrarily." << endl;
                            }
                        }
                        chosen = paths[0];
                    }
                }
            }
        } while (explore_again);
    }

    // add current kmer to path, as long as not the original seed kmer!
    if (depth > 0) {
        chosen.first.push_back(kmer.first);
        chosen.second += kmer.second;
    }

    return chosen;
}
void IRKE::compute_sequence_assemblies(KmerCounter& kcounter, float min_connectivity, unsigned int MIN_ASSEMBLY_LENGTH, unsigned int MIN_ASSEMBLY_COVERAGE, bool WRITE_COVERAGE, string COVERAGE_OUTPUT_FILENAME) { if (! got_sorted_kmers_flag) { stringstream error; error << stacktrace() << " Error, must populate_sorted_kmers_list() before computing sequence assemblies" << endl; throw(error.str()); } unsigned int kmer_length = kcounter.get_kmer_length(); ofstream coverage_writer; if (WRITE_COVERAGE) { coverage_writer.open(COVERAGE_OUTPUT_FILENAME.c_str()); } vector<Kmer_counter_map_iterator>& kmers = sorted_kmers; //kcounter.get_kmers_sort_descending_counts(); unsigned long init_size = kcounter.size(); // string s = "before.kmers"; // kcounter.dump_kmers_to_file(s); for (unsigned int i = 0; i < kmers.size(); i++) { // cerr << "round: " << i << endl; unsigned long kmer_counter_size = kcounter.size(); if (kmer_counter_size > init_size) { // string s = "after.kmers"; // kcounter.dump_kmers_to_file(s); stringstream error; error << stacktrace() << "Error, Kcounter size has grown from " << init_size << " to " << kmer_counter_size << endl; throw (error.str()); } kmer_int_type_t kmer = kmers[i]->first; unsigned int kmer_count = kmers[i]->second; if (kmer_count == 0) { continue; } if (IRKE_COMMON::MONITOR >= 2) { cerr << "SEED kmer: " << kcounter.get_kmer_string(kmer) << ", count: " << kmer_count << endl; } if (kmer == revcomp_val(kmer, kmer_length)) { // palindromic kmer, avoid palindromes as seeds if (IRKE_COMMON::MONITOR >= 2) { cerr << "SEED kmer: " << kcounter.get_kmer_string(kmer) << " is palidnromic. Skipping. 
" << endl; } continue; } if (kmer_count < MIN_SEED_COVERAGE) { if (IRKE_COMMON::MONITOR >= 2) { cerr << "-seed has insufficient coverage, skipping" << endl; } continue; } float entropy = compute_entropy(kmer, kmer_length); if (entropy < MIN_SEED_ENTROPY) { if (IRKE_COMMON::MONITOR >= 2) { cerr << "-skipping seed due to low entropy: " << entropy << endl; } continue; } /* Extend to the right */ Kmer_visitor visitor(kmer_length, DOUBLE_STRANDED_MODE); Path_n_count_pair selected_path_n_pair_forward = inchworm(kcounter, 'F', kmer, visitor, min_connectivity); visitor.clear(); // add selected path to visitor vector<kmer_int_type_t>& forward_path = selected_path_n_pair_forward.first; if (IRKE_COMMON::MONITOR >= 2) { cerr << "Forward path contains: " << forward_path.size() << " kmers. " << endl; } for (unsigned int i = 0; i < forward_path.size(); i++) { kmer_int_type_t kmer = forward_path[i]; visitor.add(kmer); if (IRKE_COMMON::MONITOR >= 2) { cerr << "\tForward path kmer: " << kcounter.get_kmer_string(kmer) << endl; } } /* Extend to the left */ visitor.erase(kmer); // reset the seed Path_n_count_pair selected_path_n_pair_reverse = inchworm(kcounter, 'R', kmer, visitor, min_connectivity); if (IRKE_COMMON::MONITOR >= 2) { vector<kmer_int_type_t>& reverse_path = selected_path_n_pair_reverse.first; cerr << "Reverse path contains: " << reverse_path.size() << " kmers. " << endl; for (unsigned int i = 0; i < reverse_path.size(); i++) { cerr << "\tReverse path kmer: " << kcounter.get_kmer_string(reverse_path[i]) << endl; } } unsigned int total_counts = selected_path_n_pair_forward.second + selected_path_n_pair_reverse.second + kcounter.get_kmer_count(kmer); vector<kmer_int_type_t>& reverse_path = selected_path_n_pair_reverse.first; vector<kmer_int_type_t> joined_path = _join_forward_n_reverse_paths(reverse_path, kmer, forward_path); // report sequence reconstructed from path. 
vector<unsigned int> assembly_base_coverage; string sequence = reconstruct_path_sequence(kcounter, joined_path, assembly_base_coverage); unsigned int avg_cov = static_cast<unsigned int> ( (float)total_counts/(sequence.length()-kcounter.get_kmer_length() +1) + 0.5); /* cout << "Inchworm-reconstructed sequence, length: " << sequence.length() << ", avgCov: " << avg_cov << " " << sequence << endl; */ if (sequence.length() >= MIN_ASSEMBLY_LENGTH && avg_cov >= MIN_ASSEMBLY_COVERAGE) { INCHWORM_ASSEMBLY_COUNTER++; stringstream headerstream; headerstream << ">a" << INCHWORM_ASSEMBLY_COUNTER << ";" << avg_cov << " K: " << kmer_length << " length: " << sequence.length(); string header = headerstream.str(); sequence = add_fasta_seq_line_breaks(sequence, 60); cout << header << endl << sequence << endl; if (WRITE_COVERAGE) { coverage_writer << header << endl; for (unsigned int i = 0; i < assembly_base_coverage.size(); i++) { coverage_writer << assembly_base_coverage[i]; if ( (i+1) % 30 == 0) { coverage_writer << endl; } else { coverage_writer << " "; } } coverage_writer << endl; } } // remove path for (unsigned int i = 0; i < joined_path.size(); i++) { kmer_int_type_t kmer = joined_path[i]; /* if (DEBUG) { cout << "\tpruning kmer: " << kmer << endl; } */ kcounter.clear_kmer(kmer); } /* if (DEBUG) { cout << "done pruning kmers." << endl; } */ } if (IRKE_COMMON::MONITOR) { cerr << endl; } if (WRITE_COVERAGE) { coverage_writer.close(); } // drop sorted kmer list as part of cleanup clear_sorted_kmers_list(); return; // end of runIRKE }
void IRKE::compute_sequence_assemblies(KmerCounter &kcounter, float min_connectivity, unsigned int MIN_ASSEMBLY_LENGTH, unsigned int MIN_ASSEMBLY_COVERAGE, bool PARALLEL_IWORM, bool WRITE_COVERAGE, string COVERAGE_OUTPUT_FILENAME) { if (!got_sorted_kmers_flag) { stringstream error; error << stacktrace() << " Error, must populate_sorted_kmers_list() before computing sequence assemblies" << endl; throw (error.str()); } //vector<Kmer_counter_map_iterator>& kmers = sorted_kmers; vector<Kmer_Occurence_Pair> &kmers = sorted_kmers; // note, these are not actually sorted if PARALLEL_IWORM mode. unsigned long init_size = kcounter.size(); cerr << "Total kcounter hash size: " << init_size << " vs. sorted list size: " << kmers.size() << endl; unsigned int kmer_length = kcounter.get_kmer_length(); ofstream coverage_writer; if (WRITE_COVERAGE) { coverage_writer.open(COVERAGE_OUTPUT_FILENAME.c_str()); } // string s = "before.kmers"; // kcounter.dump_kmers_to_file(s); /* ----------------------------------------------------------- Two reconstruction modes. (bhaas, Jan-4-2014) 1. Original (not setting PARALLEL_IWORM): kmer list is sorted descendingly by abundance. Seed kmer is chosen as most abundant, and contig construction extends from this seed. 2. PARALLEL_IWORM: the kmer list is not sorted. A random kmer (just ordered by hash iterator, nothing special, probably not so random) is used as a seed for draft contig reconstruction. The most abundant kmer in the draft contig is chosen as a proper seed. An inchworm contig is then constructed using this new seed, and reported. So, in this case, the draft contig in the first phase is just used to find a better seed, from which the inchworm contig is then properly reconstructed. 
--------------------------------------------------------------- */ // try building an inchworm contig from each seed kmer int myTid; if (PARALLEL_IWORM) { omp_set_num_threads(IRKE_COMMON::NUM_THREADS); } else { omp_set_num_threads(1); // turn off multithreading for the contig building. } //----------------------------------------------------------------- // Prep writing to separate inchworm output files for each thread //----------------------------------------------------------------- vector<iworm_tmp_file> tmpfiles; int num_threads = omp_get_max_threads(); cerr << "num threads set to: " << num_threads << endl; for (int i = 0; i < num_threads; i++) { iworm_tmp_file tmpfile_struct; tmpfiles.push_back(tmpfile_struct); iworm_tmp_file &itmp = tmpfiles[i]; stringstream filename_constructor; filename_constructor << "tmp.iworm.fa.pid_" << getpid() << ".thread_" << i; itmp.tmp_filename = new char[100]; strcpy(itmp.tmp_filename, filename_constructor.str().c_str()); itmp.tmp_filename[filename_constructor.str().length()] = '\0'; itmp.fh = new ofstream(); itmp.fh->open(itmp.tmp_filename); cerr << "Done opening file. " << itmp.tmp_filename << endl; } //------------------- // Build contigs. //------------------- #pragma omp parallel for private (myTid) schedule (dynamic, 1000) for (unsigned int i = 0; i < kmers.size(); i++) { // cerr << "round: " << i << endl; myTid = omp_get_thread_num(); unsigned long kmer_counter_size = kcounter.size(); if (kmer_counter_size > init_size) { // string s = "after.kmers"; // kcounter.dump_kmers_to_file(s); stringstream error; error << stacktrace() << "Error, Kcounter size has grown from " << init_size << " to " << kmer_counter_size << endl; throw (error.str()); } //kmer_int_type_t kmer = kmers[i]->first; //unsigned int kmer_count = kmers[i]->second; kmer_int_type_t kmer = kmers[i].first; // unsigned int kmer_count = kmers[i].second; // NO!!! 
Use for sorting, but likely zeroed out in the hashtable after contig construction unsigned int kmer_count = kcounter.get_kmer_count(kmer); if (!is_good_seed_kmer(kmer, kmer_count, kmer_length, min_connectivity)) { continue; } // cout << "SEED kmer: " << kcounter.get_kmer_string(kmer) << ", count: " << kmer_count << endl; if (IRKE_COMMON::MONITOR >= 2) { cerr << "SEED kmer: " << kcounter.get_kmer_string(kmer) << ", count: " << kmer_count << endl; } if (IRKE_COMMON::MONITOR >= 2) { #pragma omp critical cerr << "Seed for thread: " << myTid << " is " << kcounter.get_kmer_string(kmer) << " with count: " << kmer_count << endl; } unsigned int total_counts; vector<kmer_int_type_t> joined_path = build_inchworm_contig_from_seed(kmer, kcounter, min_connectivity, total_counts, PARALLEL_IWORM); if (PARALLEL_IWORM && TWO_PHASE) { // get a new seed based on the draft contig // choose the 'good' kmer with highest abundance kmer_int_type_t new_seed = extract_best_seed(joined_path, kcounter, min_connectivity); if (kcounter.get_kmer_count(new_seed) == 0) { continue; // must have been zapped by another thread } joined_path = build_inchworm_contig_from_seed(new_seed, kcounter, min_connectivity, total_counts, PARALLEL_IWORM); } // report sequence reconstructed from path. vector<unsigned int> assembly_base_coverage; string sequence = reconstruct_path_sequence(kcounter, joined_path, assembly_base_coverage); unsigned int avg_cov = static_cast<unsigned int> ((float) total_counts / (sequence.length() - kcounter.get_kmer_length() + 1) + 0.5); /* cout << "Inchworm-reconstructed sequence, length: " << sequence.length() << ", avgCov: " << avg_cov << " " << sequence << endl; */ size_t contig_length = sequence.length(); if (contig_length >= MIN_ASSEMBLY_LENGTH && avg_cov >= MIN_ASSEMBLY_COVERAGE) { *(tmpfiles[myTid].fh) << total_counts << endl << avg_cov << endl << kmer_count << endl << sequence << endl; } // remove path if (IRKE_COMMON::__DEVEL_zero_kmer_on_use) { // dont forget the seed. 
The forward/reverse path kmers already cleared. kcounter.clear_kmer(kmer); } else { for (unsigned int i = 0; i < joined_path.size(); i++) { kmer_int_type_t kmer = joined_path[i]; kcounter.clear_kmer(kmer); } } } if (IRKE_COMMON::MONITOR) { cerr << endl; } if (WRITE_COVERAGE) { coverage_writer.close(); } // drop sorted kmer list as part of cleanup clear_sorted_kmers_list(); //------------------------------------------------------------------------------ // examine the contigs generated by the individual threads, remove redundancies. //------------------------------------------------------------------------------ map<unsigned long long, bool> seen_contig_already; for (unsigned int i = 0; i < tmpfiles.size(); i++) { tmpfiles[i].fh->close(); ifstream tmpreader(tmpfiles[i].tmp_filename); while (!tmpreader.eof()) { unsigned int total_counts; unsigned int avg_cov; unsigned int kmer_count; string sequence; tmpreader >> total_counts >> avg_cov >> kmer_count >> sequence; if (tmpreader.eof()) // apparently only happens on the read after the last line is read. 
break; //cerr << "Read sequence: " << sequence << endl; unsigned int contig_hash = generateHash(sequence); if (!seen_contig_already[contig_hash] ) { seen_contig_already[contig_hash] = true; INCHWORM_ASSEMBLY_COUNTER++; stringstream headerstream; headerstream << ">a" << INCHWORM_ASSEMBLY_COUNTER << ";" << avg_cov << " total_counts: " << total_counts //<< " Fpath: " << selected_path_n_pair_forward.second << " Rpath: " << selected_path_n_pair_reverse.second << " Seed: " << kmer_count << " K: " << kmer_length << " length: " << sequence.length(); string header = headerstream.str(); sequence = add_fasta_seq_line_breaks(sequence, 60); cout << header << endl << sequence << endl; } } // cleanup from earlier dynamic allocation delete (tmpfiles[i].fh); if (!IRKE_COMMON::KEEP_TMP_FILES) { remove(tmpfiles[i].tmp_filename); } delete (tmpfiles[i].tmp_filename); } /* if (WRITE_COVERAGE) { coverage_writer << header << endl; for (unsigned int i = 0; i < assembly_base_coverage.size(); i++) { coverage_writer << assembly_base_coverage[i]; if ( (i+1) % 30 == 0) { coverage_writer << endl; } else { coverage_writer << " "; } } coverage_writer << endl; } } */ return; // end of runIRKE }