void IRKE::compute_sequence_assemblies(KmerCounter& kcounter, float min_connectivity, unsigned int MIN_ASSEMBLY_LENGTH, unsigned int MIN_ASSEMBLY_COVERAGE, bool WRITE_COVERAGE, string COVERAGE_OUTPUT_FILENAME) { if (! got_sorted_kmers_flag) { stringstream error; error << stacktrace() << " Error, must populate_sorted_kmers_list() before computing sequence assemblies" << endl; throw(error.str()); } unsigned int kmer_length = kcounter.get_kmer_length(); ofstream coverage_writer; if (WRITE_COVERAGE) { coverage_writer.open(COVERAGE_OUTPUT_FILENAME.c_str()); } vector<Kmer_counter_map_iterator>& kmers = sorted_kmers; //kcounter.get_kmers_sort_descending_counts(); unsigned long init_size = kcounter.size(); // string s = "before.kmers"; // kcounter.dump_kmers_to_file(s); for (unsigned int i = 0; i < kmers.size(); i++) { // cerr << "round: " << i << endl; unsigned long kmer_counter_size = kcounter.size(); if (kmer_counter_size > init_size) { // string s = "after.kmers"; // kcounter.dump_kmers_to_file(s); stringstream error; error << stacktrace() << "Error, Kcounter size has grown from " << init_size << " to " << kmer_counter_size << endl; throw (error.str()); } kmer_int_type_t kmer = kmers[i]->first; unsigned int kmer_count = kmers[i]->second; if (kmer_count == 0) { continue; } if (IRKE_COMMON::MONITOR >= 2) { cerr << "SEED kmer: " << kcounter.get_kmer_string(kmer) << ", count: " << kmer_count << endl; } if (kmer == revcomp_val(kmer, kmer_length)) { // palindromic kmer, avoid palindromes as seeds if (IRKE_COMMON::MONITOR >= 2) { cerr << "SEED kmer: " << kcounter.get_kmer_string(kmer) << " is palidnromic. Skipping. " << endl; } continue; } if (kmer_count < MIN_SEED_COVERAGE) { if (IRKE_COMMON::MONITOR >= 2) { cerr << "-seed has insufficient coverage, skipping" << endl; } continue; } float entropy = compute_entropy(kmer, kmer_length); if (entropy < MIN_SEED_ENTROPY) { if (IRKE_COMMON::MONITOR >= 2) { cerr << "-skipping seed due to low entropy: " << entropy << endl; } continue; } /* Extend to the right */ Kmer_visitor visitor(kmer_length, DOUBLE_STRANDED_MODE); Path_n_count_pair selected_path_n_pair_forward = inchworm(kcounter, 'F', kmer, visitor, min_connectivity); visitor.clear(); // add selected path to visitor vector<kmer_int_type_t>& forward_path = selected_path_n_pair_forward.first; if (IRKE_COMMON::MONITOR >= 2) { cerr << "Forward path contains: " << forward_path.size() << " kmers. " << endl; } for (unsigned int i = 0; i < forward_path.size(); i++) { kmer_int_type_t kmer = forward_path[i]; visitor.add(kmer); if (IRKE_COMMON::MONITOR >= 2) { cerr << "\tForward path kmer: " << kcounter.get_kmer_string(kmer) << endl; } } /* Extend to the left */ visitor.erase(kmer); // reset the seed Path_n_count_pair selected_path_n_pair_reverse = inchworm(kcounter, 'R', kmer, visitor, min_connectivity); if (IRKE_COMMON::MONITOR >= 2) { vector<kmer_int_type_t>& reverse_path = selected_path_n_pair_reverse.first; cerr << "Reverse path contains: " << reverse_path.size() << " kmers. " << endl; for (unsigned int i = 0; i < reverse_path.size(); i++) { cerr << "\tReverse path kmer: " << kcounter.get_kmer_string(reverse_path[i]) << endl; } } unsigned int total_counts = selected_path_n_pair_forward.second + selected_path_n_pair_reverse.second + kcounter.get_kmer_count(kmer); vector<kmer_int_type_t>& reverse_path = selected_path_n_pair_reverse.first; vector<kmer_int_type_t> joined_path = _join_forward_n_reverse_paths(reverse_path, kmer, forward_path); // report sequence reconstructed from path. vector<unsigned int> assembly_base_coverage; string sequence = reconstruct_path_sequence(kcounter, joined_path, assembly_base_coverage); unsigned int avg_cov = static_cast<unsigned int> ( (float)total_counts/(sequence.length()-kcounter.get_kmer_length() +1) + 0.5); /* cout << "Inchworm-reconstructed sequence, length: " << sequence.length() << ", avgCov: " << avg_cov << " " << sequence << endl; */ if (sequence.length() >= MIN_ASSEMBLY_LENGTH && avg_cov >= MIN_ASSEMBLY_COVERAGE) { INCHWORM_ASSEMBLY_COUNTER++; stringstream headerstream; headerstream << ">a" << INCHWORM_ASSEMBLY_COUNTER << ";" << avg_cov << " K: " << kmer_length << " length: " << sequence.length(); string header = headerstream.str(); sequence = add_fasta_seq_line_breaks(sequence, 60); cout << header << endl << sequence << endl; if (WRITE_COVERAGE) { coverage_writer << header << endl; for (unsigned int i = 0; i < assembly_base_coverage.size(); i++) { coverage_writer << assembly_base_coverage[i]; if ( (i+1) % 30 == 0) { coverage_writer << endl; } else { coverage_writer << " "; } } coverage_writer << endl; } } // remove path for (unsigned int i = 0; i < joined_path.size(); i++) { kmer_int_type_t kmer = joined_path[i]; /* if (DEBUG) { cout << "\tpruning kmer: " << kmer << endl; } */ kcounter.clear_kmer(kmer); } /* if (DEBUG) { cout << "done pruning kmers." << endl; } */ } if (IRKE_COMMON::MONITOR) { cerr << endl; } if (WRITE_COVERAGE) { coverage_writer.close(); } // drop sorted kmer list as part of cleanup clear_sorted_kmers_list(); return; // end of runIRKE }
void IRKE::compute_sequence_assemblies(KmerCounter &kcounter, float min_connectivity, unsigned int MIN_ASSEMBLY_LENGTH, unsigned int MIN_ASSEMBLY_COVERAGE, bool PARALLEL_IWORM, bool WRITE_COVERAGE, string COVERAGE_OUTPUT_FILENAME) { if (!got_sorted_kmers_flag) { stringstream error; error << stacktrace() << " Error, must populate_sorted_kmers_list() before computing sequence assemblies" << endl; throw (error.str()); } //vector<Kmer_counter_map_iterator>& kmers = sorted_kmers; vector<Kmer_Occurence_Pair> &kmers = sorted_kmers; // note, these are not actually sorted if PARALLEL_IWORM mode. unsigned long init_size = kcounter.size(); cerr << "Total kcounter hash size: " << init_size << " vs. sorted list size: " << kmers.size() << endl; unsigned int kmer_length = kcounter.get_kmer_length(); ofstream coverage_writer; if (WRITE_COVERAGE) { coverage_writer.open(COVERAGE_OUTPUT_FILENAME.c_str()); } // string s = "before.kmers"; // kcounter.dump_kmers_to_file(s); /* ----------------------------------------------------------- Two reconstruction modes. (bhaas, Jan-4-2014) 1. Original (not setting PARALLEL_IWORM): kmer list is sorted descendingly by abundance. Seed kmer is chosen as most abundant, and contig construction extends from this seed. 2. PARALLEL_IWORM: the kmer list is not sorted. A random kmer (just ordered by hash iterator, nothing special, probably not so random) is used as a seed for draft contig reconstruction. The most abundant kmer in the draft contig is chosen as a proper seed. An inchworm contig is then constructed using this new seed, and reported. So, in this case, the draft contig in the first phase is just used to find a better seed, from which the inchworm contig is then properly reconstructed. --------------------------------------------------------------- */ // try building an inchworm contig from each seed kmer int myTid; if (PARALLEL_IWORM) { omp_set_num_threads(IRKE_COMMON::NUM_THREADS); } else { omp_set_num_threads(1); // turn off multithreading for the contig building. } //----------------------------------------------------------------- // Prep writing to separate inchworm output files for each thread //----------------------------------------------------------------- vector<iworm_tmp_file> tmpfiles; int num_threads = omp_get_max_threads(); cerr << "num threads set to: " << num_threads << endl; for (int i = 0; i < num_threads; i++) { iworm_tmp_file tmpfile_struct; tmpfiles.push_back(tmpfile_struct); iworm_tmp_file &itmp = tmpfiles[i]; stringstream filename_constructor; filename_constructor << "tmp.iworm.fa.pid_" << getpid() << ".thread_" << i; itmp.tmp_filename = new char[100]; strcpy(itmp.tmp_filename, filename_constructor.str().c_str()); itmp.tmp_filename[filename_constructor.str().length()] = '\0'; itmp.fh = new ofstream(); itmp.fh->open(itmp.tmp_filename); cerr << "Done opening file. " << itmp.tmp_filename << endl; } //------------------- // Build contigs. //------------------- #pragma omp parallel for private (myTid) schedule (dynamic, 1000) for (unsigned int i = 0; i < kmers.size(); i++) { // cerr << "round: " << i << endl; myTid = omp_get_thread_num(); unsigned long kmer_counter_size = kcounter.size(); if (kmer_counter_size > init_size) { // string s = "after.kmers"; // kcounter.dump_kmers_to_file(s); stringstream error; error << stacktrace() << "Error, Kcounter size has grown from " << init_size << " to " << kmer_counter_size << endl; throw (error.str()); } //kmer_int_type_t kmer = kmers[i]->first; //unsigned int kmer_count = kmers[i]->second; kmer_int_type_t kmer = kmers[i].first; // unsigned int kmer_count = kmers[i].second; // NO!!! Use for sorting, but likely zeroed out in the hashtable after contig construction unsigned int kmer_count = kcounter.get_kmer_count(kmer); if (!is_good_seed_kmer(kmer, kmer_count, kmer_length, min_connectivity)) { continue; } // cout << "SEED kmer: " << kcounter.get_kmer_string(kmer) << ", count: " << kmer_count << endl; if (IRKE_COMMON::MONITOR >= 2) { cerr << "SEED kmer: " << kcounter.get_kmer_string(kmer) << ", count: " << kmer_count << endl; } if (IRKE_COMMON::MONITOR >= 2) { #pragma omp critical cerr << "Seed for thread: " << myTid << " is " << kcounter.get_kmer_string(kmer) << " with count: " << kmer_count << endl; } unsigned int total_counts; vector<kmer_int_type_t> joined_path = build_inchworm_contig_from_seed(kmer, kcounter, min_connectivity, total_counts, PARALLEL_IWORM); if (PARALLEL_IWORM && TWO_PHASE) { // get a new seed based on the draft contig // choose the 'good' kmer with highest abundance kmer_int_type_t new_seed = extract_best_seed(joined_path, kcounter, min_connectivity); if (kcounter.get_kmer_count(new_seed) == 0) { continue; // must have been zapped by another thread } joined_path = build_inchworm_contig_from_seed(new_seed, kcounter, min_connectivity, total_counts, PARALLEL_IWORM); } // report sequence reconstructed from path. vector<unsigned int> assembly_base_coverage; string sequence = reconstruct_path_sequence(kcounter, joined_path, assembly_base_coverage); unsigned int avg_cov = static_cast<unsigned int> ((float) total_counts / (sequence.length() - kcounter.get_kmer_length() + 1) + 0.5); /* cout << "Inchworm-reconstructed sequence, length: " << sequence.length() << ", avgCov: " << avg_cov << " " << sequence << endl; */ size_t contig_length = sequence.length(); if (contig_length >= MIN_ASSEMBLY_LENGTH && avg_cov >= MIN_ASSEMBLY_COVERAGE) { *(tmpfiles[myTid].fh) << total_counts << endl << avg_cov << endl << kmer_count << endl << sequence << endl; } // remove path if (IRKE_COMMON::__DEVEL_zero_kmer_on_use) { // dont forget the seed. The forward/reverse path kmers already cleared. kcounter.clear_kmer(kmer); } else { for (unsigned int i = 0; i < joined_path.size(); i++) { kmer_int_type_t kmer = joined_path[i]; kcounter.clear_kmer(kmer); } } } if (IRKE_COMMON::MONITOR) { cerr << endl; } if (WRITE_COVERAGE) { coverage_writer.close(); } // drop sorted kmer list as part of cleanup clear_sorted_kmers_list(); //------------------------------------------------------------------------------ // examine the contigs generated by the individual threads, remove redundancies. //------------------------------------------------------------------------------ map<unsigned long long, bool> seen_contig_already; for (unsigned int i = 0; i < tmpfiles.size(); i++) { tmpfiles[i].fh->close(); ifstream tmpreader(tmpfiles[i].tmp_filename); while (!tmpreader.eof()) { unsigned int total_counts; unsigned int avg_cov; unsigned int kmer_count; string sequence; tmpreader >> total_counts >> avg_cov >> kmer_count >> sequence; if (tmpreader.eof()) // apparently only happens on the read after the last line is read. break; //cerr << "Read sequence: " << sequence << endl; unsigned int contig_hash = generateHash(sequence); if (!seen_contig_already[contig_hash] ) { seen_contig_already[contig_hash] = true; INCHWORM_ASSEMBLY_COUNTER++; stringstream headerstream; headerstream << ">a" << INCHWORM_ASSEMBLY_COUNTER << ";" << avg_cov << " total_counts: " << total_counts //<< " Fpath: " << selected_path_n_pair_forward.second << " Rpath: " << selected_path_n_pair_reverse.second << " Seed: " << kmer_count << " K: " << kmer_length << " length: " << sequence.length(); string header = headerstream.str(); sequence = add_fasta_seq_line_breaks(sequence, 60); cout << header << endl << sequence << endl; } } // cleanup from earlier dynamic allocation delete (tmpfiles[i].fh); if (!IRKE_COMMON::KEEP_TMP_FILES) { remove(tmpfiles[i].tmp_filename); } delete (tmpfiles[i].tmp_filename); } /* if (WRITE_COVERAGE) { coverage_writer << header << endl; for (unsigned int i = 0; i < assembly_base_coverage.size(); i++) { coverage_writer << assembly_base_coverage[i]; if ( (i+1) % 30 == 0) { coverage_writer << endl; } else { coverage_writer << " "; } } coverage_writer << endl; } } */ return; // end of runIRKE }