/**
 * Choose the most abundant kmer from kmer_vec that also passes is_good_seed_kmer().
 *
 * @param kmer_vec          candidate kmers to choose from
 * @param kcounter          kmer catalog queried for each candidate's current count
 * @param min_connectivity  forwarded unchanged to is_good_seed_kmer()
 * @return the selected kmer; 0 when no candidate has a positive count and passes
 *         the seed check
 */
kmer_int_type_t IRKE::extract_best_seed(vector<kmer_int_type_t> &kmer_vec, KmerCounter &kcounter, float min_connectivity) {

    unsigned int kmer_length = kcounter.get_kmer_length();

    kmer_int_type_t top_seed = 0;
    unsigned int top_count = 0;

    for (vector<kmer_int_type_t>::const_iterator it = kmer_vec.begin(); it != kmer_vec.end(); ++it) {

        kmer_int_type_t candidate = *it;
        unsigned int candidate_count = kcounter.get_kmer_count(candidate);

        // abundance is compared first; the seed check only runs for candidates
        // that would actually replace the current best.
        if (candidate_count <= top_count) {
            continue;
        }
        if (!is_good_seed_kmer(candidate, candidate_count, kmer_length, min_connectivity)) {
            continue;
        }

        top_count = candidate_count;
        top_seed = candidate;
    }

    if (IRKE_COMMON::MONITOR >= 2) {
        cerr << "Parallel method found better seed: " << kcounter.get_kmer_string(top_seed)
             << " with count: " << top_count << endl;
    }

    return top_seed;
}
/**
 * Build a contig around a seed kmer by running inchworm() in the 'F' direction,
 * then in the 'R' direction, and joining both extensions around the seed.
 *
 * @param kmer              seed kmer
 * @param kcounter          kmer catalog
 * @param min_connectivity  forwarded to inchworm()
 * @param total_counts      [out] forward path count + reverse path count + the seed's
 *                          count as read on entry to this function
 * @param (unnamed bool)    accepted for interface compatibility; not used here
 * @return the joined path produced by _join_forward_n_reverse_paths()
 */
vector<kmer_int_type_t> IRKE::build_inchworm_contig_from_seed(kmer_int_type_t kmer, KmerCounter &kcounter, float min_connectivity, unsigned int &total_counts, bool) {

    // the seed's abundance is captured before any extension runs
    unsigned int seed_count = kcounter.get_kmer_count(kmer);

    unsigned int kmer_length = kcounter.get_kmer_length();
    const bool verbose = (IRKE_COMMON::MONITOR >= 2);

    Kmer_visitor visitor(kmer_length, DOUBLE_STRANDED_MODE);

    /* ---- extend to the right ---- */
    Path_n_count_pair forward_result = inchworm(kcounter, 'F', kmer, visitor, min_connectivity);
    vector<kmer_int_type_t> &forward_path = forward_result.first;

    // restart the visitor so that it holds exactly the selected forward path
    visitor.clear();

    if (verbose) {
        cerr << "Forward path contains: " << forward_path.size() << " kmers. " << endl;
    }

    for (size_t idx = 0; idx < forward_path.size(); ++idx) {
        kmer_int_type_t path_kmer = forward_path[idx];
        visitor.add(path_kmer);
        if (verbose) {
            cerr << "\tForward path kmer: " << kcounter.get_kmer_string(path_kmer) << endl;
        }
    }

    /* ---- extend to the left ---- */
    visitor.erase(kmer); // reset the seed

    Path_n_count_pair reverse_result = inchworm(kcounter, 'R', kmer, visitor, min_connectivity);
    vector<kmer_int_type_t> &reverse_path = reverse_result.first;

    if (verbose) {
        cerr << "Reverse path contains: " << reverse_path.size() << " kmers. " << endl;
        for (size_t idx = 0; idx < reverse_path.size(); ++idx) {
            cerr << "\tReverse path kmer: " << kcounter.get_kmer_string(reverse_path[idx]) << endl;
        }
    }

    total_counts = forward_result.second + reverse_result.second + seed_count;

    vector<kmer_int_type_t> joined_path = _join_forward_n_reverse_paths(reverse_path, kmer, forward_path);
    return joined_path;
}
/**
 * Repeatedly extend a path from a starting kmer in one direction.
 *
 * Each round calls inchworm_step() from the current kmer, appends the returned
 * kmers (walked from the back of the returned list to the front) to the running
 * path, marks them in the visitor, and continues from the last kmer appended.
 * The loop ends when a round returns a count that is not greater than zero.
 *
 * @param kcounter          kmer catalog
 * @param direction         direction flag, forwarded to inchworm_step()
 * @param kmer              kmer to start extending from
 * @param visitor           visited-kmer set; updated with every kmer appended
 * @param min_connectivity  forwarded to inchworm_step()
 * @return pair of (appended kmers, sum of their counts)
 * @throws string if the number of rounds exceeds kcounter.size() as read on entry
 */
Path_n_count_pair IRKE::inchworm (KmerCounter& kcounter, char direction, kmer_int_type_t kmer, Kmer_visitor& visitor, float min_connectivity) {

    Path_n_count_pair entire_path;

    unsigned long num_total_kmers = kcounter.size();
    Kmer_visitor eliminator(kcounter.get_kmer_length(), DOUBLE_STRANDED_MODE);

    for (unsigned int inchworm_round = 1; ; ++inchworm_round) {

        eliminator.clear();

        if (inchworm_round > num_total_kmers) {
            throw(string("Error, inchworm rounds have exceeded the number of possible seed kmers"));
        }

        if (IRKE_COMMON::MONITOR >= 3) {
            cerr << endl << "Inchworm round(" << string(1, direction) << "): " << inchworm_round
                 << " searching kmer: " << kmer << endl;
            string kmer_str = kcounter.get_kmer_string(kmer);
            cerr << kcounter.describe_kmer(kmer_str) << endl;
        }

        visitor.erase(kmer); // seed kmer must be not visited already.

        Kmer_Occurence_Pair kmer_pair(kmer, kcounter.get_kmer_count(kmer));
        Path_n_count_pair best_path = inchworm_step(kcounter, direction, kmer_pair, visitor, eliminator,
                                                    inchworm_round, 0, min_connectivity, MAX_RECURSION);

        if (!(best_path.second > 0)) {
            // no extension possible
            break;
        }

        // append in reverse order, so the extension starts just after the seed kmer
        vector<kmer_int_type_t>& step_kmers = best_path.first;
        unsigned int num_kmers = step_kmers.size();

        int first_index = num_kmers - 1;
        int last_index = 0;
        if (CRAWL) {
            // when crawling, only CRAWL_LENGTH kmers of this round are kept
            last_index = first_index - CRAWL_LENGTH + 1;
            if (last_index < 0) {
                last_index = 0;
            }
        }

        for (int pos = first_index; pos >= last_index; pos--) {
            kmer_int_type_t kmer_extend = step_kmers[pos];
            entire_path.first.push_back(kmer_extend);
            visitor.add(kmer_extend);
            entire_path.second += kcounter.get_kmer_count(kmer_extend);
        }

        // the next round continues from the last kmer appended
        kmer = entire_path.first.back();
    }

    if (IRKE_COMMON::MONITOR >= 3) {
        cerr << endl;
    }

    return entire_path;
}
void IRKE::compute_sequence_assemblies(KmerCounter& kcounter, float min_connectivity, unsigned int MIN_ASSEMBLY_LENGTH, unsigned int MIN_ASSEMBLY_COVERAGE, bool WRITE_COVERAGE, string COVERAGE_OUTPUT_FILENAME) { if (! got_sorted_kmers_flag) { stringstream error; error << stacktrace() << " Error, must populate_sorted_kmers_list() before computing sequence assemblies" << endl; throw(error.str()); } unsigned int kmer_length = kcounter.get_kmer_length(); ofstream coverage_writer; if (WRITE_COVERAGE) { coverage_writer.open(COVERAGE_OUTPUT_FILENAME.c_str()); } vector<Kmer_counter_map_iterator>& kmers = sorted_kmers; //kcounter.get_kmers_sort_descending_counts(); unsigned long init_size = kcounter.size(); // string s = "before.kmers"; // kcounter.dump_kmers_to_file(s); for (unsigned int i = 0; i < kmers.size(); i++) { // cerr << "round: " << i << endl; unsigned long kmer_counter_size = kcounter.size(); if (kmer_counter_size > init_size) { // string s = "after.kmers"; // kcounter.dump_kmers_to_file(s); stringstream error; error << stacktrace() << "Error, Kcounter size has grown from " << init_size << " to " << kmer_counter_size << endl; throw (error.str()); } kmer_int_type_t kmer = kmers[i]->first; unsigned int kmer_count = kmers[i]->second; if (kmer_count == 0) { continue; } if (IRKE_COMMON::MONITOR >= 2) { cerr << "SEED kmer: " << kcounter.get_kmer_string(kmer) << ", count: " << kmer_count << endl; } if (kmer == revcomp_val(kmer, kmer_length)) { // palindromic kmer, avoid palindromes as seeds if (IRKE_COMMON::MONITOR >= 2) { cerr << "SEED kmer: " << kcounter.get_kmer_string(kmer) << " is palidnromic. Skipping. 
" << endl; } continue; } if (kmer_count < MIN_SEED_COVERAGE) { if (IRKE_COMMON::MONITOR >= 2) { cerr << "-seed has insufficient coverage, skipping" << endl; } continue; } float entropy = compute_entropy(kmer, kmer_length); if (entropy < MIN_SEED_ENTROPY) { if (IRKE_COMMON::MONITOR >= 2) { cerr << "-skipping seed due to low entropy: " << entropy << endl; } continue; } /* Extend to the right */ Kmer_visitor visitor(kmer_length, DOUBLE_STRANDED_MODE); Path_n_count_pair selected_path_n_pair_forward = inchworm(kcounter, 'F', kmer, visitor, min_connectivity); visitor.clear(); // add selected path to visitor vector<kmer_int_type_t>& forward_path = selected_path_n_pair_forward.first; if (IRKE_COMMON::MONITOR >= 2) { cerr << "Forward path contains: " << forward_path.size() << " kmers. " << endl; } for (unsigned int i = 0; i < forward_path.size(); i++) { kmer_int_type_t kmer = forward_path[i]; visitor.add(kmer); if (IRKE_COMMON::MONITOR >= 2) { cerr << "\tForward path kmer: " << kcounter.get_kmer_string(kmer) << endl; } } /* Extend to the left */ visitor.erase(kmer); // reset the seed Path_n_count_pair selected_path_n_pair_reverse = inchworm(kcounter, 'R', kmer, visitor, min_connectivity); if (IRKE_COMMON::MONITOR >= 2) { vector<kmer_int_type_t>& reverse_path = selected_path_n_pair_reverse.first; cerr << "Reverse path contains: " << reverse_path.size() << " kmers. " << endl; for (unsigned int i = 0; i < reverse_path.size(); i++) { cerr << "\tReverse path kmer: " << kcounter.get_kmer_string(reverse_path[i]) << endl; } } unsigned int total_counts = selected_path_n_pair_forward.second + selected_path_n_pair_reverse.second + kcounter.get_kmer_count(kmer); vector<kmer_int_type_t>& reverse_path = selected_path_n_pair_reverse.first; vector<kmer_int_type_t> joined_path = _join_forward_n_reverse_paths(reverse_path, kmer, forward_path); // report sequence reconstructed from path. 
vector<unsigned int> assembly_base_coverage; string sequence = reconstruct_path_sequence(kcounter, joined_path, assembly_base_coverage); unsigned int avg_cov = static_cast<unsigned int> ( (float)total_counts/(sequence.length()-kcounter.get_kmer_length() +1) + 0.5); /* cout << "Inchworm-reconstructed sequence, length: " << sequence.length() << ", avgCov: " << avg_cov << " " << sequence << endl; */ if (sequence.length() >= MIN_ASSEMBLY_LENGTH && avg_cov >= MIN_ASSEMBLY_COVERAGE) { INCHWORM_ASSEMBLY_COUNTER++; stringstream headerstream; headerstream << ">a" << INCHWORM_ASSEMBLY_COUNTER << ";" << avg_cov << " K: " << kmer_length << " length: " << sequence.length(); string header = headerstream.str(); sequence = add_fasta_seq_line_breaks(sequence, 60); cout << header << endl << sequence << endl; if (WRITE_COVERAGE) { coverage_writer << header << endl; for (unsigned int i = 0; i < assembly_base_coverage.size(); i++) { coverage_writer << assembly_base_coverage[i]; if ( (i+1) % 30 == 0) { coverage_writer << endl; } else { coverage_writer << " "; } } coverage_writer << endl; } } // remove path for (unsigned int i = 0; i < joined_path.size(); i++) { kmer_int_type_t kmer = joined_path[i]; /* if (DEBUG) { cout << "\tpruning kmer: " << kmer << endl; } */ kcounter.clear_kmer(kmer); } /* if (DEBUG) { cout << "done pruning kmers." << endl; } */ } if (IRKE_COMMON::MONITOR) { cerr << endl; } if (WRITE_COVERAGE) { coverage_writer.close(); } // drop sorted kmer list as part of cleanup clear_sorted_kmers_list(); return; // end of runIRKE }
/**
 * Repeatedly extend a path from a starting kmer in one direction.
 *
 * Each round calls inchworm_step() from the current kmer, appends the returned
 * kmers (walked from the back of the returned list to the front) to the running
 * path, marks them in the visitor, adds the step's count to the running total and
 * continues from the last kmer appended.
 *
 * Development switches read from IRKE_COMMON:
 *  - __DEVEL_rand_fracture: before each round, return the path built so far with
 *    probability __DEVEL_rand_fracture_prob.
 *  - __DEVEL_zero_kmer_on_use: clear each appended kmer from kcounter immediately,
 *    and accept a step that returned at least one kmer even if its count is zero.
 *
 * @param kcounter          kmer catalog
 * @param direction         direction flag, forwarded to inchworm_step()
 * @param kmer              kmer to start extending from
 * @param visitor           visited-kmer set; updated with every kmer appended
 * @param min_connectivity  forwarded to inchworm_step()
 * @return pair of (appended kmers, accumulated step counts)
 * @throws string if the number of rounds exceeds kcounter.size() as read on entry
 */
Path_n_count_pair IRKE::inchworm(KmerCounter &kcounter, char direction, kmer_int_type_t kmer, Kmer_visitor &visitor, float min_connectivity) {

    Path_n_count_pair entire_path;
    entire_path.second = 0; // init cumulative path coverage

    unsigned long num_total_kmers = kcounter.size();
    Kmer_visitor eliminator(kcounter.get_kmer_length(), DOUBLE_STRANDED_MODE);

    for (unsigned int inchworm_round = 1; ; ++inchworm_round) {

        if (IRKE_COMMON::__DEVEL_rand_fracture) {
            // terminate extension with probability of __DEVEL_rand_fracture_prob
            float prob_to_fracture = rand() / (float) RAND_MAX;
            if (prob_to_fracture <= IRKE_COMMON::__DEVEL_rand_fracture_prob) {
                return entire_path;
            }
        }

        eliminator.clear();

        if (inchworm_round > num_total_kmers) {
            throw (string("Error, inchworm rounds have exceeded the number of possible seed kmers"));
        }

        if (IRKE_COMMON::MONITOR >= 3) {
            cerr << endl << "Inchworm round(" << string(1, direction) << "): " << inchworm_round
                 << " searching kmer: " << kmer << endl;
            string kmer_str = kcounter.get_kmer_string(kmer);
            cerr << kcounter.describe_kmer(kmer_str) << endl;
        }

        visitor.erase(kmer); // seed kmer must be not visited already.

        Kmer_Occurence_Pair kmer_pair(kmer, kcounter.get_kmer_count(kmer));
        Path_n_count_pair best_path = inchworm_step(kcounter, direction, kmer_pair, visitor, eliminator,
                                                    inchworm_round, 0, min_connectivity, MAX_RECURSION);

        vector<kmer_int_type_t> &step_kmers = best_path.first;
        unsigned int num_kmers = step_kmers.size();

        const bool step_accepted =
            (IRKE_COMMON::__DEVEL_zero_kmer_on_use && num_kmers >= 1) || best_path.second > 0;

        if (!step_accepted) {
            // no extension possible
            break;
        }

        // append in reverse order, so the extension starts just after the seed kmer
        int first_index = num_kmers - 1;
        int last_index = 0;
        if (CRAWL) {
            // when crawling, only CRAWL_LENGTH kmers of this round are kept
            last_index = first_index - CRAWL_LENGTH + 1;
            if (last_index < 0) {
                last_index = 0;
            }
        }

        for (int pos = first_index; pos >= last_index; pos--) {
            kmer_int_type_t kmer_extend = step_kmers[pos];

            entire_path.first.push_back(kmer_extend);
            visitor.add(kmer_extend);

            // selected here, zero out:
            if (IRKE_COMMON::__DEVEL_zero_kmer_on_use) {
                kcounter.clear_kmer(kmer_extend);
            }
        }

        // the next round continues from the last kmer appended
        kmer = entire_path.first.back();
        entire_path.second += best_path.second;
    }

    if (IRKE_COMMON::MONITOR >= 3) {
        cerr << "No extension possible." << endl << endl;
    }

    return entire_path;
}
void IRKE::compute_sequence_assemblies(KmerCounter &kcounter, float min_connectivity, unsigned int MIN_ASSEMBLY_LENGTH, unsigned int MIN_ASSEMBLY_COVERAGE, bool PARALLEL_IWORM, bool WRITE_COVERAGE, string COVERAGE_OUTPUT_FILENAME) { if (!got_sorted_kmers_flag) { stringstream error; error << stacktrace() << " Error, must populate_sorted_kmers_list() before computing sequence assemblies" << endl; throw (error.str()); } //vector<Kmer_counter_map_iterator>& kmers = sorted_kmers; vector<Kmer_Occurence_Pair> &kmers = sorted_kmers; // note, these are not actually sorted if PARALLEL_IWORM mode. unsigned long init_size = kcounter.size(); cerr << "Total kcounter hash size: " << init_size << " vs. sorted list size: " << kmers.size() << endl; unsigned int kmer_length = kcounter.get_kmer_length(); ofstream coverage_writer; if (WRITE_COVERAGE) { coverage_writer.open(COVERAGE_OUTPUT_FILENAME.c_str()); } // string s = "before.kmers"; // kcounter.dump_kmers_to_file(s); /* ----------------------------------------------------------- Two reconstruction modes. (bhaas, Jan-4-2014) 1. Original (not setting PARALLEL_IWORM): kmer list is sorted descendingly by abundance. Seed kmer is chosen as most abundant, and contig construction extends from this seed. 2. PARALLEL_IWORM: the kmer list is not sorted. A random kmer (just ordered by hash iterator, nothing special, probably not so random) is used as a seed for draft contig reconstruction. The most abundant kmer in the draft contig is chosen as a proper seed. An inchworm contig is then constructed using this new seed, and reported. So, in this case, the draft contig in the first phase is just used to find a better seed, from which the inchworm contig is then properly reconstructed. 
--------------------------------------------------------------- */ // try building an inchworm contig from each seed kmer int myTid; if (PARALLEL_IWORM) { omp_set_num_threads(IRKE_COMMON::NUM_THREADS); } else { omp_set_num_threads(1); // turn off multithreading for the contig building. } //----------------------------------------------------------------- // Prep writing to separate inchworm output files for each thread //----------------------------------------------------------------- vector<iworm_tmp_file> tmpfiles; int num_threads = omp_get_max_threads(); cerr << "num threads set to: " << num_threads << endl; for (int i = 0; i < num_threads; i++) { iworm_tmp_file tmpfile_struct; tmpfiles.push_back(tmpfile_struct); iworm_tmp_file &itmp = tmpfiles[i]; stringstream filename_constructor; filename_constructor << "tmp.iworm.fa.pid_" << getpid() << ".thread_" << i; itmp.tmp_filename = new char[100]; strcpy(itmp.tmp_filename, filename_constructor.str().c_str()); itmp.tmp_filename[filename_constructor.str().length()] = '\0'; itmp.fh = new ofstream(); itmp.fh->open(itmp.tmp_filename); cerr << "Done opening file. " << itmp.tmp_filename << endl; } //------------------- // Build contigs. //------------------- #pragma omp parallel for private (myTid) schedule (dynamic, 1000) for (unsigned int i = 0; i < kmers.size(); i++) { // cerr << "round: " << i << endl; myTid = omp_get_thread_num(); unsigned long kmer_counter_size = kcounter.size(); if (kmer_counter_size > init_size) { // string s = "after.kmers"; // kcounter.dump_kmers_to_file(s); stringstream error; error << stacktrace() << "Error, Kcounter size has grown from " << init_size << " to " << kmer_counter_size << endl; throw (error.str()); } //kmer_int_type_t kmer = kmers[i]->first; //unsigned int kmer_count = kmers[i]->second; kmer_int_type_t kmer = kmers[i].first; // unsigned int kmer_count = kmers[i].second; // NO!!! 
Use for sorting, but likely zeroed out in the hashtable after contig construction unsigned int kmer_count = kcounter.get_kmer_count(kmer); if (!is_good_seed_kmer(kmer, kmer_count, kmer_length, min_connectivity)) { continue; } // cout << "SEED kmer: " << kcounter.get_kmer_string(kmer) << ", count: " << kmer_count << endl; if (IRKE_COMMON::MONITOR >= 2) { cerr << "SEED kmer: " << kcounter.get_kmer_string(kmer) << ", count: " << kmer_count << endl; } if (IRKE_COMMON::MONITOR >= 2) { #pragma omp critical cerr << "Seed for thread: " << myTid << " is " << kcounter.get_kmer_string(kmer) << " with count: " << kmer_count << endl; } unsigned int total_counts; vector<kmer_int_type_t> joined_path = build_inchworm_contig_from_seed(kmer, kcounter, min_connectivity, total_counts, PARALLEL_IWORM); if (PARALLEL_IWORM && TWO_PHASE) { // get a new seed based on the draft contig // choose the 'good' kmer with highest abundance kmer_int_type_t new_seed = extract_best_seed(joined_path, kcounter, min_connectivity); if (kcounter.get_kmer_count(new_seed) == 0) { continue; // must have been zapped by another thread } joined_path = build_inchworm_contig_from_seed(new_seed, kcounter, min_connectivity, total_counts, PARALLEL_IWORM); } // report sequence reconstructed from path. vector<unsigned int> assembly_base_coverage; string sequence = reconstruct_path_sequence(kcounter, joined_path, assembly_base_coverage); unsigned int avg_cov = static_cast<unsigned int> ((float) total_counts / (sequence.length() - kcounter.get_kmer_length() + 1) + 0.5); /* cout << "Inchworm-reconstructed sequence, length: " << sequence.length() << ", avgCov: " << avg_cov << " " << sequence << endl; */ size_t contig_length = sequence.length(); if (contig_length >= MIN_ASSEMBLY_LENGTH && avg_cov >= MIN_ASSEMBLY_COVERAGE) { *(tmpfiles[myTid].fh) << total_counts << endl << avg_cov << endl << kmer_count << endl << sequence << endl; } // remove path if (IRKE_COMMON::__DEVEL_zero_kmer_on_use) { // dont forget the seed. 
The forward/reverse path kmers already cleared. kcounter.clear_kmer(kmer); } else { for (unsigned int i = 0; i < joined_path.size(); i++) { kmer_int_type_t kmer = joined_path[i]; kcounter.clear_kmer(kmer); } } } if (IRKE_COMMON::MONITOR) { cerr << endl; } if (WRITE_COVERAGE) { coverage_writer.close(); } // drop sorted kmer list as part of cleanup clear_sorted_kmers_list(); //------------------------------------------------------------------------------ // examine the contigs generated by the individual threads, remove redundancies. //------------------------------------------------------------------------------ map<unsigned long long, bool> seen_contig_already; for (unsigned int i = 0; i < tmpfiles.size(); i++) { tmpfiles[i].fh->close(); ifstream tmpreader(tmpfiles[i].tmp_filename); while (!tmpreader.eof()) { unsigned int total_counts; unsigned int avg_cov; unsigned int kmer_count; string sequence; tmpreader >> total_counts >> avg_cov >> kmer_count >> sequence; if (tmpreader.eof()) // apparently only happens on the read after the last line is read. 
break; //cerr << "Read sequence: " << sequence << endl; unsigned int contig_hash = generateHash(sequence); if (!seen_contig_already[contig_hash] ) { seen_contig_already[contig_hash] = true; INCHWORM_ASSEMBLY_COUNTER++; stringstream headerstream; headerstream << ">a" << INCHWORM_ASSEMBLY_COUNTER << ";" << avg_cov << " total_counts: " << total_counts //<< " Fpath: " << selected_path_n_pair_forward.second << " Rpath: " << selected_path_n_pair_reverse.second << " Seed: " << kmer_count << " K: " << kmer_length << " length: " << sequence.length(); string header = headerstream.str(); sequence = add_fasta_seq_line_breaks(sequence, 60); cout << header << endl << sequence << endl; } } // cleanup from earlier dynamic allocation delete (tmpfiles[i].fh); if (!IRKE_COMMON::KEEP_TMP_FILES) { remove(tmpfiles[i].tmp_filename); } delete (tmpfiles[i].tmp_filename); } /* if (WRITE_COVERAGE) { coverage_writer << header << endl; for (unsigned int i = 0; i < assembly_base_coverage.size(); i++) { coverage_writer << assembly_base_coverage[i]; if ( (i+1) % 30 == 0) { coverage_writer << endl; } else { coverage_writer << " "; } } coverage_writer << endl; } } */ return; // end of runIRKE }
/**
 * Read every record of a FASTA file and add its sequence to the kmer counter.
 *
 * Records are pulled from a single Fasta_reader inside an OpenMP parallel
 * region; each thread tallies the records it handled in its own slot of
 * record_counter. Sequences shorter than KMER_SIZE + 1 are counted as parsed
 * but not added to the counter.
 *
 * Progress goes to stderr: one line per record when IRKE_COMMON::MONITOR >= 4,
 * otherwise (MONITOR non-zero) a running total refreshed by thread 0 every 1000
 * of its own records. A single summary line is printed once all threads are done.
 *
 * @param kcounter        kmer counter that receives the sequences
 * @param fasta_filename  path of the FASTA file to read
 */
void populate_kmer_counter_from_reads (KmerCounter& kcounter, string& fasta_filename) {

    // one tally per potential thread; zero-initialised and released automatically
    vector<unsigned long> record_counter(omp_get_max_threads(), 0);

    cerr << "-storing Kmers..." << "\n";

    unsigned long start = time(NULL);

    Fasta_reader fasta_reader(fasta_filename);

    unsigned int entry_num = 0;

    #pragma omp parallel
    {
        int myTid = omp_get_thread_num();
        record_counter[myTid] = 0;

        while (fasta_reader.hasNext()) {

            Fasta_entry fe = fasta_reader.getNext();
            string accession = fe.get_accession();

            #pragma omp atomic
            entry_num++;

            record_counter[myTid]++;

            if (IRKE_COMMON::MONITOR >= 4) {
                cerr << "[" << entry_num << "] acc: " << accession << ", by thread no: " << myTid << "\n";
            }
            else if (IRKE_COMMON::MONITOR) {
                if (myTid == 0 && record_counter[myTid] % 1000 == 0) {
                    // running total for the progress display only; the other threads'
                    // tallies are read while they may still be counting
                    unsigned long parsed_so_far = record_counter[0];
                    for (int t = 1; t < omp_get_num_threads(); t++) {
                        parsed_so_far += record_counter[t];
                    }
                    cerr << "\r [" << parsed_so_far << "] sequences parsed. ";
                }
            }

            string seq = fe.get_sequence();

            if (seq.length() < KMER_SIZE + 1) {
                continue;
            }

            kcounter.add_sequence(seq);
        }
    }

    // Summary is produced once, after every thread has finished: the end time is
    // taken here and the total is summed from the per-thread tallies, so neither
    // value depends on whether a progress tick ever fired.
    unsigned long end = time(NULL);

    unsigned long sum = 0;
    for (size_t t = 0; t < record_counter.size(); t++) {
        sum += record_counter[t];
    }

    cerr << "\n" << " done parsing " << sum << " sequences, extracted " << kcounter.size()
         << " kmers, taking " << (end - start) << " seconds." << "\n";

    return;
}