/**
 * Grow a single inchworm contig outward from a seed kmer.
 *
 * The seed is first extended with inchworm() in the 'F' direction. The
 * visitor is then cleared and re-populated with exactly the kmers of that
 * forward path, the seed is erased from it again, and inchworm() is run in
 * the 'R' direction using that visitor.
 *
 * @param kmer              seed kmer to extend from
 * @param kcounter          kmer counter; queried for the kmer length and the
 *                          seed's count, and handed to inchworm()
 * @param min_connectivity  forwarded unchanged to both inchworm() calls
 * @param total_counts      [out] forward path count + reverse path count +
 *                          the seed's count (read before any extension)
 * @param (unnamed bool)    accepted for interface compatibility; not used here
 * @return the result of _join_forward_n_reverse_paths(reverse path, seed,
 *         forward path)
 */
vector<kmer_int_type_t> IRKE::build_inchworm_contig_from_seed(kmer_int_type_t kmer, KmerCounter &kcounter, float min_connectivity, unsigned int &total_counts, bool) {

    const bool verbose = (IRKE_COMMON::MONITOR >= 2);

    // The seed's own count is sampled up front, before either extension runs.
    const unsigned int seed_count = kcounter.get_kmer_count(kmer);
    const unsigned int k = kcounter.get_kmer_length();

    Kmer_visitor visitor(k, DOUBLE_STRANDED_MODE);

    // ---- extend to the right ----
    Path_n_count_pair forward_result = inchworm(kcounter, 'F', kmer, visitor, min_connectivity);

    // Start the visitor over so that it holds only the selected forward path.
    visitor.clear();

    vector<kmer_int_type_t> &forward_path = forward_result.first;

    if (verbose) {
        cerr << "Forward path contains: " << forward_path.size() << " kmers. " << endl;
    }

    for (size_t fwd_idx = 0; fwd_idx < forward_path.size(); ++fwd_idx) {
        kmer_int_type_t path_kmer = forward_path[fwd_idx];
        visitor.add(path_kmer);

        if (verbose) {
            cerr << "\tForward path kmer: " << kcounter.get_kmer_string(path_kmer) << endl;
        }
    }

    // ---- extend to the left ----
    visitor.erase(kmer); // reset the seed

    Path_n_count_pair reverse_result = inchworm(kcounter, 'R', kmer, visitor, min_connectivity);
    vector<kmer_int_type_t> &reverse_path = reverse_result.first;

    if (verbose) {
        cerr << "Reverse path contains: " << reverse_path.size() << " kmers. " << endl;
        for (size_t rev_idx = 0; rev_idx < reverse_path.size(); ++rev_idx) {
            cerr << "\tReverse path kmer: " << kcounter.get_kmer_string(reverse_path[rev_idx]) << endl;
        }
    }

    total_counts = forward_result.second + reverse_result.second + seed_count;

    return _join_forward_n_reverse_paths(reverse_path, kmer, forward_path);
}
void IRKE::compute_sequence_assemblies(KmerCounter& kcounter, float min_connectivity, unsigned int MIN_ASSEMBLY_LENGTH, unsigned int MIN_ASSEMBLY_COVERAGE, bool WRITE_COVERAGE, string COVERAGE_OUTPUT_FILENAME) { if (! got_sorted_kmers_flag) { stringstream error; error << stacktrace() << " Error, must populate_sorted_kmers_list() before computing sequence assemblies" << endl; throw(error.str()); } unsigned int kmer_length = kcounter.get_kmer_length(); ofstream coverage_writer; if (WRITE_COVERAGE) { coverage_writer.open(COVERAGE_OUTPUT_FILENAME.c_str()); } vector<Kmer_counter_map_iterator>& kmers = sorted_kmers; //kcounter.get_kmers_sort_descending_counts(); unsigned long init_size = kcounter.size(); // string s = "before.kmers"; // kcounter.dump_kmers_to_file(s); for (unsigned int i = 0; i < kmers.size(); i++) { // cerr << "round: " << i << endl; unsigned long kmer_counter_size = kcounter.size(); if (kmer_counter_size > init_size) { // string s = "after.kmers"; // kcounter.dump_kmers_to_file(s); stringstream error; error << stacktrace() << "Error, Kcounter size has grown from " << init_size << " to " << kmer_counter_size << endl; throw (error.str()); } kmer_int_type_t kmer = kmers[i]->first; unsigned int kmer_count = kmers[i]->second; if (kmer_count == 0) { continue; } if (IRKE_COMMON::MONITOR >= 2) { cerr << "SEED kmer: " << kcounter.get_kmer_string(kmer) << ", count: " << kmer_count << endl; } if (kmer == revcomp_val(kmer, kmer_length)) { // palindromic kmer, avoid palindromes as seeds if (IRKE_COMMON::MONITOR >= 2) { cerr << "SEED kmer: " << kcounter.get_kmer_string(kmer) << " is palidnromic. Skipping. 
" << endl; } continue; } if (kmer_count < MIN_SEED_COVERAGE) { if (IRKE_COMMON::MONITOR >= 2) { cerr << "-seed has insufficient coverage, skipping" << endl; } continue; } float entropy = compute_entropy(kmer, kmer_length); if (entropy < MIN_SEED_ENTROPY) { if (IRKE_COMMON::MONITOR >= 2) { cerr << "-skipping seed due to low entropy: " << entropy << endl; } continue; } /* Extend to the right */ Kmer_visitor visitor(kmer_length, DOUBLE_STRANDED_MODE); Path_n_count_pair selected_path_n_pair_forward = inchworm(kcounter, 'F', kmer, visitor, min_connectivity); visitor.clear(); // add selected path to visitor vector<kmer_int_type_t>& forward_path = selected_path_n_pair_forward.first; if (IRKE_COMMON::MONITOR >= 2) { cerr << "Forward path contains: " << forward_path.size() << " kmers. " << endl; } for (unsigned int i = 0; i < forward_path.size(); i++) { kmer_int_type_t kmer = forward_path[i]; visitor.add(kmer); if (IRKE_COMMON::MONITOR >= 2) { cerr << "\tForward path kmer: " << kcounter.get_kmer_string(kmer) << endl; } } /* Extend to the left */ visitor.erase(kmer); // reset the seed Path_n_count_pair selected_path_n_pair_reverse = inchworm(kcounter, 'R', kmer, visitor, min_connectivity); if (IRKE_COMMON::MONITOR >= 2) { vector<kmer_int_type_t>& reverse_path = selected_path_n_pair_reverse.first; cerr << "Reverse path contains: " << reverse_path.size() << " kmers. " << endl; for (unsigned int i = 0; i < reverse_path.size(); i++) { cerr << "\tReverse path kmer: " << kcounter.get_kmer_string(reverse_path[i]) << endl; } } unsigned int total_counts = selected_path_n_pair_forward.second + selected_path_n_pair_reverse.second + kcounter.get_kmer_count(kmer); vector<kmer_int_type_t>& reverse_path = selected_path_n_pair_reverse.first; vector<kmer_int_type_t> joined_path = _join_forward_n_reverse_paths(reverse_path, kmer, forward_path); // report sequence reconstructed from path. 
vector<unsigned int> assembly_base_coverage; string sequence = reconstruct_path_sequence(kcounter, joined_path, assembly_base_coverage); unsigned int avg_cov = static_cast<unsigned int> ( (float)total_counts/(sequence.length()-kcounter.get_kmer_length() +1) + 0.5); /* cout << "Inchworm-reconstructed sequence, length: " << sequence.length() << ", avgCov: " << avg_cov << " " << sequence << endl; */ if (sequence.length() >= MIN_ASSEMBLY_LENGTH && avg_cov >= MIN_ASSEMBLY_COVERAGE) { INCHWORM_ASSEMBLY_COUNTER++; stringstream headerstream; headerstream << ">a" << INCHWORM_ASSEMBLY_COUNTER << ";" << avg_cov << " K: " << kmer_length << " length: " << sequence.length(); string header = headerstream.str(); sequence = add_fasta_seq_line_breaks(sequence, 60); cout << header << endl << sequence << endl; if (WRITE_COVERAGE) { coverage_writer << header << endl; for (unsigned int i = 0; i < assembly_base_coverage.size(); i++) { coverage_writer << assembly_base_coverage[i]; if ( (i+1) % 30 == 0) { coverage_writer << endl; } else { coverage_writer << " "; } } coverage_writer << endl; } } // remove path for (unsigned int i = 0; i < joined_path.size(); i++) { kmer_int_type_t kmer = joined_path[i]; /* if (DEBUG) { cout << "\tpruning kmer: " << kmer << endl; } */ kcounter.clear_kmer(kmer); } /* if (DEBUG) { cout << "done pruning kmers." << endl; } */ } if (IRKE_COMMON::MONITOR) { cerr << endl; } if (WRITE_COVERAGE) { coverage_writer.close(); } // drop sorted kmer list as part of cleanup clear_sorted_kmers_list(); return; // end of runIRKE }