Ejemplo n.º 1
0
/**
 * Assemble a single kmer path around a seed by extending it in both directions.
 *
 * The seed is first extended with inchworm() in the 'F' direction. The visitor is then
 * cleared and re-populated with exactly the kmers of the chosen forward path, the seed is
 * erased from the visitor again, and inchworm() is run in the 'R' direction from the same
 * seed. The reverse path, the seed and the forward path are finally combined by
 * _join_forward_n_reverse_paths().
 *
 * When IRKE_COMMON::MONITOR >= 2, the size and the kmers of both paths are written to cerr.
 *
 * @param kmer              seed kmer to extend from
 * @param kcounter          queried for the seed count, the kmer length and (when monitoring)
 *                          kmer strings; also handed to both inchworm() calls
 * @param min_connectivity  forwarded unchanged to both inchworm() calls
 * @param total_counts      [out] sum of the `.second` members of the forward and reverse
 *                          inchworm() results plus the seed's count, where the seed's count
 *                          is read before either extension runs
 * @param (unnamed bool)    accepted but never read by this function
 * @return the path produced by _join_forward_n_reverse_paths(reverse, seed, forward)
 */
vector<kmer_int_type_t>  IRKE::build_inchworm_contig_from_seed(kmer_int_type_t kmer, KmerCounter &kcounter,
                                                               float min_connectivity, unsigned int &total_counts,
                                                               bool)
{
    const bool verbose = (IRKE_COMMON::MONITOR >= 2);

    // Sample the seed's count up front, before any extension is attempted.
    const unsigned int seed_count = kcounter.get_kmer_count(kmer);
    const unsigned int k = kcounter.get_kmer_length();

    // ---- rightward extension ----
    Kmer_visitor seen(k, DOUBLE_STRANDED_MODE);
    Path_n_count_pair fwd_result = inchworm(kcounter, 'F', kmer, seen, min_connectivity);

    // Forget everything touched during the search; only the selected path is re-registered below.
    seen.clear();

    vector<kmer_int_type_t> &fwd_kmers = fwd_result.first;

    if (verbose) {
        cerr << "Forward path contains: " << fwd_kmers.size() << " kmers. " << endl;
    }

    for (vector<kmer_int_type_t>::iterator it = fwd_kmers.begin(); it != fwd_kmers.end(); ++it) {
        kmer_int_type_t path_kmer = *it;
        seen.add(path_kmer);

        if (verbose) {
            cerr << "\tForward path kmer: " << kcounter.get_kmer_string(path_kmer) << endl;
        }
    }

    // ---- leftward extension ----
    // Reset the seed (drop it from the visitor) before extending from it in the other direction.
    seen.erase(kmer);

    Path_n_count_pair rev_result = inchworm(kcounter, 'R', kmer, seen, min_connectivity);
    vector<kmer_int_type_t> &rev_kmers = rev_result.first;

    if (verbose) {
        cerr << "Reverse path contains: " << rev_kmers.size() << " kmers. " << endl;
        for (vector<kmer_int_type_t>::iterator it = rev_kmers.begin(); it != rev_kmers.end(); ++it) {
            cerr << "\tReverse path kmer: " << kcounter.get_kmer_string(*it) << endl;
        }
    }

    total_counts = fwd_result.second + rev_result.second + seed_count;

    return _join_forward_n_reverse_paths(rev_kmers, kmer, fwd_kmers);
}
Ejemplo n.º 2
0
/**
 * Assemble contigs from the pre-sorted kmer list and print them as FASTA records on stdout.
 *
 * populate_sorted_kmers_list() must have been called first (got_sorted_kmers_flag set).
 * Every entry of sorted_kmers is considered as a seed, in list order (presumably descending
 * count -- the ordering is established elsewhere; confirm against populate_sorted_kmers_list()).
 * A seed is skipped when
 *   - its stored count is 0 (presumably already consumed by an earlier contig via
 *     clear_kmer() -- TODO confirm),
 *   - it equals its own reverse complement,
 *   - its count is below MIN_SEED_COVERAGE, or
 *   - its entropy is below MIN_SEED_ENTROPY.
 *
 * An accepted seed is extended with inchworm() in the 'F' and then the 'R' direction, the two
 * paths are joined around the seed, and the sequence is reconstructed from the joined path.
 * The contig is reported only if its length and rounded average coverage reach the given
 * minimums. Regardless of whether it was reported, every kmer of the joined path is passed to
 * kcounter.clear_kmer().
 *
 * Output format: a header line ">a<counter>;<avg_cov> K: <kmer_length> length: <length>"
 * followed by the sequence wrapped with add_fasta_seq_line_breaks(sequence, 60). When
 * WRITE_COVERAGE is set, the same header is written to the coverage file, followed by the
 * per-base coverage values separated by spaces with a line break after every 30th value.
 *
 * On normal completion the sorted kmer list is dropped with clear_sorted_kmers_list().
 *
 * @param kcounter                 kmer store that is queried during assembly and pruned of
 *                                 every kmer placed in a contig path
 * @param min_connectivity         forwarded unchanged to inchworm()
 * @param MIN_ASSEMBLY_LENGTH      minimum sequence length for a contig to be reported
 * @param MIN_ASSEMBLY_COVERAGE    minimum rounded average kmer coverage for a contig to be reported
 * @param WRITE_COVERAGE           when true, per-base coverage is written to COVERAGE_OUTPUT_FILENAME
 * @param COVERAGE_OUTPUT_FILENAME path of the coverage file; only used when WRITE_COVERAGE is true
 *
 * @throws string  if the sorted kmer list has not been populated, if the coverage file cannot
 *                 be opened, or if kcounter is found to have grown since the start of the run
 */
void IRKE::compute_sequence_assemblies(KmerCounter& kcounter, float min_connectivity,
                                       unsigned int MIN_ASSEMBLY_LENGTH, unsigned int MIN_ASSEMBLY_COVERAGE,
                                       bool WRITE_COVERAGE, string COVERAGE_OUTPUT_FILENAME) {

    if (! got_sorted_kmers_flag) {
        stringstream error;
        error << stacktrace() << " Error, must populate_sorted_kmers_list() before computing sequence assemblies" << endl;
        throw(error.str());
    }

    unsigned int kmer_length = kcounter.get_kmer_length();

    ofstream coverage_writer;
    if (WRITE_COVERAGE) {
        coverage_writer.open(COVERAGE_OUTPUT_FILENAME.c_str());

        // Fail up front rather than silently discarding all coverage output for the whole run.
        if (! coverage_writer.is_open()) {
            stringstream error;
            error << stacktrace() << " Error, cannot open coverage output file for writing: "
                  << COVERAGE_OUTPUT_FILENAME << endl;
            throw(error.str());
        }
    }

    vector<Kmer_counter_map_iterator>& kmers = sorted_kmers;

    // Assembly only ever clears kmers; remember the starting size so that growth can be detected.
    unsigned long init_size = kcounter.size();

    // size_type index: an unsigned int index would wrap around and never terminate
    // if the list held more than UINT_MAX entries.
    for (vector<Kmer_counter_map_iterator>::size_type i = 0; i < kmers.size(); i++) {

        unsigned long kmer_counter_size = kcounter.size();
        if (kmer_counter_size > init_size) {
            stringstream error;
            error << stacktrace() << "Error, Kcounter size has grown from " << init_size
                  << " to " << kmer_counter_size << endl;
            throw (error.str());
        }

        kmer_int_type_t kmer = kmers[i]->first;
        unsigned int kmer_count = kmers[i]->second;

        /* ---- seed filters ---- */

        if (kmer_count == 0) {
            continue;
        }

        if (IRKE_COMMON::MONITOR >= 2) {
            cerr << "SEED kmer: " << kcounter.get_kmer_string(kmer) << ", count: " << kmer_count << endl;
        }

        if (kmer == revcomp_val(kmer, kmer_length)) {
            // palindromic kmer, avoid palindromes as seeds
            if (IRKE_COMMON::MONITOR >= 2) {
                cerr << "SEED kmer: " << kcounter.get_kmer_string(kmer) << " is palidnromic.  Skipping. " << endl;
            }
            continue;
        }

        if (kmer_count < MIN_SEED_COVERAGE) {
            if (IRKE_COMMON::MONITOR >= 2) {
                cerr << "-seed has insufficient coverage, skipping" << endl;
            }
            continue;
        }

        float entropy = compute_entropy(kmer, kmer_length);

        if (entropy < MIN_SEED_ENTROPY) {
            if (IRKE_COMMON::MONITOR >= 2) {
                cerr << "-skipping seed due to low entropy: " << entropy << endl;
            }
            continue;
        }

        /* ---- extend to the right ---- */

        Kmer_visitor visitor(kmer_length, DOUBLE_STRANDED_MODE);
        Path_n_count_pair selected_path_n_pair_forward = inchworm(kcounter, 'F', kmer, visitor, min_connectivity);

        // Forget everything touched during the search, then register only the selected path.
        visitor.clear();

        vector<kmer_int_type_t>& forward_path = selected_path_n_pair_forward.first;
        if (IRKE_COMMON::MONITOR >= 2) {
            cerr << "Forward path contains: " << forward_path.size() << " kmers. " << endl;
        }

        for (vector<kmer_int_type_t>::size_type j = 0; j < forward_path.size(); j++) {
            kmer_int_type_t path_kmer = forward_path[j];
            visitor.add(path_kmer);

            if (IRKE_COMMON::MONITOR >= 2) {
                cerr << "\tForward path kmer: " << kcounter.get_kmer_string(path_kmer) << endl;
            }
        }

        /* ---- extend to the left ---- */

        visitor.erase(kmer); // reset the seed

        Path_n_count_pair selected_path_n_pair_reverse = inchworm(kcounter, 'R', kmer, visitor, min_connectivity);
        vector<kmer_int_type_t>& reverse_path = selected_path_n_pair_reverse.first;

        if (IRKE_COMMON::MONITOR >= 2) {
            cerr << "Reverse path contains: " << reverse_path.size() << " kmers. " << endl;
            for (vector<kmer_int_type_t>::size_type j = 0; j < reverse_path.size(); j++) {
                cerr  << "\tReverse path kmer: " << kcounter.get_kmer_string(reverse_path[j]) << endl;
            }
        }

        unsigned int total_counts = selected_path_n_pair_forward.second + selected_path_n_pair_reverse.second + kcounter.get_kmer_count(kmer);

        vector<kmer_int_type_t> joined_path = _join_forward_n_reverse_paths(reverse_path, kmer, forward_path);

        /* ---- report the sequence reconstructed from the path ---- */

        vector<unsigned int> assembly_base_coverage;
        string sequence = reconstruct_path_sequence(kcounter, joined_path, assembly_base_coverage);

        // Average coverage = total kmer count / number of kmers in the sequence, rounded to nearest.
        // The guard keeps the unsigned subtraction from wrapping and rules out a zero divisor
        // should the sequence ever be shorter than one kmer; coverage is reported as 0 then.
        unsigned int avg_cov = 0;
        if (sequence.length() >= kmer_length) {
            avg_cov = static_cast<unsigned int> ( static_cast<float>(total_counts) / (sequence.length() - kmer_length + 1) + 0.5);
        }

        if (sequence.length() >= MIN_ASSEMBLY_LENGTH && avg_cov >= MIN_ASSEMBLY_COVERAGE) {

            INCHWORM_ASSEMBLY_COUNTER++;

            // The header records the unwrapped length, so build it before line breaks are inserted.
            stringstream headerstream;
            headerstream << ">a" << INCHWORM_ASSEMBLY_COUNTER << ";" << avg_cov
                         << " K: " << kmer_length
                         << " length: " << sequence.length();

            string header = headerstream.str();

            sequence = add_fasta_seq_line_breaks(sequence, 60);

            cout << header << endl << sequence << endl;

            if (WRITE_COVERAGE) {

                coverage_writer << header << endl;

                // 30 coverage values per line, space separated.
                for (vector<unsigned int>::size_type j = 0; j < assembly_base_coverage.size(); j++) {
                    coverage_writer << assembly_base_coverage[j];
                    if ( (j+1) % 30 == 0) {
                        coverage_writer << endl;
                    }
                    else {
                        coverage_writer << " ";
                    }
                }
                coverage_writer << endl;
            }
        }

        /* ---- remove the path's kmers from the counter, reported or not ---- */

        for (vector<kmer_int_type_t>::size_type j = 0; j < joined_path.size(); j++) {
            kcounter.clear_kmer(joined_path[j]);
        }
    }

    if (IRKE_COMMON::MONITOR) {
        cerr << endl;
    }

    if (WRITE_COVERAGE) {
        coverage_writer.close();
    }

    // drop sorted kmer list as part of cleanup
    clear_sorted_kmers_list();
}