Beispiel #1
0
/**
 * Build inchworm contigs from the seed kmers held in sorted_kmers and print
 * every accepted contig to stdout as a FASTA record.
 *
 * Seeds are tried in list order.  A seed is skipped when its count is zero,
 * when it equals its own reverse complement, when its count is below
 * MIN_SEED_COVERAGE, or when its entropy is below MIN_SEED_ENTROPY.  For each
 * remaining seed the path is extended with inchworm() in direction 'F' and
 * then 'R', the two halves are joined around the seed, and every kmer of the
 * joined path is cleared from kcounter -- whether or not the contig was
 * long/deep enough to be reported.
 *
 * @param kcounter                  kmer catalog; read during extension, pruned afterwards.
 * @param min_connectivity          passed through unchanged to inchworm().
 * @param MIN_ASSEMBLY_LENGTH       minimum sequence length for a contig to be reported.
 * @param MIN_ASSEMBLY_COVERAGE     minimum rounded average coverage for a contig to be reported.
 * @param WRITE_COVERAGE            when true, per-position coverage values are written as well.
 * @param COVERAGE_OUTPUT_FILENAME  file receiving the coverage values (opened only if WRITE_COVERAGE).
 *
 * Throws a std::string when got_sorted_kmers_flag is not set, or when
 * kcounter.size() exceeds the size observed on entry.
 * The sorted kmer list is dropped before returning.
 */
void IRKE::compute_sequence_assemblies(KmerCounter& kcounter, float min_connectivity,
                                       unsigned int MIN_ASSEMBLY_LENGTH, unsigned int MIN_ASSEMBLY_COVERAGE,
                                       bool WRITE_COVERAGE, string COVERAGE_OUTPUT_FILENAME) {

    if (! got_sorted_kmers_flag) {
        stringstream msg;
        msg << stacktrace() << " Error, must populate_sorted_kmers_list() before computing sequence assemblies" << endl;
        throw(msg.str());
    }

    unsigned int kmer_length = kcounter.get_kmer_length();

    ofstream coverage_writer;
    if (WRITE_COVERAGE) {
        coverage_writer.open(COVERAGE_OUTPUT_FILENAME.c_str());
    }

    vector<Kmer_counter_map_iterator>& seeds = sorted_kmers;

    // Reference size for the per-round sanity check below.
    const unsigned long starting_size = kcounter.size();

    for (unsigned int seed_idx = 0; seed_idx < seeds.size(); seed_idx++) {

        // The counter is only ever expected to shrink (or stay put) while assembling.
        const unsigned long current_size = kcounter.size();
        if (current_size > starting_size) {
            stringstream msg;
            msg << stacktrace() << "Error, Kcounter size has grown from " << starting_size
                << " to " << current_size << endl;
            throw (msg.str());
        }

        kmer_int_type_t seed = seeds[seed_idx]->first;
        unsigned int seed_count = seeds[seed_idx]->second;

        // ---- seed filters -------------------------------------------------

        if (seed_count == 0) {
            continue;
        }

        if (IRKE_COMMON::MONITOR >= 2) {
            cerr << "SEED kmer: " << kcounter.get_kmer_string(seed) << ", count: " << seed_count << endl;
        }

        if (seed == revcomp_val(seed, kmer_length)) {
            // palindromes are never used as seeds
            if (IRKE_COMMON::MONITOR >= 2) {
                cerr << "SEED kmer: " << kcounter.get_kmer_string(seed) << " is palidnromic.  Skipping. " << endl;
            }
            continue;
        }

        if (seed_count < MIN_SEED_COVERAGE) {
            if (IRKE_COMMON::MONITOR >= 2) {
                cerr << "-seed has insufficient coverage, skipping" << endl;
            }
            continue;
        }

        float seed_entropy = compute_entropy(seed, kmer_length);
        if (seed_entropy < MIN_SEED_ENTROPY) {
            if (IRKE_COMMON::MONITOR >= 2) {
                cerr << "-skipping seed due to low entropy: " << seed_entropy << endl;
            }
            continue;
        }

        // ---- rightward extension -----------------------------------------

        Kmer_visitor visitor(kmer_length, DOUBLE_STRANDED_MODE);
        Path_n_count_pair forward_result = inchworm(kcounter, 'F', seed, visitor, min_connectivity);

        // Start over with an empty visitor and register only the chosen forward path.
        visitor.clear();

        vector<kmer_int_type_t>& forward_path = forward_result.first;
        if (IRKE_COMMON::MONITOR >= 2) {
            cerr << "Forward path contains: " << forward_path.size() << " kmers. " << endl;
        }

        for (vector<kmer_int_type_t>::iterator fwd_it = forward_path.begin();
             fwd_it != forward_path.end(); ++fwd_it) {

            kmer_int_type_t path_kmer = *fwd_it;
            visitor.add(path_kmer);

            if (IRKE_COMMON::MONITOR >= 2) {
                cerr << "\tForward path kmer: " << kcounter.get_kmer_string(path_kmer) << endl;
            }
        }

        // ---- leftward extension ------------------------------------------

        // The seed must be un-visited again before extending in the other direction.
        visitor.erase(seed);

        Path_n_count_pair reverse_result = inchworm(kcounter, 'R', seed, visitor, min_connectivity);
        vector<kmer_int_type_t>& reverse_path = reverse_result.first;

        if (IRKE_COMMON::MONITOR >= 2) {
            cerr << "Reverse path contains: " << reverse_path.size() << " kmers. " << endl;
            for (vector<kmer_int_type_t>::iterator rev_it = reverse_path.begin();
                 rev_it != reverse_path.end(); ++rev_it) {
                cerr  << "\tReverse path kmer: " << kcounter.get_kmer_string(*rev_it) << endl;
            }
        }

        // ---- join halves and rebuild the sequence ------------------------

        unsigned int total_counts = forward_result.second + reverse_result.second + kcounter.get_kmer_count(seed);

        vector<kmer_int_type_t> joined_path = _join_forward_n_reverse_paths(reverse_path, seed, forward_path);

        vector<unsigned int> assembly_base_coverage;
        string sequence = reconstruct_path_sequence(kcounter, joined_path, assembly_base_coverage);

        // Average count per kmer position, rounded to the nearest integer.
        unsigned int avg_cov = static_cast<unsigned int>(
            static_cast<float>(total_counts) / (sequence.length() - kcounter.get_kmer_length() + 1) + 0.5);

        // ---- report -------------------------------------------------------

        const bool report_contig =
            (sequence.length() >= MIN_ASSEMBLY_LENGTH) && (avg_cov >= MIN_ASSEMBLY_COVERAGE);

        if (report_contig) {

            INCHWORM_ASSEMBLY_COUNTER++;

            stringstream headerstream;
            headerstream << ">a" << INCHWORM_ASSEMBLY_COUNTER << ";" << avg_cov
                         << " K: " << kmer_length
                         << " length: " << sequence.length();
            string header = headerstream.str();

            sequence = add_fasta_seq_line_breaks(sequence, 60);

            cout << header << endl << sequence << endl;

            if (WRITE_COVERAGE) {

                coverage_writer << header << endl;

                // 30 coverage values per line, space separated.
                for (unsigned int pos = 0; pos < assembly_base_coverage.size(); pos++) {
                    coverage_writer << assembly_base_coverage[pos];
                    if ( (pos + 1) % 30 != 0) {
                        coverage_writer << " ";
                    }
                    else {
                        coverage_writer << endl;
                    }
                }
                coverage_writer << endl;
            }
        }

        // ---- prune: consumed kmers cannot seed or extend later contigs ----

        for (vector<kmer_int_type_t>::iterator used_it = joined_path.begin();
             used_it != joined_path.end(); ++used_it) {
            kmer_int_type_t used_kmer = *used_it;
            kcounter.clear_kmer(used_kmer);
        }
    }

    if (IRKE_COMMON::MONITOR) {
        cerr << endl;
    }

    if (WRITE_COVERAGE) {
        coverage_writer.close();
    }

    // drop sorted kmer list as part of cleanup
    clear_sorted_kmers_list();
}
Beispiel #2
0
/**
 * Build inchworm contigs from the seed kmers held in sorted_kmers, possibly
 * with several OpenMP threads, and print the non-redundant contigs to stdout
 * as FASTA records.
 *
 * Every thread appends its accepted contigs to its own temporary file
 * ("tmp.iworm.fa.pid_<pid>.thread_<n>", four whitespace-separated fields per
 * record: total_counts, avg_cov, seed kmer count, sequence).  After the
 * parallel loop the files are read back one after another, contigs whose
 * generateHash() value was already seen are dropped, and the rest are printed.
 *
 * @param kcounter                  kmer catalog; read during extension, pruned afterwards.
 * @param min_connectivity          passed through to the seed check, contig builder and seed picker.
 * @param MIN_ASSEMBLY_LENGTH       minimum sequence length for a contig to be kept.
 * @param MIN_ASSEMBLY_COVERAGE     minimum rounded average coverage for a contig to be kept.
 * @param PARALLEL_IWORM            true: use IRKE_COMMON::NUM_THREADS threads; false: one thread.
 * @param WRITE_COVERAGE            when true, COVERAGE_OUTPUT_FILENAME is opened.
 *                                  NOTE: nothing is written to it in this version.
 * @param COVERAGE_OUTPUT_FILENAME  name of the coverage file.
 *
 * Throws a std::string when got_sorted_kmers_flag is not set, when a
 * per-thread temporary file cannot be opened, or when kcounter.size()
 * exceeds the size observed on entry.
 */
void IRKE::compute_sequence_assemblies(KmerCounter &kcounter, float min_connectivity,
                                       unsigned int MIN_ASSEMBLY_LENGTH, unsigned int MIN_ASSEMBLY_COVERAGE,
                                       bool PARALLEL_IWORM, bool WRITE_COVERAGE, string COVERAGE_OUTPUT_FILENAME)
{

    if (!got_sorted_kmers_flag) {
        stringstream error;
        error << stacktrace() << " Error, must populate_sorted_kmers_list() before computing sequence assemblies"
            << endl;
        throw (error.str());
    }


    vector<Kmer_Occurence_Pair> &kmers = sorted_kmers;  // note, these are not actually sorted if PARALLEL_IWORM mode.

    unsigned long init_size = kcounter.size();

    cerr << "Total kcounter hash size: " << init_size << " vs. sorted list size: " << kmers.size() << endl;

    unsigned int kmer_length = kcounter.get_kmer_length();
    ofstream coverage_writer;
    if (WRITE_COVERAGE) {
        coverage_writer.open(COVERAGE_OUTPUT_FILENAME.c_str());
    }


    /* -----------------------------------------------------------
       Two reconstruction modes. (bhaas, Jan-4-2014)

       1.  Original (not setting PARALLEL_IWORM): kmer list is sorted descendingly by abundance.
       Seed kmer is chosen as most abundant, and contig construction extends from this seed.

       2.  PARALLEL_IWORM:  the kmer list is not sorted. A random kmer (just ordered by hash iterator, nothing special, probably not so random)
       is used as a seed for draft contig reconstruction.
       The most abundant kmer in the draft contig is chosen as a proper seed.
       An inchworm contig is then constructed using this new seed, and reported.
       So, in this case, the draft contig in the first phase is just used to find a better seed, from which the inchworm contig is then
       properly reconstructed.

    --------------------------------------------------------------- */


    // try building an inchworm contig from each seed kmer
    int myTid;


    if (PARALLEL_IWORM) {
        omp_set_num_threads(IRKE_COMMON::NUM_THREADS);
    }
    else {
        omp_set_num_threads(1); // turn off multithreading for the contig building.
    }


    //-----------------------------------------------------------------
    // Prep writing to separate inchworm output files for each thread
    //-----------------------------------------------------------------

    vector<iworm_tmp_file> tmpfiles;
    int num_threads = omp_get_max_threads();
    cerr << "num threads set to: " << num_threads << endl;

    for (int i = 0; i < num_threads; i++) {

        stringstream filename_constructor;
        filename_constructor << "tmp.iworm.fa.pid_" << getpid() << ".thread_" << i;
        const string tmp_name = filename_constructor.str();

        // Populate the record before it is copied into the vector so that no
        // uninitialised pointer members are ever copied.
        iworm_tmp_file itmp;

        // Size the buffer from the actual name (+1 for the terminating NUL that
        // strcpy writes) instead of relying on a fixed-size buffer.
        itmp.tmp_filename = new char[tmp_name.length() + 1];
        strcpy(itmp.tmp_filename, tmp_name.c_str());

        itmp.fh = new ofstream();
        itmp.fh->open(itmp.tmp_filename);
        if (!itmp.fh->is_open()) {
            // Without this file every contig built by thread i would be silently lost.
            stringstream error;
            error << stacktrace() << " Error, cannot open temporary inchworm output file: "
                << tmp_name << endl;
            throw (error.str());
        }
        cerr << "Done opening file. " << itmp.tmp_filename << endl;

        tmpfiles.push_back(itmp);
    }


    //-------------------
    // Build contigs.
    //-------------------

#pragma omp parallel for private (myTid) schedule (dynamic, 1000)
    for (unsigned int i = 0; i < kmers.size(); i++) {

        myTid = omp_get_thread_num();

        unsigned long kmer_counter_size = kcounter.size();
        if (kmer_counter_size > init_size) {

            // NOTE(review): this throw leaves an OpenMP worksharing loop; as far as
            // the OpenMP rules go that is not a recoverable error path -- confirm
            // that aborting the program here is the intended outcome.
            stringstream error;
            error << stacktrace() << "Error, Kcounter size has grown from " << init_size
                << " to " << kmer_counter_size << endl;
            throw (error.str());
        }


        kmer_int_type_t kmer = kmers[i].first;
        // Do NOT use kmers[i].second here: it is the count captured for sorting, while the
        // live count in the hashtable may have been zeroed by an earlier contig.
        unsigned int kmer_count = kcounter.get_kmer_count(kmer);

        if (!is_good_seed_kmer(kmer, kmer_count, kmer_length, min_connectivity)) {
            continue;
        }

        if (IRKE_COMMON::MONITOR >= 2) {
            cerr << "SEED kmer: " << kcounter.get_kmer_string(kmer) << ", count: " << kmer_count << endl;
        }

        if (IRKE_COMMON::MONITOR >= 2) {
#pragma omp critical
            cerr << "Seed for thread: " << myTid << " is " << kcounter.get_kmer_string(kmer) << " with count: "
                << kmer_count << endl;
        }

        // total_counts is filled in by build_inchworm_contig_from_seed().
        unsigned int total_counts;
        vector<kmer_int_type_t> joined_path =
            build_inchworm_contig_from_seed(kmer, kcounter, min_connectivity, total_counts, PARALLEL_IWORM);

        if (PARALLEL_IWORM && TWO_PHASE) {
            // get a new seed based on the draft contig
            // choose the 'good' kmer with highest abundance

            kmer_int_type_t new_seed = extract_best_seed(joined_path, kcounter, min_connectivity);

            if (kcounter.get_kmer_count(new_seed) == 0) {
                continue; // must have been zapped by another thread
            }

            joined_path =
                build_inchworm_contig_from_seed(new_seed, kcounter, min_connectivity, total_counts, PARALLEL_IWORM);
        }

        // report sequence reconstructed from path.

        vector<unsigned int> assembly_base_coverage;

        string sequence = reconstruct_path_sequence(kcounter, joined_path, assembly_base_coverage);

        // Average count per kmer position, rounded to the nearest integer.
        unsigned int avg_cov =
            static_cast<unsigned int> ((float) total_counts / (sequence.length() - kcounter.get_kmer_length() + 1)
                + 0.5);

        size_t contig_length = sequence.length();

        if (contig_length >= MIN_ASSEMBLY_LENGTH && avg_cov >= MIN_ASSEMBLY_COVERAGE) {

            // One record = four lines in this thread's private temp file.
            *(tmpfiles[myTid].fh) << total_counts << endl
                << avg_cov << endl
                << kmer_count << endl
                << sequence << endl;

        }

        // remove path

        if (IRKE_COMMON::__DEVEL_zero_kmer_on_use) {

            // dont forget the seed. The forward/reverse path kmers already cleared.
            kcounter.clear_kmer(kmer);

        } else {

            for (unsigned int j = 0; j < joined_path.size(); j++) {

                kmer_int_type_t path_kmer = joined_path[j];
                kcounter.clear_kmer(path_kmer);
            }

        }

    }

    if (IRKE_COMMON::MONITOR) {
        cerr << endl;
    }

    // NOTE: per-base coverage reporting is not implemented in this version; the
    // coverage file is opened (and closed here) but never written to.
    if (WRITE_COVERAGE) {
        coverage_writer.close();
    }

    // drop sorted kmer list as part of cleanup
    clear_sorted_kmers_list();


    //------------------------------------------------------------------------------
    // examine the contigs generated by the individual threads, remove redundancies.
    //------------------------------------------------------------------------------

    map<unsigned long long, bool> seen_contig_already;

    for (unsigned int i = 0; i < tmpfiles.size(); i++) {

        tmpfiles[i].fh->close();

        ifstream tmpreader(tmpfiles[i].tmp_filename);

        unsigned int total_counts;
        unsigned int avg_cov;
        unsigned int kmer_count;
        string sequence;

        // Loop on the extraction itself: it stops at end of file AND on any other
        // read/open failure (testing only eof() would spin forever in that case).
        while (tmpreader >> total_counts >> avg_cov >> kmer_count >> sequence) {

            // Keep the hash at the width of the map key rather than narrowing it.
            unsigned long long contig_hash = generateHash(sequence);

            bool &already_reported = seen_contig_already[contig_hash];
            if (already_reported) {
                continue;
            }
            already_reported = true;

            INCHWORM_ASSEMBLY_COUNTER++;

            stringstream headerstream;

            headerstream << ">a" << INCHWORM_ASSEMBLY_COUNTER << ";" << avg_cov
                << " total_counts: " << total_counts
                << " Seed: " << kmer_count
                << " K: " << kmer_length
                << " length: " << sequence.length();

            string header = headerstream.str();

            sequence = add_fasta_seq_line_breaks(sequence, 60);

            cout << header << endl << sequence << endl;
        }

        tmpreader.close();

        // cleanup from earlier dynamic allocation
        delete tmpfiles[i].fh;
        if (!IRKE_COMMON::KEEP_TMP_FILES) {
            remove(tmpfiles[i].tmp_filename);
        }
        delete[] tmpfiles[i].tmp_filename;  // allocated with new[]

    }
}