예제 #1
0
/**
 * Build a DeBruijnGraph from every sequence found in the given FASTA files.
 *
 * Each record's sequence is split on 'X'. Every resulting region has its
 * non-GATC characters replaced (via replace_nonGATC_chars_with_A) when
 * contains_non_gatc() reports any, and is then added to the graph. Unless
 * sStrand is set, the reverse complement of the region is added as well.
 *
 * @param fasta_file_names  FASTA files to read, processed in the given order.
 * @param kmer_length       Forwarded to the DeBruijnGraph constructor.
 * @param component_val     Not referenced anywhere in this function.
 * @param sStrand           When true, only the forward orientation is added.
 * @return The populated graph.
 */
DeBruijnGraph constructDeBruijnGraph (vector<string> fasta_file_names, int kmer_length, int component_val, bool sStrand) {

    DeBruijnGraph graph(kmer_length);

    for (vector<string>::iterator file_it = fasta_file_names.begin();
         file_it != fasta_file_names.end();
         ++file_it) {

        string fasta_filename = *file_it;

        if (IRKE_COMMON::MONITOR > 1) {
            cerr << "Parsing file: " << fasta_filename << endl;
        }

        Fasta_reader reader(fasta_filename);

        while (reader.hasNext()) {

            Fasta_entry entry = reader.getNext();
            string sequence = entry.get_sequence();

            // inchworm bundles concatenated with 'X' delimiters by Chrysalis
            vector<string> regions;
            string_util::tokenize(sequence, regions, "X");

            for (vector<string>::iterator region_it = regions.begin();
                 region_it != regions.end();
                 ++region_it) {

                string region = *region_it;

                if (contains_non_gatc(region)) {
                    region = replace_nonGATC_chars_with_A(region);
                }

                if (IRKE_COMMON::MONITOR > 2) {
                    cerr << "Adding sequence to graph: " << region << endl;
                }
                graph.add_sequence(region);

                // Strand-specific input: the forward orientation is all we add.
                if (sStrand) {
                    continue;
                }

                string reverse_region = revcomp(region);

                if (IRKE_COMMON::MONITOR > 2) {
                    cerr << "Adding sequence to graph: " << reverse_region << endl;
                }
                graph.add_sequence(reverse_region);
            }
        }
    }

    return graph;
}
예제 #2
0
map<string,string> Fasta_reader::retrieve_all_seqs_hash() {
    
    map<string,string> all_seqs_hash;
    
    while (this->hasNext()) {
        Fasta_entry f = this->getNext();
        string acc = f.get_accession();
        string seq = f.get_sequence();
        
        all_seqs_hash[acc] = seq;
    }
    
    return(all_seqs_hash);
}
/**
 * Load pre-counted k-mers from a FASTA file into `kcounter`.
 *
 * For every record the sequence is taken as the k-mer and the header is parsed
 * with atoi() as its count. Records whose sequence length differs from
 * KMER_SIZE are reported on stderr and skipped. Reading stops (per thread) at
 * the first record whose sequence is empty.
 *
 * The records are consumed by an OpenMP team that shares one Fasta_reader.
 * NOTE(review): this relies on Fasta_reader::getNext() and
 * KmerCounter::add_kmer() being safe to call concurrently - confirm.
 *
 * @param kcounter          Counter that receives the k-mers.
 * @param kmers_fasta_file  Path handed to the Fasta_reader constructor.
 */
void populate_kmer_counter(KmerCounter &kcounter, string &kmers_fasta_file)
{
    // code largely copied from IRKE.cpp

    // One zero-initialised slot per potential thread. A vector (instead of a
    // raw new[]) releases the storage even if something below throws.
    vector<unsigned long> record_counter(omp_get_max_threads(), 0UL);

    cerr << "-reading Kmer occurences..." << endl;
    unsigned long start = time(NULL);
    Fasta_reader fasta_reader(kmers_fasta_file);
#pragma omp parallel
    {
        // Declared inside the region, so each thread gets its own copy.
        const int myTid = omp_get_thread_num();
        while (true) {
            Fasta_entry fe = fasta_reader.getNext();
            string seq = fe.get_sequence();
            if (seq == "") break; // empty sequence is treated as end of input
            record_counter[myTid]++;
            if (IRKE_COMMON::MONITOR) {
                if (myTid == 0 && record_counter[myTid] % 100000 == 0) {
                    // Progress only: other threads' slots are read without
                    // synchronisation, so the total is approximate.
                    unsigned long parsed = 0;
                    const int team_size = omp_get_num_threads();
                    for (int t = 0; t < team_size; t++)
                        parsed += record_counter[t];
                    cerr << "\r [" << parsed / 1000000 << "M] Kmers parsed.     ";
                }
            }
            if (seq.length() != KMER_SIZE) {
                cerr << "ERROR: kmer " << seq << " is not of length: " << KMER_SIZE << endl;
                continue;
            }
            kmer_int_type_t kmer = kcounter.get_kmer_intval(seq);
            unsigned int count = atoi(fe.get_header().c_str());
            kcounter.add_kmer(kmer, count);
        }
    }
    unsigned long end = time(NULL);

    unsigned long sum = 0;
    for (size_t t = 0; t < record_counter.size(); t++)
        sum += record_counter[t];

    cerr << endl << " done parsing " << sum << " Kmers, " << kcounter.size() << " added, taking " << (end - start)
        << " seconds." << endl;
    return;
}
예제 #4
0
/**
 * For each record in a FASTA file, print its accession followed by the result
 * of IRKE::thread_sequence_through_graph() for its sequence.
 *
 * Output goes to stdout as "// <accession>", then the threading result, then
 * a blank line.
 *
 * @param irke                Object whose thread_sequence_through_graph() is called.
 * @param cds_fasta_filename  Path handed to the Fasta_reader constructor.
 */
void thread_sequences_through_graph(IRKE& irke, string cds_fasta_filename) {

    Fasta_reader reader(cds_fasta_filename);

    for (; reader.hasNext(); ) {

        Fasta_entry entry = reader.getNext();

        string accession = entry.get_accession();
        string sequence = entry.get_sequence();

        // Header line first, as its own statement, so it is emitted before
        // the threading call runs.
        cout << "// " << accession << endl;
        cout << irke.thread_sequence_through_graph(sequence) << endl << endl;
    }
}
예제 #5
0
/**
 * Report, for every record in a FASTA file, whether IRKE::sequence_path_exists()
 * accepts its sequence under the given thresholds.
 *
 * One line per record is written to stdout: the accession, the three
 * thresholds, and "T" or "F". When WRITE_COVERAGE is set, the coverage values
 * filled in by sequence_path_exists() are also written to
 * COVERAGE_OUTPUT_FILENAME in FASTA-like form: a ">accession" line followed by
 * the values, 30 per line, space separated.
 *
 * @param irke                     Object queried via sequence_path_exists().
 * @param cds_fasta_filename       Path handed to the Fasta_reader constructor.
 * @param min_cov                  Forwarded to sequence_path_exists().
 * @param min_connectivity         Forwarded to sequence_path_exists().
 * @param min_entropy              Forwarded to sequence_path_exists().
 * @param WRITE_COVERAGE           Enables the coverage output file.
 * @param COVERAGE_OUTPUT_FILENAME File to create when WRITE_COVERAGE is set.
 * @throws The result of stacktrace() + message when the coverage file cannot
 *         be opened.
 */
void examine_CDS_paths(IRKE &irke,
                       string cds_fasta_filename,
                       unsigned int min_cov,
                       float min_connectivity,
                       float min_entropy,
                       bool WRITE_COVERAGE,
                       string COVERAGE_OUTPUT_FILENAME)
{
    Fasta_reader reader(cds_fasta_filename);

    ofstream coverage_out;
    if (WRITE_COVERAGE) {
        coverage_out.open(COVERAGE_OUTPUT_FILENAME.c_str());
        if (!coverage_out.is_open()) {
            throw (stacktrace() + "Error, cannot write to file: " + COVERAGE_OUTPUT_FILENAME);
        }
    }

    while (reader.hasNext()) {

        Fasta_entry entry = reader.getNext();
        string accession = entry.get_accession();
        string sequence = entry.get_sequence();

        vector<unsigned int> coverage;

        cout << accession << "\tCov: " << min_cov << "\tCon: " << min_connectivity << "\tE: " << min_entropy;

        const bool path_found =
            irke.sequence_path_exists(sequence, min_cov, min_entropy, min_connectivity, coverage);

        if (path_found) {
            cout << "\tT" << endl;
        }
        else {
            cout << "\tF" << endl;
        }

        if (!WRITE_COVERAGE) {
            continue;
        }

        coverage_out << ">" << accession << endl;

        const size_t num_values = coverage.size();
        for (size_t pos = 0; pos < num_values; ++pos) {
            coverage_out << coverage[pos];
            // Break the line after every 30th value, otherwise separate with a space.
            if ((pos + 1) % 30 != 0) {
                coverage_out << " ";
            }
            else {
                coverage_out << endl;
            }
        }
        coverage_out << endl;
    }

    if (WRITE_COVERAGE) {
        coverage_out.close();
    }
}
예제 #6
0
/**
 * Load pre-counted k-mers from a FASTA file into this object's kcounter and
 * write the resulting number of stored k-mers to "inchworm.kmer_count".
 *
 * For every record the sequence is taken as the k-mer and the header is parsed
 * with atoi() as its count. Records whose sequence length differs from the
 * counter's k-mer length are skipped silently. Reading stops (per thread) at
 * the first record whose sequence is empty.
 *
 * The records are consumed by an OpenMP team that shares one Fasta_reader.
 * NOTE(review): this relies on Fasta_reader::getNext() and
 * KmerCounter::add_kmer() being safe to call concurrently - confirm.
 *
 * @param fasta_filename  Path handed to the Fasta_reader constructor.
 */
void IRKE::populate_Kmers_from_kmers(const string& fasta_filename) {
    const unsigned int kmer_length = kcounter.get_kmer_length();

    // One zero-initialised slot per potential thread. A vector (instead of a
    // raw new[]) releases the storage even if something below throws.
    vector<unsigned long> record_counter(omp_get_max_threads(), 0UL);

    cerr << "-reading Kmer occurences..." << endl;
    unsigned long start = time(NULL);

    Fasta_reader fasta_reader(fasta_filename);

    #pragma omp parallel
    {
        // Declared inside the region, so each thread gets its own copy.
        const int myTid = omp_get_thread_num();

        while (true) {
            Fasta_entry fe = fasta_reader.getNext();
            string seq = fe.get_sequence();
            if (seq == "") break; // empty sequence is treated as end of input

            record_counter[myTid]++;

            if (IRKE_COMMON::MONITOR) {
                if (myTid == 0 && record_counter[myTid] % 100000 == 0) {
                    // Progress only: other threads' slots are read without
                    // synchronisation, so the total is approximate.
                    unsigned long parsed = 0;
                    const int team_size = omp_get_num_threads();
                    for (int t = 0; t < team_size; t++)
                        parsed += record_counter[t];
                    cerr << "\r [" << parsed/1000000 << "M] Kmers parsed.     ";
                }
            }

            if (seq.length() != kmer_length) {
                continue;
            }

            kmer_int_type_t kmer = kcounter.get_kmer_intval(seq);
            unsigned int count = atoi(fe.get_header().c_str());
            kcounter.add_kmer(kmer, count);
        }
    }
    unsigned long end = time(NULL);

    unsigned long sum = 0;
    for (size_t t = 0; t < record_counter.size(); t++)
        sum += record_counter[t];

    cerr << endl << " done parsing " << sum << " Kmers, " << kcounter.size() << " added, taking " << (end-start) << " seconds." << endl;

    // Record the number of stored k-mers in the working directory.
    ofstream iworm_kmer_count_report_fh;
    iworm_kmer_count_report_fh.open("inchworm.kmer_count");
    iworm_kmer_count_report_fh << kcounter.size() << endl;
    iworm_kmer_count_report_fh.close();

    return;
}
예제 #7
0
/**
 * Count the k-mers of every sequence in a FASTA file into this object's
 * kcounter.
 *
 * Sequences shorter than kmer_length + 1 are skipped. In reassembleIworm mode
 * each sequence is added with a coverage weight derived from the record:
 * the last ';'-separated token of the accession is the coverage, the third
 * space-separated header token is the k-mer size of the earlier run, and the
 * weight is round(coverage * kmer / 25). Otherwise the sequence is added
 * without an explicit weight.
 *
 * When PRUNE_SINGLETON_READ_INTERVAL is positive, thread 0 periodically calls
 * prune_kmers_min_count(1).
 *
 * The records are consumed by an OpenMP team that shares one Fasta_reader.
 * NOTE(review): this relies on Fasta_reader and KmerCounter being safe to use
 * concurrently, and the throws below happen inside the parallel region -
 * confirm both are acceptable.
 *
 * @param fasta_filename   Path handed to the Fasta_reader constructor.
 * @param reassembleIworm  Selects the coverage-weighted mode described above.
 * @throws std::string when, in reassembleIworm mode, the accession or header
 *         does not have the expected format or the coverage parses to < 1.
 */
void IRKE::populate_Kmers_from_fasta(const string& fasta_filename, bool reassembleIworm) {

    const unsigned int kmer_length = kcounter.get_kmer_length();

    // One zero-initialised slot per potential thread. A vector (instead of a
    // raw new[]) releases the storage even if something below throws.
    vector<unsigned long> record_counter(omp_get_max_threads(), 0UL);

    cerr << "-storing Kmers..." << endl;
    unsigned long start = time(NULL);

    Fasta_reader fasta_reader(fasta_filename);

    unsigned int entry_num = 0;

    #pragma omp parallel
    {
        // Declared inside the region, so each thread gets its own copy.
        const int myTid = omp_get_thread_num();

        while (fasta_reader.hasNext()) {
            Fasta_entry fe = fasta_reader.getNext();
            string accession = fe.get_accession();

            #pragma omp atomic
            entry_num++;
            record_counter[myTid]++;

            if (IRKE_COMMON::MONITOR >= 4) {
                // entry_num is read without synchronisation here; the value
                // shown may already include other threads' increments.
                cerr << "[" << entry_num << "] acc: " << accession << ", by thread no: " << myTid << endl;
            }
            else if (IRKE_COMMON::MONITOR) {
                if (myTid == 0 && record_counter[myTid] % 1000 == 0) {
                    // Progress only: other threads' slots are read without
                    // synchronisation, so the total is approximate.
                    unsigned long parsed = 0;
                    const int team_size = omp_get_num_threads();
                    for (int t = 0; t < team_size; t++)
                        parsed += record_counter[t];
                    cerr << "\r [" << parsed << "] sequences parsed.     ";
                }
            }

            string seq = fe.get_sequence();

            if (seq.length() < kmer_length + 1) {
                continue;
            }

            if (reassembleIworm) {
                string header = fe.get_header();

                // get coverage value from iworm assembly
                vector<string> tokens;
                string_util::tokenize(accession, tokens, ";");
                if (tokens.size() < 2) {
                    // Report the accession itself: indexing the last token
                    // here would be out of bounds when no tokens were found.
                    stringstream err;
                    err << "Could not extract coverage value from accession: " << accession;
                    throw(err.str());
                }
                string cov_s = tokens[tokens.size()-1];
                unsigned int cov_val = atoi(cov_s.c_str());

                // get Kmer value from header
                vector<string> header_toks;
                string_util::tokenize(header, header_toks, " ");
                if (header_toks.size() < 5) {
                    stringstream err;
                    err << "Fasta header: " << header << " lacks expected format including Kmer length from previous inchworm assembly run";
                    throw(err.str());
                }

                unsigned int kmer_val = atoi(header_toks[2].c_str());

                // Rescale coverage relative to k=25, rounding to nearest.
                unsigned int normalized_coverage_val = static_cast<unsigned int> (cov_val * kmer_val / 25.0 + 0.5);

                if (IRKE_COMMON::MONITOR >= 1) {
                    cerr << "Adding inchworm assembly " << accession
                         << " K: " << kmer_val << " Cov: " << cov_val
                         << " with coverage: " << normalized_coverage_val << endl;
                }
                if (cov_val < 1) {
                    stringstream err;
                    err << "error parsing coverage value from accession: " << accession;
                    throw(err.str());
                }
                kcounter.add_sequence(seq, normalized_coverage_val);
            }
            else {
                kcounter.add_sequence(seq);
            }

            // remove singleton kmers at read interval to minimize memory requirements.
            if (PRUNE_SINGLETON_READ_INTERVAL > 0
                &&
                myTid == 0
                &&
                record_counter[myTid]/omp_get_num_threads() % PRUNE_SINGLETON_READ_INTERVAL == 0) {
                if (IRKE_COMMON::MONITOR >= 1) {
                    // Print this thread's count, not the address of the counter storage.
                    cerr << "Reached singleton kmer pruning interval at read count: " << record_counter[myTid] << endl;
                }
                prune_kmers_min_count(1);
            }
        }
    }
    unsigned long end = time(NULL);

    unsigned long sum = 0;
    for (size_t t = 0; t < record_counter.size(); t++)
        sum += record_counter[t];

    cerr << endl << " done parsing " << sum << " sequences, extracted " << kcounter.size() << " kmers, taking " << (end-start) << " seconds." << endl;

    return;
}
예제 #8
0
/**
 * Command-line driver: build a KmerCounter and print k-mer coverage statistics
 * for every read in the --reads FASTA file.
 *
 * Required arguments are --reads plus one of --kmers / --kmers_from_reads.
 * Optional: --SS (single-stranded), --kmer_size (minimum 20), --monitor,
 * --num_threads (capped at MAX_THREADS), --capture_coverage_info.
 *
 * For each read with a non-empty sequence one tab-separated line is written
 * to stdout: median, mean, stdev, stdev as a percentage of the mean,
 * accession, "thread:<id>", and - with --capture_coverage_info - the
 * comma-separated per-k-mer coverage values. Reads are processed by an OpenMP
 * team sharing one Fasta_reader; only the write to stdout is serialised.
 *
 * Calls exit(1) on --help, missing required arguments, or a negative mean
 * coverage, and exit(2) when --kmer_size is below 20.
 *
 * @return 0 on normal completion.
 */
int runMe(int argc, char* argv[]) {

    ArgProcessor args(argc, argv);

    // --reads and at least one k-mer source are mandatory.
    if (args.isArgSet("--help")
        || ! args.isArgSet("--reads")
        || ! (args.isArgSet("--kmers") || args.isArgSet("--kmers_from_reads"))) {
        cerr << usage(args) << endl << endl;
        exit(1);
    }

    string reads_fasta_file = args.getStringVal("--reads");

    bool is_DS = ! args.isArgSet("--SS");

    if (args.isArgSet("--kmer_size")) {
        KMER_SIZE = args.getIntVal("--kmer_size");
        if (KMER_SIZE < 20) {
            cerr << "Error, min kmer size is 20";
            exit(2);
        }
    }

    if (args.isArgSet("--monitor")) {
        IRKE_COMMON::MONITOR = args.getIntVal("--monitor");
    }

    if (args.isArgSet("--num_threads")) {
        int requested_threads = args.getIntVal("--num_threads");
        if (requested_threads >= MAX_THREADS) {
            // set to max
            omp_set_num_threads(MAX_THREADS);
        }
        else {
            omp_set_num_threads(requested_threads);
        }
    }

    // Enforce the cap even when --num_threads was not given.
    if (omp_get_max_threads() > MAX_THREADS) {
        omp_set_num_threads(MAX_THREADS);
    }

    KmerCounter kcounter (KMER_SIZE, is_DS);

    if (! args.isArgSet("--kmers")) {
        string kmer_read_source_fasta_file = args.getStringVal("--kmers_from_reads");
        populate_kmer_counter_from_reads(kcounter, kmer_read_source_fasta_file);
    }
    else {
        string kmers_fasta_file = args.getStringVal("--kmers");
        populate_kmer_counter_from_kmers(kcounter, kmers_fasta_file);
    }

    Fasta_reader fasta_reader(reads_fasta_file);
    bool write_coverage_info = args.isArgSet("--capture_coverage_info");

    int start_time = time(NULL);

    #pragma omp parallel
    while (fasta_reader.hasNext()) {

        int myTid = omp_get_thread_num();

        Fasta_entry fe = fasta_reader.getNext();
        string sequence = fe.get_sequence();
        if (sequence.empty()) {
            continue;
        }

        string header = fe.get_header(); // fetched as before; not used below

        vector<unsigned int> kmer_coverage = compute_kmer_coverage(sequence, kcounter);
        unsigned int median_cov = median_coverage(kmer_coverage);
        float mean_cov = mean(kmer_coverage);
        float stdev = stDev(kmer_coverage);
        float pct_stdev_of_avg = stdev/mean_cov*100;

        // Assemble the whole line first so it can be written in one piece.
        stringstream line;
        line << median_cov << "\t";
        line << mean_cov << "\t";
        line << stdev << "\t";
        line << pct_stdev_of_avg << "\t";
        line << fe.get_accession();
        line << "\tthread:" << myTid;

        if (write_coverage_info) {
            // add the coverage info
            line << "\t";
            for (size_t pos = 0; pos < kmer_coverage.size(); pos++) {
                if (pos > 0) {
                    line << ",";
                }
                line << kmer_coverage[pos];
            }
        }
        line << endl;

        #pragma omp critical
        {
            cout << line.str();
        }

        if (mean_cov < 0) {
            cerr << "ERROR, cannot have negative coverage!!" << endl;
            exit(1);
        }
    }

    int end_time = time(NULL);

    cerr << "STATS_GENERATION_TIME: " << (end_time - start_time) << " seconds." << endl;

    return(0);
}
예제 #9
0
/**
 * Count the k-mers of every read in a FASTA file into `kcounter`.
 *
 * Reads shorter than the counter's k-mer length + 1 are skipped; every other
 * sequence is passed to KmerCounter::add_sequence(). A one-line summary
 * (reads parsed, k-mers stored, elapsed seconds) is written to stderr once
 * all reads have been consumed.
 *
 * The reads are consumed by an OpenMP team that shares one Fasta_reader.
 * NOTE(review): this relies on Fasta_reader and KmerCounter::add_sequence()
 * being safe to use concurrently - confirm.
 *
 * @param kcounter        Counter that receives the k-mers.
 * @param fasta_filename  Path handed to the Fasta_reader constructor.
 */
void populate_kmer_counter_from_reads (KmerCounter& kcounter, string& fasta_filename) {
    const unsigned int kmer_length = kcounter.get_kmer_length();

    // One zero-initialised slot per potential thread; the vector frees itself
    // on every exit path.
    vector<unsigned long> record_counter(omp_get_max_threads(), 0UL);

    cerr << "-storing Kmers..." << "\n";
    unsigned long start = time(NULL);

    Fasta_reader fasta_reader(fasta_filename);

    unsigned int entry_num = 0;

#pragma omp parallel
    {
        // Declared inside the region, so each thread gets its own copy.
        const int myTid = omp_get_thread_num();

        while (fasta_reader.hasNext()) {
            Fasta_entry fe = fasta_reader.getNext();
            string accession = fe.get_accession();

#pragma omp atomic
            entry_num++;
            record_counter[myTid]++;

            if (IRKE_COMMON::MONITOR >= 4) {
                // entry_num is read without synchronisation here; the value
                // shown may already include other threads' increments.
                cerr << "[" << entry_num << "] acc: " << accession << ", by thread no: " << myTid << "\n";
            }
            else if (IRKE_COMMON::MONITOR) {
                if (myTid == 0 && record_counter[myTid] % 1000 == 0) {
                    // Progress only: other threads' slots are read without
                    // synchronisation, so the total is approximate.
                    unsigned long parsed = 0;
                    const int team_size = omp_get_num_threads();
                    for (int t = 0; t < team_size; t++)
                        parsed += record_counter[t];
                    cerr << "\r [" << parsed << "] sequences parsed.     ";
                }
            }

            string seq = fe.get_sequence();

            // Use the counter's own k-mer length for the minimum-length check.
            if (seq.length() < kmer_length + 1) {
                continue;
            }
            kcounter.add_sequence(seq);
        }
    }

    // Summarise once, after the team has finished, from fully written counters.
    unsigned long end = time(NULL);

    unsigned long sum = 0;
    for (size_t t = 0; t < record_counter.size(); t++)
        sum += record_counter[t];

    cerr << "\n" << " done parsing " << sum << " sequences, extracted " << kcounter.size() << " kmers, taking " << (end-start) << " seconds." << "\n";

    return;
}
/**
 * Command-line driver: load pre-counted k-mers (--kmers) into a KmerCounter
 * and print k-mer coverage statistics for every read in --reads.
 *
 * Optional arguments: --SS (single-stranded), --kmer_size (minimum 20),
 * --monitor, --capture_coverage_info.
 *
 * For each read one tab-separated line is written to stdout: median, mean,
 * stdev, stdev as a percentage of the mean, accession, and - with
 * --capture_coverage_info - the comma-separated per-k-mer coverage values.
 * Processing stops at the first record whose sequence is empty.
 *
 * Calls exit(1) on --help, missing required arguments, or a negative mean
 * coverage, and exit(2) when --kmer_size is below 20.
 *
 * @return 0 on normal completion.
 */
int main(int argc, char* argv[]) {
    ArgProcessor args(argc, argv);

    // Both --reads and --kmers are mandatory.
    if (args.isArgSet("--help")
        || ! args.isArgSet("--reads")
        || ! args.isArgSet("--kmers")) {
        cerr << usage(args) << endl << endl;
        exit(1);
    }

    string reads_fasta_file = args.getStringVal("--reads");
    string kmers_fasta_file = args.getStringVal("--kmers");
    bool is_DS = ! args.isArgSet("--SS");

    if (args.isArgSet("--kmer_size")) {
        KMER_SIZE = args.getIntVal("--kmer_size");
        if (KMER_SIZE < 20) {
            cerr << "Error, min kmer size is 20";
            exit(2);
        }
    }

    if (args.isArgSet("--monitor")) {
        IRKE_COMMON::MONITOR = args.getIntVal("--monitor");
    }

    if (omp_get_max_threads() > MAX_THREADS) {
        omp_set_num_threads(MAX_THREADS);
    }

    KmerCounter kcounter (KMER_SIZE, is_DS);
    populate_kmer_counter(kcounter, kmers_fasta_file);

    Fasta_reader fasta_reader(reads_fasta_file);

    // Declared as before; neither pointer is used in this function.
    ofstream* filewriter = NULL;
    ofstream* covwriter = NULL;

    bool write_coverage_info = args.isArgSet("--capture_coverage_info");

    for (;;) {
        Fasta_entry fe = fasta_reader.getNext();
        string sequence = fe.get_sequence();
        if (sequence.empty()) {
            break; // empty sequence is treated as end of input
        }

        string header = fe.get_header(); // fetched as before; not used below

        vector<unsigned int> kmer_coverage = compute_kmer_coverage(sequence, kcounter);
        unsigned int median_cov = median_coverage(kmer_coverage);
        float mean_cov = mean(kmer_coverage);
        float stdev = stDev(kmer_coverage);
        float pct_stdev_of_avg = stdev/mean_cov*100;

        stringstream line;
        line << median_cov << "\t";
        line << mean_cov << "\t";
        line << stdev << "\t";
        line << pct_stdev_of_avg << "\t";
        line << fe.get_accession();

        if (write_coverage_info) {
            // add the coverage info
            line << "\t";
            for (size_t pos = 0; pos < kmer_coverage.size(); pos++) {
                if (pos > 0) {
                    line << ",";
                }
                line << kmer_coverage[pos];
            }
        }
        line << endl;
        cout << line.str();

        if (mean_cov < 0) {
            cerr << "ERROR, cannot have negative coverage!!" << endl;
            exit(1);
        }
    }

    return(0);
}
예제 #11
0
/**
 * Command-line driver: count the k-mers of every read in a FASTA file with a
 * Ktree and report the counts.
 *
 * Usage: <prog> file.fasta kmer_length [DS_mode]
 *
 * Reads shorter than kmer_length + 1 are skipped. K-mers for which
 * contains_non_gatc() is true are not counted. When the optional third
 * argument is present (any value), the reverse complement of each counted
 * k-mer is added as well. A progress line is written to stderr every 1000
 * reads.
 *
 * @return 0 on success; 1 when too few arguments are given or kmer_length is
 *         not a positive integer.
 */
int run (int argc, char* argv[]) {

    if (argc < 3) {
        cerr << "Usage: " << argv[0] << " file.fasta kmer_length [DS_mode]" << endl << endl;
        return(1);
    }

    string fasta_filename (argv[1]);

    // atoi() yields 0 for non-numeric input; a zero or negative length would
    // otherwise be converted to a meaningless unsigned k-mer size.
    const int kmer_length_arg = atoi(argv[2]);
    if (kmer_length_arg < 1) {
        cerr << "Error, kmer_length must be a positive integer, got: " << argv[2] << endl;
        return(1);
    }
    const unsigned int kmer_length = kmer_length_arg;

    // DS_mode is the optional THIRD positional argument, i.e. argv[3].
    // (argc >= 3 is always true once the usage check above has passed.)
    const bool DS_mode = (argc >= 4);

    Fasta_reader fasta_reader(fasta_filename);

    Ktree ktree;

    long read_counter = 0;

    while (fasta_reader.hasNext()) {

        read_counter++;
        if (read_counter % 1000 == 0) {
            cerr << "\rread[" << read_counter << "]   ";
        }

        Fasta_entry fe = fasta_reader.getNext();

        string sequence = fe.get_sequence();

        if (sequence.length() < kmer_length + 1) {
            continue;
        }

        // Safe from unsigned underflow: length > kmer_length is guaranteed above.
        const size_t last_start = sequence.length() - kmer_length;

        for (size_t i = 0; i <= last_start; i++) {

            string kmer = sequence.substr(i, kmer_length);

            if (contains_non_gatc(kmer)) {
                continue;
            }

            ktree.add_kmer(kmer);

            if (DS_mode) {
                kmer = revcomp(kmer);
                ktree.add_kmer(kmer);
            }
        }
    }

    ktree.report_kmer_counts();

    return(0);
}