// Builds a DeBruijnGraph from every record of every FASTA file listed.
//
// Each record's sequence is split on 'X'; every resulting region is added to
// the graph. When sStrand is false, the reverse complement of each region is
// added as well.
//
//   fasta_file_names  FASTA files to parse, processed in the order given
//   kmer_length       forwarded to the DeBruijnGraph constructor
//   component_val     accepted but not used by this function
//   sStrand           true: add each region only as read;
//                     false: also add its reverse complement
//
// Returns the populated graph.
DeBruijnGraph constructDeBruijnGraph (vector<string> fasta_file_names, int kmer_length, int component_val, bool sStrand) {

    DeBruijnGraph graph(kmer_length);

    for (vector<string>::iterator file_it = fasta_file_names.begin();
         file_it != fasta_file_names.end();
         ++file_it) {

        string fasta_filename = *file_it;

        if (IRKE_COMMON::MONITOR > 1) {
            cerr << "Parsing file: " << fasta_filename << endl;
        }

        Fasta_reader fasta_reader(fasta_filename);

        while (fasta_reader.hasNext()) {

            Fasta_entry entry = fasta_reader.getNext();
            string sequence = entry.get_sequence();

            // inchworm bundles concatenated with 'X' delimiters by Chrysalis
            vector<string> seq_regions;
            string_util::tokenize(sequence, seq_regions, "X");

            for (vector<string>::iterator region_it = seq_regions.begin();
                 region_it != seq_regions.end();
                 ++region_it) {

                string seq_region = *region_it;

                if (contains_non_gatc(seq_region)) {
                    seq_region = replace_nonGATC_chars_with_A(seq_region);
                }

                if (IRKE_COMMON::MONITOR > 2) {
                    cerr << "Adding sequence to graph: " << seq_region << endl;
                }
                graph.add_sequence(seq_region);

                // strand-specific mode: the forward orientation is all we need
                if (sStrand) {
                    continue;
                }

                string revseq = revcomp(seq_region);
                if (IRKE_COMMON::MONITOR > 2) {
                    cerr << "Adding sequence to graph: " << revseq << endl;
                }
                graph.add_sequence(revseq);
            }
        }
    }

    return graph;
}
map<string,string> Fasta_reader::retrieve_all_seqs_hash() { map<string,string> all_seqs_hash; while (this->hasNext()) { Fasta_entry f = this->getNext(); string acc = f.get_accession(); string seq = f.get_sequence(); all_seqs_hash[acc] = seq; } return(all_seqs_hash); }
// Loads pre-counted kmers from a FASTA file into kcounter, reading records
// from all OpenMP threads concurrently.
//
// For every record the sequence is taken as the kmer and the header is parsed
// with atoi() as its count. Records whose sequence length differs from
// KMER_SIZE are reported on stderr and skipped. A thread stops reading at the
// first record that has an empty sequence.
//
//   kcounter          receives each (kmer, count) pair through add_kmer()
//   kmers_fasta_file  path handed to Fasta_reader
void populate_kmer_counter(KmerCounter &kcounter, string &kmers_fasta_file) {
    // code largely copied from IRKE.cpp

    // One tally per potential thread. Kept in a vector (instead of new[]/delete[])
    // so the storage is released on every exit path, including when something
    // below throws before the function finishes.
    vector<unsigned long> record_counter(static_cast<vector<unsigned long>::size_type>(omp_get_max_threads()), 0UL);

    cerr << "-reading Kmer occurences..." << endl;
    unsigned long start = time(NULL);

    Fasta_reader fasta_reader(kmers_fasta_file);

    #pragma omp parallel
    {
        const int myTid = omp_get_thread_num();
        record_counter[myTid] = 0;

        while (true) {
            Fasta_entry fe = fasta_reader.getNext();
            if (fe.get_sequence() == "") break;

            record_counter[myTid]++;

            if (IRKE_COMMON::MONITOR) {
                // Only thread 0 reports progress. It reads the other threads'
                // tallies without synchronization, so the figure is approximate.
                if (myTid == 0 && record_counter[myTid] % 100000 == 0) {
                    unsigned long parsed = record_counter[0];
                    for (int t = 1; t < omp_get_num_threads(); t++) {
                        parsed += record_counter[t];
                    }
                    cerr << "\r [" << parsed / 1000000 << "M] Kmers parsed. ";
                }
            }

            string seq = fe.get_sequence();
            if (seq.length() != KMER_SIZE) {
                cerr << "ERROR: kmer " << seq << " is not of length: " << KMER_SIZE << endl;
                continue;
            }

            kmer_int_type_t kmer = kcounter.get_kmer_intval(seq);
            unsigned int count = atoi(fe.get_header().c_str());
            kcounter.add_kmer(kmer, count);
        }
    }

    unsigned long end = time(NULL);

    unsigned long sum = 0;
    for (vector<unsigned long>::size_type t = 0; t < record_counter.size(); t++) {
        sum += record_counter[t];
    }

    cerr << endl << " done parsing " << sum << " Kmers, " << kcounter.size()
         << " added, taking " << (end - start) << " seconds." << endl;

    return;
}
// For every record in the given FASTA file, writes to stdout a "// <accession>"
// line followed by whatever irke.thread_sequence_through_graph() returns for
// the record's sequence, then a blank line.
//
//   irke                object whose thread_sequence_through_graph() is queried
//   cds_fasta_filename  path handed to Fasta_reader
void thread_sequences_through_graph(IRKE& irke, string cds_fasta_filename) {

    for (Fasta_reader cds_reader(cds_fasta_filename); cds_reader.hasNext(); ) {

        Fasta_entry entry = cds_reader.getNext();

        string accession = entry.get_accession();
        string seq = entry.get_sequence();

        // header line first, then the threading result for this sequence
        cout << "// " << accession << endl;
        cout << irke.thread_sequence_through_graph(seq) << endl << endl;
    }
}
void examine_CDS_paths(IRKE &irke, string cds_fasta_filename, unsigned int min_cov, float min_connectivity, float min_entropy, bool WRITE_COVERAGE, string COVERAGE_OUTPUT_FILENAME) { Fasta_reader fasta_reader(cds_fasta_filename); ofstream coverage_writer; if (WRITE_COVERAGE) { coverage_writer.open(COVERAGE_OUTPUT_FILENAME.c_str()); if (!coverage_writer.is_open()) { throw (stacktrace() + "Error, cannot write to file: " + COVERAGE_OUTPUT_FILENAME); } } while (fasta_reader.hasNext()) { Fasta_entry fe = fasta_reader.getNext(); string accession = fe.get_accession(); string seq = fe.get_sequence(); vector<unsigned int> coverage_counter; cout << accession << "\tCov: " << min_cov << "\tCon: " << min_connectivity << "\tE: " << min_entropy; if (irke.sequence_path_exists(seq, min_cov, min_entropy, min_connectivity, coverage_counter)) { cout << "\tT" << endl; } else { cout << "\tF" << endl; } if (WRITE_COVERAGE) { coverage_writer << ">" << accession << endl; for (unsigned int i = 0; i < coverage_counter.size(); i++) { coverage_writer << coverage_counter[i]; if ((i + 1) % 30 == 0) { coverage_writer << endl; } else { coverage_writer << " "; } } coverage_writer << endl; } } if (WRITE_COVERAGE) { coverage_writer.close(); } return; }
void IRKE::populate_Kmers_from_kmers(const string& fasta_filename) { unsigned int kmer_length = kcounter.get_kmer_length(); int i, myTid; unsigned long sum, *record_counter = new unsigned long[omp_get_max_threads()]; unsigned long start, end; // init record counter for (int i = 0; i < omp_get_max_threads(); i++) { record_counter[i] = 0; } cerr << "-reading Kmer occurences..." << endl; start = time(NULL); Fasta_reader fasta_reader(fasta_filename); #pragma omp parallel private (myTid) { myTid = omp_get_thread_num(); record_counter[myTid] = 0; while (true) { Fasta_entry fe = fasta_reader.getNext(); if (fe.get_sequence() == "") break; record_counter[myTid]++; if (IRKE_COMMON::MONITOR) { if (myTid == 0 && record_counter[myTid] % 100000 == 0) { sum = record_counter[0]; for (i=1; i<omp_get_num_threads(); i++) sum+= record_counter[i]; cerr << "\r [" << sum/1000000 << "M] Kmers parsed. "; } } string seq = fe.get_sequence(); if (seq.length() != kmer_length) { continue; } kmer_int_type_t kmer = kcounter.get_kmer_intval(seq); unsigned int count = atoi(fe.get_header().c_str()); kcounter.add_kmer(kmer, count); } } end = time(NULL); sum = record_counter[0]; for (i=1; i<omp_get_max_threads(); i++) sum+= record_counter[i]; delete [] record_counter; cerr << endl << " done parsing " << sum << " Kmers, " << kcounter.size() << " added, taking " << (end-start) << " seconds." << endl; ofstream iworm_kmer_count_report_fh; iworm_kmer_count_report_fh.open("inchworm.kmer_count"); iworm_kmer_count_report_fh << kcounter.size() << endl; iworm_kmer_count_report_fh.close(); return; }
void IRKE::populate_Kmers_from_fasta(const string& fasta_filename, bool reassembleIworm) { unsigned int kmer_length = kcounter.get_kmer_length(); int i, myTid; unsigned long sum, *record_counter = new unsigned long[omp_get_max_threads()]; unsigned long start, end; // init record counter for (int i = 0; i < omp_get_max_threads(); i++) { record_counter[i] = 0; } cerr << "-storing Kmers..." << endl; start = time(NULL); Fasta_reader fasta_reader(fasta_filename); unsigned int entry_num = 0; #pragma omp parallel private (myTid) { myTid = omp_get_thread_num(); record_counter[myTid] = 0; while (fasta_reader.hasNext()) { Fasta_entry fe = fasta_reader.getNext(); string accession = fe.get_accession(); #pragma omp atomic entry_num++; record_counter[myTid]++; if (IRKE_COMMON::MONITOR >= 4) { cerr << "[" << entry_num << "] acc: " << accession << ", by thread no: " << myTid << endl;; } else if (IRKE_COMMON::MONITOR) { if (myTid == 0 && record_counter[myTid] % 1000 == 0) { sum = record_counter[0]; for (i=1; i<omp_get_num_threads(); i++) sum+= record_counter[i]; cerr << "\r [" << sum << "] sequences parsed. 
"; } } string seq = fe.get_sequence(); if (seq.length() < kmer_length + 1) { continue; } if (reassembleIworm) { string accession = fe.get_accession(); string header = fe.get_header(); // get coverage value from iworm assembly vector<string> tokens; string_util::tokenize(accession, tokens, ";"); if (tokens.size() < 2) { stringstream err; err << "Could not extract coverage value from accession: " << tokens[tokens.size()-1]; throw(err.str()); } string cov_s = tokens[tokens.size()-1]; unsigned int cov_val = atoi(cov_s.c_str()); // get Kmer value from header vector<string> header_toks; string_util::tokenize(header, header_toks, " "); if (header_toks.size() < 5) { stringstream err; err << "Fasta header: " << header << " lacks expected format including Kmer length from previous inchworm assembly run"; throw(err.str()); } unsigned int kmer_val = atoi(header_toks[2].c_str()); unsigned int normalized_coverage_val = static_cast<unsigned int> (cov_val * kmer_val / 25.0 + 0.5); if (IRKE_COMMON::MONITOR >= 1) { cerr << "Adding inchworm assembly " << accession << " K: " << kmer_val << " Cov: " << cov_val << " with coverage: " << normalized_coverage_val << endl; } if (cov_val < 1) { stringstream err; err << "error parsing coverage value from accession: " << accession; throw(err.str()); } kcounter.add_sequence(seq, normalized_coverage_val); } else { kcounter.add_sequence(seq); } // remove singleton kmers at read interval to minimize memory requirements. 
if (PRUNE_SINGLETON_READ_INTERVAL > 0 && myTid == 0 && record_counter[myTid]/omp_get_num_threads() % PRUNE_SINGLETON_READ_INTERVAL == 0) { if (IRKE_COMMON::MONITOR >= 1) { cerr << "Reached singleton kmer pruning interval at read count: " << record_counter << endl; } prune_kmers_min_count(1); } } } end = time(NULL); sum = record_counter[0]; for (i=1; i<omp_get_max_threads(); i++) sum+= record_counter[i]; delete [] record_counter; cerr << endl << " done parsing " << sum << " sequences, extracted " << kcounter.size() << " kmers, taking " << (end-start) << " seconds." << endl; return; }
int runMe(int argc, char* argv[]) { ArgProcessor args(argc, argv); if(args.isArgSet("--help") || (!(args.isArgSet("--reads") && ( args.isArgSet("--kmers") || args.isArgSet("--kmers_from_reads") ) )) ) { cerr << usage(args) << endl << endl; exit(1); } string reads_fasta_file = args.getStringVal("--reads"); bool is_DS = (! args.isArgSet("--SS")); if(args.isArgSet("--kmer_size")) { KMER_SIZE = args.getIntVal("--kmer_size"); if(KMER_SIZE < 20) { cerr << "Error, min kmer size is 20"; exit(2); } } if(args.isArgSet("--monitor")) { IRKE_COMMON::MONITOR = args.getIntVal("--monitor"); } if (args.isArgSet("--num_threads")) { int num_threads = args.getIntVal("--num_threads"); if (num_threads < MAX_THREADS) { omp_set_num_threads(num_threads); } else { // set to max omp_set_num_threads(MAX_THREADS); } } if(omp_get_max_threads() > MAX_THREADS) { omp_set_num_threads(MAX_THREADS); } KmerCounter kcounter (KMER_SIZE, is_DS); if (args.isArgSet("--kmers")) { string kmers_fasta_file = args.getStringVal("--kmers"); populate_kmer_counter_from_kmers(kcounter, kmers_fasta_file); } else { string kmer_read_source_fasta_file = args.getStringVal("--kmers_from_reads"); populate_kmer_counter_from_reads(kcounter, kmer_read_source_fasta_file); } Fasta_reader fasta_reader(reads_fasta_file); bool write_coverage_info = args.isArgSet("--capture_coverage_info"); int start_time = time(NULL); #pragma omp parallel while (true) { if (! 
fasta_reader.hasNext()) break; int myTid = omp_get_thread_num(); Fasta_entry fe = fasta_reader.getNext(); string sequence = fe.get_sequence(); if(sequence == "") continue; string header = fe.get_header(); vector<unsigned int> kmer_coverage = compute_kmer_coverage(sequence, kcounter); unsigned int median_cov = median_coverage(kmer_coverage); float mean_cov = mean(kmer_coverage); float stdev = stDev(kmer_coverage); float pct_stdev_of_avg = stdev/mean_cov*100; stringstream stats_text; stats_text << median_cov << "\t" << mean_cov << "\t" << stdev << "\t" << pct_stdev_of_avg << "\t" << fe.get_accession(); stats_text << "\tthread:" << myTid; if(write_coverage_info) { // add the coverage info stats_text << "\t"; for (size_t i = 0; i < kmer_coverage.size(); i++) { stats_text<< kmer_coverage[i]; if(i != kmer_coverage.size() - 1) { stats_text<< ","; } } } stats_text << endl; #pragma omp critical { cout << stats_text.str(); } if (mean_cov < 0) { cerr << "ERROR, cannot have negative coverage!!" << endl; exit(1); } } int end_time = time(NULL); cerr << "STATS_GENERATION_TIME: " << (end_time - start_time) << " seconds." << endl; return(0); }
// Extracts kmers from every sequence of a FASTA file into kcounter, reading
// records from all OpenMP threads concurrently.
//
// Sequences shorter than kcounter.get_kmer_length() + 1 are skipped. After
// all threads have finished, a single summary line (records seen, kmers
// stored, elapsed seconds) is written to stderr.
//
//   kcounter        receives each sequence through add_sequence()
//   fasta_filename  path handed to Fasta_reader
void populate_kmer_counter_from_reads (KmerCounter& kcounter, string& fasta_filename) {

    unsigned int kmer_length = kcounter.get_kmer_length();

    // One tally per potential thread. A vector is used so the storage is
    // always released; the previous new[] allocation was never freed.
    vector<unsigned long> record_counter(static_cast<vector<unsigned long>::size_type>(omp_get_max_threads()), 0UL);

    cerr << "-storing Kmers..." << "\n";
    unsigned long start = time(NULL);

    Fasta_reader fasta_reader(fasta_filename);

    unsigned int entry_num = 0;

    #pragma omp parallel
    {
        const int myTid = omp_get_thread_num();
        record_counter[myTid] = 0;

        while (fasta_reader.hasNext()) {

            Fasta_entry fe = fasta_reader.getNext();
            string accession = fe.get_accession();

            #pragma omp atomic
            entry_num++;

            record_counter[myTid]++;

            if (IRKE_COMMON::MONITOR >= 4) {
                cerr << "[" << entry_num << "] acc: " << accession << ", by thread no: " << myTid << "\n";
            }
            else if (IRKE_COMMON::MONITOR) {
                // Only thread 0 reports progress. It reads the other threads'
                // tallies without synchronization, so the figure is approximate.
                if (myTid == 0 && record_counter[myTid] % 1000 == 0) {
                    unsigned long parsed = record_counter[0];
                    for (int t = 1; t < omp_get_num_threads(); t++) {
                        parsed += record_counter[t];
                    }
                    cerr << "\r [" << parsed << "] sequences parsed. ";
                }
            }

            string seq = fe.get_sequence();

            // use the counter's own kmer length rather than a global
            if (seq.length() < kmer_length + 1) {
                continue;
            }

            kcounter.add_sequence(seq);
        }
    }

    // The summary is produced once, after the parallel region, from values that
    // are all initialized: the elapsed time and the total over every thread.
    unsigned long end = time(NULL);

    unsigned long sum = 0;
    for (vector<unsigned long>::size_type t = 0; t < record_counter.size(); t++) {
        sum += record_counter[t];
    }

    cerr << "\n" << " done parsing " << sum << " sequences, extracted " << kcounter.size()
         << " kmers, taking " << (end-start) << " seconds." << "\n";

    return;
}
int main(int argc, char* argv[]) { ArgProcessor args(argc, argv); if(args.isArgSet("--help") || (!(args.isArgSet("--reads") && args.isArgSet("--kmers")))) { cerr << usage(args) << endl << endl; exit(1); } string reads_fasta_file = args.getStringVal("--reads"); string kmers_fasta_file = args.getStringVal("--kmers"); bool is_DS = (! args.isArgSet("--SS")); if(args.isArgSet("--kmer_size")) { KMER_SIZE = args.getIntVal("--kmer_size"); if(KMER_SIZE < 20) { cerr << "Error, min kmer size is 20"; exit(2); } } if(args.isArgSet("--monitor")) { IRKE_COMMON::MONITOR = args.getIntVal("--monitor"); } if(omp_get_max_threads() > MAX_THREADS) { omp_set_num_threads(MAX_THREADS); } KmerCounter kcounter (KMER_SIZE, is_DS); populate_kmer_counter(kcounter, kmers_fasta_file); Fasta_reader fasta_reader(reads_fasta_file); ofstream* filewriter = NULL; ofstream* covwriter = NULL; bool write_coverage_info = args.isArgSet("--capture_coverage_info"); while (true) { Fasta_entry fe = fasta_reader.getNext(); string sequence = fe.get_sequence(); if(sequence == "") break; string header = fe.get_header(); vector<unsigned int> kmer_coverage = compute_kmer_coverage(sequence, kcounter); unsigned int median_cov = median_coverage(kmer_coverage); float mean_cov = mean(kmer_coverage); float stdev = stDev(kmer_coverage); float pct_stdev_of_avg = stdev/mean_cov*100; stringstream stats_text; stats_text << median_cov << "\t" << mean_cov << "\t" << stdev << "\t" << pct_stdev_of_avg << "\t" << fe.get_accession(); if(write_coverage_info) { // add the coverage info stats_text << "\t"; for (int i = 0; i < kmer_coverage.size(); i++) { stats_text<< kmer_coverage[i]; if(i != kmer_coverage.size() - 1) { stats_text<< ","; } } } stats_text << endl; cout << stats_text.str(); if (mean_cov < 0) { cerr << "ERROR, cannot have negative coverage!!" << endl; exit(1); } } return(0); }
int run (int argc, char* argv[]) { if (argc < 3) { stringstream s; s << "Usage: " << argv[0] << " file.fasta kmer_length [DS_mode]" << endl << endl; cerr << s.str(); return(1); } string fasta_filename (argv[1]); unsigned int kmer_length = atoi(argv[2]); bool DS_mode = (argc >= 3) ? true : false; Fasta_reader fasta_reader(fasta_filename); Ktree ktree; long read_counter = 0; while (fasta_reader.hasNext()) { read_counter++; if (read_counter % 1000 == 0) { cerr << "\rread[" << read_counter << "] "; } Fasta_entry fe = fasta_reader.getNext(); string accession = fe.get_accession(); string sequence = fe.get_sequence(); // cerr << "Processing: " << sequence << endl; if (sequence.length() < kmer_length + 1) { continue; } for (unsigned int i = 0; i <= sequence.length() - kmer_length; i++) { string kmer = sequence.substr(i, kmer_length); if (! contains_non_gatc(kmer)) { ktree.add_kmer(kmer); if (DS_mode) { kmer = revcomp(kmer); ktree.add_kmer(kmer); } } } } ktree.report_kmer_counts(); return(0); }