//////////////////////////////////////////////////////////// // learn_errors // // Correct reads using a much stricter filter in order // to count the nt->nt errors and learn the errors // probabilities //////////////////////////////////////////////////////////// //static void learn_errors(string fqf, bithash * trusted, vector<streampos> & starts, vector<unsigned long long> & counts, double (&ntnt_prob)[4][4], double prior_prob[4]) { static void learn_errors(string fqf, bithash * trusted, vector<streampos> & starts, vector<unsigned long long> & counts, double ntnt_prob[Read::max_qual][4][4], double prior_prob[4]) { unsigned int ntnt_counts[Read::max_qual][4][4] = {0}; unsigned int samples = 0; unsigned int chunk = 0; #pragma omp parallel //shared(trusted) { unsigned int tchunk; string header,ntseq,strqual,corseq; int trim_length; char* nti; Read *r; ifstream reads_in(fqf.c_str()); while(chunk < threads*chunks_per_thread) { #pragma omp critical tchunk = chunk++; reads_in.seekg(starts[tchunk]); unsigned long long tcount = 0; while(getline(reads_in, header)) { //cout << header << endl; // get sequence getline(reads_in, ntseq); //cout << ntseq << endl; // convert ntseq to iseq vector<unsigned int> iseq; for(int i = 0; i < ntseq.size(); i++) { nti = strchr(nts, ntseq[i]); iseq.push_back(nti - nts); } // get quality values getline(reads_in,strqual); //cout << strqual << endl; getline(reads_in,strqual); //cout << strqual << endl; vector<int> untrusted; if(iseq.size() < trim_t) trim_length = 0; else { for(int i = 0; i < iseq.size()-k+1; i++) { if(!trusted->check(&iseq[i])) { untrusted.push_back(i); } } trim_length = quick_trim(strqual, untrusted); } // fix error reads if(untrusted.size() > 0) { // correct r = new Read(header, &iseq[0], strqual, untrusted, trim_length); corseq = r->correct(trusted, ntnt_prob, prior_prob, true); // if trimmed to long enough if(corseq.size() >= trim_t) { if(r->trusted_read != 0) { // else no guarantee there was a correction for(int c = 0; c < r->trusted_read->corrections.size(); c++) { correction cor = r->trusted_read->corrections[c]; if(iseq[cor.index] < 4) { // P(obs=o|actual=a,a!=o) for Bayes ntnt_counts[strqual[cor.index]-Read::quality_scale][cor.to][iseq[cor.index]]++; // P(actual=a|obs=o,a!=o) //ntnt_counts[iseq[cor.index]][cor.to]++; samples++; } } } } delete r; } if(++tcount == counts[tchunk] || samples > 200000) break; } } reads_in.close(); } regress_probs(ntnt_prob, ntnt_counts); output_model(ntnt_prob, ntnt_counts, fqf); }
//////////////////////////////////////////////////////////////////////////////// // correct_reads // // Correct the reads in the file 'fqf' using the data structure of trusted // kmers 'trusted', matrix of nt->nt error rates 'ntnt_prob' and prior nt // probabilities 'prior_prob'. 'starts' and 'counts' help openMP parallelize // the read processing. If 'pairedend_code' is 0, the reads are not paired; // if it's 1, this file is the first of a pair so print all reads and withold // combining; if it's 2, the file is the second of a pair so print all reads // and then combine both 1 and 2. //////////////////////////////////////////////////////////////////////////////// static void correct_reads(string fqf, int pe_code, bithash * trusted, vector<streampos> & starts, vector<unsigned long long> & counts, double ntnt_prob[Read::max_qual][4][4], double prior_prob[4]) { // output directory struct stat st_file_info; string path_suffix = split(fqf,'/').back(); string out_dir("."+path_suffix); if(stat(out_dir.c_str(), &st_file_info) == 0) { cerr << "Hidden temporary directory " << out_dir << " already exists and will be used" << endl; } else { if(mkdir(out_dir.c_str(), S_IRWXU) == -1) { cerr << "Failed to create hidden temporary directory " << out_dir << endl; exit(EXIT_FAILURE); } } // collect stats stats * thread_stats = new stats[omp_get_max_threads()]; unsigned int chunk = 0; #pragma omp parallel //shared(trusted) { int tid = omp_get_thread_num(); // input ifstream reads_in(fqf.c_str()); unsigned int tchunk; string header,ntseq,mid,strqual,corseq; int trim_length; char* nti; Read *r; #pragma omp critical tchunk = chunk++; while(tchunk < starts.size()) { reads_in.seekg(starts[tchunk]); // output string toutf(out_dir+"/"); stringstream tconvert; tconvert << tchunk; toutf += tconvert.str(); if(overwrite_temp || stat(toutf.c_str(), &st_file_info) == -1) { ofstream reads_out(toutf.c_str()); //cout << toutf << endl; // output log string tlogf = toutf + ".log"; ofstream corlog_out; if(out_log) { corlog_out.open(tlogf.c_str()); } unsigned long long tcount = 0; while(getline(reads_in, header)) { //cout << tid << " " << header << endl; // get sequence getline(reads_in, ntseq); //cout << ntseq << endl; // convert ntseq to iseq vector<unsigned int> iseq; for(int i = 0; i < ntseq.size(); i++) { nti = strchr(nts, ntseq[i]); iseq.push_back(nti - nts); } // get quality values getline(reads_in,mid); //cout << mid << endl; getline(reads_in,strqual); //cout << strqual << endl; vector<int> untrusted; if(iseq.size() < trim_t) trim_length = 0; else { for(int i = 0; i < iseq.size()-k+1; i++) { if(!trusted->check(&iseq[i])) { untrusted.push_back(i); } } trim_length = quick_trim(strqual, untrusted); //trim_length = iseq.size(); } // fix error reads if(untrusted.size() > 0) { r = new Read(header, &iseq[0], strqual, untrusted, trim_length); corseq = r->correct(trusted, ntnt_prob, prior_prob); // output read w/ trim and corrections output_read(reads_out, corlog_out, pe_code, header, ntseq, mid, strqual, corseq, thread_stats[tid]); delete r; } else { output_read(reads_out, corlog_out, pe_code, header, ntseq, mid, strqual, ntseq.substr(0,trim_length), thread_stats[tid]); // output read as trimmed /* if(contrail_out) reads_out << header << "\t" << ntseq.substr(0,trim_length) << endl; else reads_out << header << endl << ntseq.substr(0,trim_length) << endl << mid << endl << strqual.substr(0,trim_length) << endl; */ } if(++tcount == counts[tchunk]) break; } reads_out.close(); } #pragma omp critical tchunk = chunk++; } reads_in.close(); } // combine stats for(int i = 1; i < omp_get_max_threads(); i++) { thread_stats[0].validated += thread_stats[i].validated; thread_stats[0].corrected += thread_stats[i].corrected; thread_stats[0].trimmed += thread_stats[i].trimmed; thread_stats[0].trimmed_only += thread_stats[i].trimmed_only; thread_stats[0].removed += thread_stats[i].removed; } // print stats int suffix_index = fqf.rfind("."); string outf; if(suffix_index == -1) { outf = fqf+".stats.txt"; } else { outf = fqf.substr(0,suffix_index+1) + "stats.txt"; } ofstream stats_out(outf.c_str()); stats_out << "Validated: " << thread_stats[0].validated << endl; stats_out << "Corrected: " << thread_stats[0].corrected << endl; stats_out << "Trimmed: " << thread_stats[0].trimmed << endl; stats_out << "Trimmed only: " << thread_stats[0].trimmed_only << endl; stats_out << "Removed: " << thread_stats[0].removed << endl; stats_out.close(); }