/**
 * Function get_seeds()
 *
 * Produce seeds: rID -> (s0, s1, s2...) for each fragment given two
 * fq files f and f2.
 *
 * The two files are consumed in rounds. In every round batch/2 reads are
 * requested from each file, the collected reads are handed to
 * generate_seeds() together with list_seeds and seed_len, and the buffer
 * is emptied again. Processing stops as soon as either input iterator
 * compares equal to the end iterator.
 *
 * Unless `silent` is set, the number of fragments seen (half the number
 * of reads collected) is printed to stdout.
 */
void get_seeds (ii64vec_t& list_seeds, const std::string& f,
        const std::string& f2, int seed_len, int batch, bool silent) {

    std::ifstream in1, in2;
    xny::openfile<std::ifstream>(in1, f);
    xny::openfile<std::ifstream>(in2, f2);

    bio::fastq_input_iterator<> first(in1), last, second(in2);

    // Number of reads requested from each file per round.
    const int per_file = batch / 2;

    int frag_count = 0;
    strvec_t chunk;

    // The buffer is cleared in the loop step so that every round starts
    // with an empty chunk.
    for (; first != last && second != last; chunk.clear()) {
        add_fq_reads_only (chunk, per_file, first, last);
        add_fq_reads_only (chunk, per_file, second, last);

        generate_seeds (list_seeds, chunk, seed_len);

        // Two reads (one from each file) make up one fragment.
        frag_count += chunk.size() / 2;
    }

    if (!silent) {
        std::cout << "\t\ttotal frags: " << frag_count << "\n";
    }

    xny::closefile(in1);
    xny::closefile(in2);
} // get_seeds
/** Function get_super_sketches * */ void get_super_sketches (std::vector<sketch_t>& super_sketches, const std::string& f, const std::string& f2, xny::sketch_list& slistgen, xny::super_sketch& ssgen, jaz::murmur264& hashfunc, int batch, bool silent) { std::ifstream fh, fh2; xny::openfile<std::ifstream>(fh, f); xny::openfile<std::ifstream>(fh2, f2); bio::fastq_input_iterator<> fq(fh), end, fq2(fh2); int total_read_pairs = 0; strvec_t pairs; while (fq != end && fq2 != end) { add_fq_reads_only (pairs, batch/2, fq, end); add_fq_reads_only (pairs, batch/2, fq2, end); generate_super_sketches (super_sketches, pairs, slistgen, ssgen, hashfunc); total_read_pairs += pairs.size()/2; pairs.clear(); } // while if (!silent) { std::cout << "\t\t\ttotal pairs, super_sketches: " << total_read_pairs << ", " << super_sketches.size() << "\n"; } xny::closefile(fh); xny::closefile(fh2); } // get_super_sketches
/** Function clean_dupl_frag () * */ void clean_dupl_frag (const std::string& ifq, const std::string& ifq2, std::ofstream& ofhfq, std::ofstream& ofhfq2, const iset_t& duplIDs, xny::low_complexity& lc, int batch){ std::ifstream ifhfq, ifhfq2; xny::openfile<std::ifstream>(ifhfq, ifq); xny::openfile<std::ifstream>(ifhfq2, ifq2); bio::fastq_input_iterator<> fq(ifhfq), end, fq2(ifhfq2); int total_read_pairs = 0; std::vector<fqtuple_t> pairs; int num_lc = 0; int fragID = 0; while (fq != end && fq2 != end) { add_fq_reads (pairs, batch/2, fq, end); add_fq_reads (pairs, batch/2, fq2, end); iset_t low_complex_frag; check_low_complexity (low_complex_frag, fragID, pairs, lc); num_lc += low_complex_frag.size(); int fragnum = pairs.size()/2; for (int i = 0; i < fragnum; ++ i) { if ( (!duplIDs.count(fragID)) && (!low_complex_frag.count(fragID))) { // output ofhfq << "@" << std::get<0>(pairs[i]) << "\n"; ofhfq << std::get<1>(pairs[i]) << "\n"; ofhfq << "+\n"; ofhfq << std::get<2>(pairs[i]) << "\n"; ofhfq2 << "@" << std::get<0>(pairs[i + fragnum]) << "\n"; ofhfq2 << std::get<1>(pairs[i + fragnum]) << "\n"; ofhfq2 << "+\n"; ofhfq2 << std::get<2>(pairs[i + fragnum]) << "\n"; } ++ fragID; } total_read_pairs += pairs.size()/2; pairs.clear(); } // while std::cout << "\t\tlow complexity fragments: " << num_lc << "\n\n"; xny::closefile(ifhfq); xny::closefile(ifhfq2); } // clean_dupl_frag
void BamToFastq::PairedFastq() { // open the 1st fastq file for writing ofstream fq1(_fastq1.c_str(), ios::out); if ( !fq1 ) { cerr << "Error: The first fastq file (" << _fastq1 << ") could not be opened. Exiting!" << endl; exit (1); } // open the 2nd fastq file for writing ofstream fq2(_fastq2.c_str(), ios::out); if ( !fq2 ) { cerr << "Error: The second fastq file (" << _fastq2 << ") could not be opened. Exiting!" << endl; exit (1); } // open the BAM file BamReader reader; reader.Open(_bamFile); // rip through the BAM file and convert each mapped entry to BEDPE BamAlignment bam1, bam2; bool shouldConsumeReads = true; while (true) { if (shouldConsumeReads) { if (!reader.GetNextAlignment(bam1) || !reader.GetNextAlignment(bam2)) break; } else { shouldConsumeReads = true; } if (bam1.Name != bam2.Name) { while (bam1.Name != bam2.Name) { if (bam1.IsPaired()) { cerr << "*****WARNING: Query " << bam1.Name << " is marked as paired, but its mate does not occur" << " next to it in your BAM file. Skipping. " << endl; } bam1 = bam2; if (!reader.GetNextAlignment(bam2)) break; shouldConsumeReads = false; } } else if (bam1.IsPaired() && bam2.IsPaired()) { // extract the sequence and qualities for the BAM "query" string seq1 = bam1.QueryBases; string qual1 = bam1.Qualities; string seq2 = bam2.QueryBases; string qual2 = bam2.Qualities; if (bam1.IsReverseStrand() == true) { reverseComplement(seq1); reverseSequence(qual1); } if (bam2.IsReverseStrand() == true) { reverseComplement(seq2); reverseSequence(qual2); } fq1 << "@" << bam1.Name << "/1" << endl; fq1 << seq1 << endl; fq1 << "+" << endl; fq1 << qual1 << endl; fq2 << "@" << bam2.Name << "/2" << endl; fq2 << seq2 << endl; fq2 << "+" << endl; fq2 << qual2 << endl; } } reader.Close(); }
void BamToFastq::PairedFastqUseTags() { // open the 1st fastq file for writing ofstream fq1(_fastq1.c_str(), ios::out); if ( !fq1 ) { cerr << "Error: The first fastq file (" << _fastq1 << ") could not be opened. Exiting!" << endl; exit (1); } // open the 2nd fastq file for writing ofstream fq2(_fastq2.c_str(), ios::out); if ( !fq2 ) { cerr << "Error: The second fastq file (" << _fastq2 << ") could not be opened. Exiting!" << endl; exit (1); } // open the BAM file BamReader reader; reader.Open(_bamFile); // rip through the BAM file and convert each mapped entry to BEDPE BamAlignment bam1, bam2; while (reader.GetNextAlignment(bam1)) { reader.GetNextAlignment(bam2); if (bam1.Name != bam2.Name) { while (bam1.Name != bam2.Name) { if (bam1.IsPaired()) { cerr << "*****WARNING: Query " << bam1.Name << " is marked as paired, but it's mate does not occur" << " next to it in your BAM file. Skipping. " << endl; } bam1 = bam2; reader.GetNextAlignment(bam2); } } else if (bam1.IsPaired() && bam2.IsPaired()) { // assume the R2 and Q2 tags are on the + strand. string mateSequence, mateQualities; bam1.GetTag("R2", mateSequence); bam1.GetTag("Q2", mateQualities); string seq1 = bam1.QueryBases; string qual1 = bam1.Qualities; if (bam1.IsReverseStrand() == true) { reverseComplement(seq1); reverseSequence(qual1); } // since the info for both ends are contained in each BAM record, // we only need to process one of the two records (bam1) in order // to produce FASTQ entries for both ends. 
// NOTE: Assumes that R2 and Q2 have already been rev // and revcomped if necessary if (bam1.IsFirstMate() == true) { // end1 fq1 << "@" << bam1.Name << "/1" << endl; fq1 << seq1 << endl; fq1 << "+" << endl; fq1 << qual1 << endl; // end2 fq2 << "@" << bam1.Name << "/2" <<endl; fq2 << mateSequence << endl; fq2 << "+" << endl; fq2 << mateQualities << endl; } else { // end 2 fq2 << "@" << bam1.Name << "/2" <<endl; fq2 << seq1 << endl; fq2 << "+" << endl; fq2 << qual1 << endl; // end 1 fq1 << "@" << bam1.Name << "/1" <<endl; fq1 << mateSequence << endl; fq1 << "+" << endl; fq1 << mateQualities << endl; } } } reader.Close(); }