// close the BAM files void BamMultiReader::Close(void) { // close all BAM readers and clean up pointers vector<pair<BamReader*, BamAlignment*> >::iterator readerIter = readers.begin(); vector<pair<BamReader*, BamAlignment*> >::iterator readerEnd = readers.end(); for ( ; readerIter != readerEnd; ++readerIter) { BamReader* reader = (*readerIter).first; BamAlignment* alignment = (*readerIter).second; // close the reader if ( reader) reader->Close(); // delete reader pointer delete reader; reader = 0; // delete alignment pointer delete alignment; alignment = 0; } // clear out the container readers.clear(); }
/*
    Convert each paired BAM entry to BEDPE and intersect it with the "B"
    BED file.  The input BAM must be sorted/grouped by query name so the
    two ends of a pair sit next to each other.
*/
void BedIntersectPE::IntersectBamPE(string bamFile) {

    // load the "B" bed file into a map so
    // that we can easily compare "A" to it for overlaps
    _bedB->loadBedFileIntoMap();

    // open the BAM file
    BamReader reader;
    BamWriter writer;
    reader.Open(bamFile);

    // get header & reference information
    string bamHeader = reader.GetHeaderText();
    RefVector refs   = reader.GetReferenceData();

    // open a BAM output to stdout if we are writing BAM
    if (_bamOutput == true) {
        // set compression mode
        BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
        if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed;
        writer.SetCompressionMode(compressionMode);
        // open our BAM writer
        writer.Open("stdout", bamHeader, refs);
    }

    // track the previous and current sequence
    // names so that we can identify blocks of
    // alignments for a given read ID.
    string prevName, currName;
    prevName = currName = "";

    vector<BamAlignment> alignments;    // vector of BAM alignments for a given ID in a BAM file.
    alignments.reserve(100);

    _bedA->bedType = 10;                // it's a full BEDPE given it's BAM

    // rip through the BAM file and convert each mapped entry to BEDPE
    BamAlignment bam1, bam2;
    while (reader.GetNextAlignment(bam1)) {
        // the alignment must be paired
        if (bam1.IsPaired() == true) {
            // grab the second alignment for the pair.
            // fix: the return value was previously ignored, so a file
            // ending mid-pair silently reused stale data in bam2.
            if (!reader.GetNextAlignment(bam2)) {
                cerr << "*****ERROR: -bedpe requires BAM to be sorted or grouped by query name. " << endl;
                exit(1);
            }
            // require that the alignments are from the same query
            if (bam1.Name == bam2.Name) {
                ProcessBamBlock(bam1, bam2, refs, writer);
            }
            else {
                cerr << "*****ERROR: -bedpe requires BAM to be sorted or grouped by query name. " << endl;
                exit(1);
            }
        }
    }
    // close up
    reader.Close();
    if (_bamOutput == true) {
        writer.Close();
    }
}
/* Description: Load all the bam into memory at one time if no parameters set, otherwise load the needed part of the bam. Save the parsed info into vector. */ bool BamParse::parseAlignment(int chrom1, int chrom1_begin, int chrom2, int chrom2_end) { BamReader reader; if ( !reader.Open(filename) ) { cerr << "Bamtools ERROR: could not open input BAM file: " << filename << endl; return false; } //check whether need to set a region. if(chrom1>-1 && chrom1_begin>-1 && chrom2>-1 && chrom2_end>-1) { this->loadIndex(reader); BamRegion br(chrom1,chrom1_begin,chrom2,chrom2_end); bool is_set=reader.SetRegion(br); if(is_set==false) { return false;//cannot set the region. } } //process input data BamAlignment al; while ( reader.GetNextAlignment(al) ) { if(al.Position<0) continue; BamAlignmentRecord* bar=new BamAlignmentRecord(); setAlignmentRecord(al,bar); bam_aln_records.push_back(bar); } reader.Close(); return true; }
int main (int argc, char *argv[]) { string bamfiletopen = string(argv[1]); BamReader reader; if ( !reader.Open(bamfiletopen) ) { cerr << "Could not open input BAM files." << endl; return 1; } BamAlignment al; while ( reader.GetNextAlignment(al) ) { string reconstructedReference = reconstructRef(&al); cout<<al.QueryBases<<endl; cout<<reconstructedReference<<endl; pair< string, vector<int> > reconP = reconstructRefWithPos(&al); for(unsigned int i=0;i<reconP.first.size();i++){ cout<<reconP.first[i]<<"\t"<<reconP.second[i]<<endl; } } reader.Close(); return 0; }
bool RevertTool::RevertToolPrivate::Run(void) { // opens the BAM file without checking for indexes BamReader reader; if ( !reader.Open(m_settings->InputFilename) ) { cerr << "Could not open input BAM file... quitting." << endl; return false; } // get BAM file metadata const string& headerText = reader.GetHeaderText(); const RefVector& references = reader.GetReferenceData(); // open writer BamWriter writer; bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() && !m_settings->IsForceCompression ); if ( !writer.Open(m_settings->OutputFilename, headerText, references, writeUncompressed) ) { cerr << "Could not open " << m_settings->OutputFilename << " for writing." << endl; return false; } // plow through file, reverting alignments BamAlignment al; while ( reader.GetNextAlignment(al) ) { RevertAlignment(al); writer.SaveAlignment(al); } // clean and exit reader.Close(); writer.Close(); return true; }
/**
 * Main work method. Reads the BAM file once and collects sorted information about
 * the 5' ends of both ends of each read (or just one end in the case of pairs).
 * Then makes a pass through those determining duplicates before re-reading the
 * input file and writing it out with duplication flags set correctly.
 *
 * Returns 0; the duplicate-marked records go to putOutputAlignment().
 */
int MarkDuplicates::runInternal() {
    ogeNameThread("am_MarkDuplicates");

    if(verbose)
        cerr << "Reading input file and constructing read end information." << endl;

    // pass 1: collect read-end info, then decide which file offsets are duplicates
    buildSortedReadEndLists();
    generateDuplicateIndexes();

    if(verbose)
        cerr << "Marking " << numDuplicateIndices << " records as duplicates." << endl;

    // pass 2: re-read the buffered copy of the input
    BamReader in;
    in.Open(getBufferFileName());

    // Now copy over the file while marking all the necessary indexes as duplicates
    long recordInFileIndex = 0;
    long written = 0;
    while (true) {
        // NOTE(review): this GetNextAlignment() variant returns a heap pointer;
        // presumably putOutputAlignment() takes ownership. On the
        // remove-duplicates branch below prec appears to be dropped without a
        // delete — confirm whether that leaks.
        BamAlignment * prec = in.GetNextAlignment();
        if(!prec)
            break;   // end of file

        // only primary alignments carry the duplicate flag decision
        if (prec->IsPrimaryAlignment()) {
            if (duplicateIndexes.count(recordInFileIndex) == 1)
                prec->SetIsDuplicate(true);
            else
                prec->SetIsDuplicate(false);
        }
        recordInFileIndex++;

        if (removeDuplicates && prec->IsDuplicate()) {
            // do nothing — duplicate record is being removed from the output
        } else {
            putOutputAlignment(prec);
            // progress line every 100k written records (carriage return, no newline)
            if (verbose && read_count && ++written % 100000 == 0) {
                cerr << "\rWritten " << written << " records (" << written * 100 / read_count <<"%)." << std::flush;
            }
        }
    }
    if (verbose && read_count)
        cerr << "\rWritten " << written << " records (" << written * 100 / read_count <<"%)." << endl;

    in.Close();
    // the intermediate buffer file is no longer needed
    remove(getBufferFileName().c_str());
    return 0;
}
// Compute coverage of BAM alignments ("A") over the "B" BED file and
// report either per-feature counts or a coverage histogram.
void BedCoverage::CollectCoverageBam(string bamFile) {

    // load the "B" bed file into a map so
    // that we can easily compare "A" to it for overlaps
    _bedB->loadBedCovFileIntoMap();

    // open the BAM file
    // fix: the Open() result was previously ignored, so a bad path
    // silently produced an empty coverage report
    BamReader reader;
    if (!reader.Open(bamFile)) {
        cerr << "Could not open input BAM file: " << bamFile << endl;
        exit(1);
    }

    // get header & reference information
    string header  = reader.GetHeaderText();
    RefVector refs = reader.GetReferenceData();

    // convert each aligned BAM entry to BED
    // and compute coverage on B
    BamAlignment bam;
    while (reader.GetNextAlignment(bam)) {
        if (bam.IsMapped()) {
            // treat the BAM alignment as a single "block"
            if (_obeySplits == false) {
                // construct a new BED entry from the current BAM alignment.
                BED a;
                a.chrom  = refs.at(bam.RefID).RefName;
                a.start  = bam.Position;
                a.end    = bam.GetEndPosition(false, false);
                a.strand = "+";
                if (bam.IsReverseStrand()) a.strand = "-";

                _bedB->countHits(a, _sameStrand, _diffStrand, _countsOnly);
            }
            // split the BAM alignment into discrete blocks and
            // look for overlaps only within each block.
            else {
                // vec to store the discrete BED "blocks" from a
                bedVector bedBlocks;
                // since we are counting coverage, we do want to split blocks when a
                // deletion (D) CIGAR op is encountered (hence the true for the last parm)
                GetBamBlocks(bam, refs.at(bam.RefID).RefName, bedBlocks, false, true);
                // use countSplitHits to avoid over-counting each split chunk
                // as distinct read coverage.
                _bedB->countSplitHits(bedBlocks, _sameStrand, _diffStrand, _countsOnly);
            }
        }
    }
    // report the coverage (summary or histogram) for BED B.
    if (_countsOnly == true)
        ReportCounts();
    else
        ReportCoverage();
    // close the BAM file
    reader.Close();
}
bool CoverageTool::CoverageToolPrivate::Run(void) { // if output filename given ofstream outFile; if ( m_settings->HasOutputFile ) { // open output file stream outFile.open(m_settings->OutputFilename.c_str()); if ( !outFile ) { cerr << "bamtools coverage ERROR: could not open " << m_settings->OutputFilename << " for output" << endl; return false; } // set m_out to file's streambuf m_out.rdbuf(outFile.rdbuf()); } //open our BAM reader BamReader reader; if ( !reader.Open(m_settings->InputBamFilename) ) { cerr << "bamtools coverage ERROR: could not open input BAM file: " << m_settings->InputBamFilename << endl; return false; } // retrieve references m_references = reader.GetReferenceData(); // set up our output 'visitor' CoverageVisitor* cv = new CoverageVisitor(m_references, &m_out); // set up pileup engine with 'visitor' PileupEngine pileup; pileup.AddVisitor(cv); // process input data BamAlignment al; while ( reader.GetNextAlignment(al) ) pileup.AddAlignment(al); // clean up reader.Close(); if ( m_settings->HasOutputFile ) outFile.close(); delete cv; cv = 0; // return success return true; }
int main (int argc, char *argv[]) { if( (argc== 1) || (argc== 2 && string(argv[1]) == "-h") || (argc== 2 && string(argv[1]) == "-help") || (argc== 2 && string(argv[1]) == "--help") ){ cout<<"Usage:setAsUnpaired [in bam] [outbam]"<<endl<<"this program takes flags all paired sequences as singles"<<endl; return 1; } string bamfiletopen = string(argv[1]); string bamFileOUT = string(argv[2]); BamReader reader; BamWriter writer; if ( !reader.Open(bamfiletopen) ) { cerr << "Could not open input BAM files." << endl; return 1; } const SamHeader header = reader.GetHeader(); const RefVector references = reader.GetReferenceData(); if ( !writer.Open(bamFileOUT,header,references) ) { cerr << "Could not open output BAM file "<<bamFileOUT << endl; return 1; } BamAlignment al; while ( reader.GetNextAlignment(al) ) { if(al.IsMapped()){ cerr << "Cannot yet handle mapped reads " << endl; return 1; } al.SetIsPaired (false); writer.SaveAlignment(al); } //while al reader.Close(); writer.Close(); return 0; }
int IndexTool::Run(int argc, char* argv[]) { // parse command line arguments Options::Parse(argc, argv, 1); // open our BAM reader BamReader reader; reader.Open(m_settings->InputBamFilename); // create index for BAM file bool useDefaultIndex = !m_settings->IsUsingBamtoolsIndex; reader.CreateIndex(useDefaultIndex); // clean & exit reader.Close(); return 0; }
int main (int argc, char *argv[]) { if( (argc== 1) || (argc== 2 && string(argv[1]) == "-h") || (argc== 2 && string(argv[1]) == "-help") || (argc== 2 && string(argv[1]) == "--help") ){ cout<<"Usage:editDist [in bam]"<<endl<<"this program returns the NM field of all aligned reads"<<endl; return 1; } string bamfiletopen = string(argv[1]); // cout<<bamfiletopen<<endl; BamReader reader; // cout<<"ok"<<endl; if ( !reader.Open(bamfiletopen) ) { cerr << "Could not open input BAM files." << endl; return 1; } BamAlignment al; // cout<<"ok"<<endl; while ( reader.GetNextAlignment(al) ) { // cout<<al.Name<<endl; if(!al.IsMapped()) continue; if(al.HasTag("NM") ){ int editDist; if(al.GetTag("NM",editDist) ){ cout<<editDist<<endl; }else{ cerr<<"Cannot retrieve NM field for "<<al.Name<<endl; return 1; } }else{ cerr<<"Warning: read "<<al.Name<<" is aligned but has no NM field"<<endl; } } //while al reader.Close(); return 0; }
/*
 * Windowed indel-realignment driver ("bonsai"): parses getopt_long options
 * (reference FASTA, BAM/stdin input, region(s), window size, base-quality
 * cutoff, ploidy, low-complexity switches), redirects cerr to
 * bonsaiReport.txt, then slides a window over each reference (or each
 * requested region), grouping alignments per window, detecting indel
 * evidence, realigning against candidate haplotypes, and writing
 * position-sorted records to stdout as BAM. Prints summary tallies and
 * restores cerr before exiting.
 *
 * NOTE(review): this block was ingested with its original line breaks
 * collapsed, so inline "//" comments appear to swallow the statements that
 * followed them; the text below is preserved byte-for-byte pending a
 * reformat against the original source.
 */
int main (int argc, char** argv) { // Print Commandline string ss(argv[0]); // convert Char to String string commandline = "##Print Command line " + ss; int c; FastaReference* reference = NULL; int minbaseQ = 10; //default int windowlen = 40; //by default string regionstr; string RegionFile; string bamfile; bool STdin = false; bool has_region = false; bool has_regionFile = false; bool has_bamfile = false; bool has_ref = false; int ploidy = 2; bool SetLowComplexityRegionSWGapExt = false; bool SetLowComplexityRegion = false; if (argc < 2) { printSummary(argv); exit(1); } while (true) { static struct option long_options[] = { {"help", no_argument, 0, 'h'}, {"ploidy", required_argument, 0, 'p'}, {"window-size", required_argument, 0, 'w'}, {"reference", required_argument, 0, 'f'}, {"min-base-quality", required_argument, 0,'q'}, {"Region", required_argument, 0, 'R'}, {"STdin", no_argument, 0, 's'}, {"bam", required_argument, 0, 'b'}, {"Repeat-Extgap", no_argument, 0, 'E'}, {"LowCompex", no_argument, 0, 'l'}, {0, 0, 0, 0} }; int option_index = 0; c = getopt_long (argc, argv, "hslEf:q:w:s:r:R:p:b:", long_options, &option_index); /* Detect the end of the options. 
*/ if (c == -1) break; switch (c) { case 'f': reference = new FastaReference(optarg); // will exit on open failure commandline = commandline + " -f " + optarg; has_ref = true; break; case 'b': has_bamfile = true; bamfile = optarg; commandline = commandline + " -b " + optarg; break; case 'r': regionstr = optarg; has_region = true; commandline = commandline + " -r " + optarg; break; case 'R': RegionFile = optarg; has_regionFile = true; commandline = commandline + " -R " + optarg; break; case 's': STdin = true; commandline = commandline + " -s "; break; case 'q': minbaseQ = atoi(optarg); commandline = commandline + " -q " + optarg; break; case 'w': windowlen = atoi(optarg); commandline = commandline + " -w " + optarg; break; case 'p': ploidy = atoi(optarg); commandline = commandline + " -p " + optarg; break; case 'E': SetLowComplexityRegionSWGapExt = true; commandline = commandline + " -E "; break; case 'l': SetLowComplexityRegion = true; commandline = commandline + " -l "; break; case 'h': printSummary(argv); commandline = commandline + " -h "; exit(0); break; case '?': printSummary(argv); exit(1); break; default: abort(); break; } } //// Open Error log files ofstream cerrlog("bonsaiReport.txt"); streambuf *cerrsave = std::cerr.rdbuf(); // Redirect stream buffers if (cerrlog.is_open()) cerr.rdbuf(cerrlog.rdbuf()); cerr << commandline << endl; //Check for Reference Fasta sequence if (!has_ref) { cerr << "no FASTA reference provided, cannot realign" << endl; exit(1); } ////Check for reader BamReader reader; if (STdin == true) { if (!reader.Open("stdin")) { cerr << "could not open stdin bam for reading" << endl; cerr << reader.GetErrorString() << endl; reader.Close(); printSummary(argv); } } else { if (has_bamfile == true) { if (!reader.Open(bamfile)) { cerr << "ERROR: could not open bam files from stdin ... 
Aborting" << endl; cerr << reader.GetErrorString() << endl; reader.Close(); printSummary(argv); } if ( !reader.LocateIndex() ) reader.CreateIndex(); } else { cerr << "--bam flag is set but no bamfile is provided... Aborting" << endl; reader.Close(); printSummary(argv); } } //// Check Region Tags if ( (has_regionFile == true) && (has_region == true) ) { cerr << "ERROR: You provide both region and has provide a Set Region List... Aborting" << endl; exit(1); } //// store the names of all the reference sequences in the BAM file vector<RefData> referencedata = reader.GetReferenceData(); //// Store Region LIST vector<BamRegion> regionlist; if (has_region == true) { BamRegion region; ParseRegionString(regionstr, reader, region); regionlist.push_back(region); } else if (has_regionFile == true) { ifstream RG(RegionFile.c_str(), ios_base::in); string line; while(getline(RG,line)) { BamRegion region; ParseRegionString(line, reader, region); regionlist.push_back(region); } RG.close(); } else if ( (has_regionFile == false) && (has_region == false) ) { for (int i= 0; i < (int)referencedata.size(); i++) { string regionstr = referencedata.at(i).RefName; BamRegion region; ParseRegionString(regionstr, reader, region); if (!reader.SetRegion(region)) // Bam region will get [0,101) = 0 to 100 => [closed, half-opened) { cerr << "ERROR: set region " << regionstr << " failed. Check that REGION describes a valid range... 
Aborting" << endl; reader.Close(); exit(1); } else regionlist.push_back(region); } } //// BamWriter writer; if (!writer.Open("stdout", reader.GetHeaderText(), reader.GetReferenceData())) { cerr << "could not open stdout for writing" << endl; exit(1); } //// Smallest start position and Largest end position for Req Seq vector<RefData>::iterator refdataIter = referencedata.begin(); vector<BamRegion>::iterator regionListIter = regionlist.begin(); // CLASS RealignFunctionsClass RealignFunction; map<int, string> RefIDRedName; vector<SalRealignInfo> AlGroups; multimap<int, BamAlignment> SortRealignedAlignmentsMultimap; int refid = 0; BamAlignment alignment; bool IsNextAlignment = reader.GetNextAlignment(alignment); //cerr << " " << alignment.Name << " Chr " << alignment.RefID << " Startpos: " << alignment.Position << " Endpos: " << alignment.GetEndPosition() << " Length: " << alignment.Length << endl; int windowrealigned = 0; int TotalWindowDetected = 0; int TotalReadsAligned = 0; int TotalWindow = 0; int TotalReads = 0; while (refdataIter != referencedata.end() ) { string refname = refdataIter->RefName; RefIDRedName[refid] = refname; int reflength = refdataIter->RefLength; int winstartpos, winendpos; int AllowableBasesInWindow = 1; bool nextChrName = false; cerr << "##HeaderINFO: RefID = " << refdataIter->RefName << "\t" << "RefLen = " << reflength << endl; while (nextChrName == false ) { vector<int> minmaxRefSeqPos; bool IsPassDetectorNoRealignment = false; minmaxRefSeqPos.push_back(-1); minmaxRefSeqPos.push_back(0); //cerr << " region: " << (*regionListIter).LeftRefID << " : " << (*regionListIter).LeftPosition << " .. 
" << (*regionListIter).RightPosition << endl; if ((refid == (int)referencedata.size() - 1) && ((*regionListIter).LeftRefID == refid) && ((has_region==true) || (has_regionFile==true)) ) { //// if ( (has_region == true) || (has_regionFile == true) ) { winstartpos = (*regionListIter).LeftPosition; winendpos = winstartpos + windowlen - 1; reflength = (*regionListIter).RightPosition; if (reflength < winendpos) reflength = winendpos; // Get Next Alignment First if ( (refid == alignment.RefID) && (winstartpos == (*regionListIter).LeftPosition) && (IsNextAlignment == false) ) IsNextAlignment = reader.GetNextAlignment(alignment); } else if (has_region == false) { winstartpos = 0; winendpos = winstartpos + windowlen - 1; // Get Next Alignment First if ( (refid == alignment.RefID) && (winstartpos == 0) && (IsNextAlignment == false) ) IsNextAlignment = reader.GetNextAlignment(alignment); } //cerr << " winstart: " << winstartpos << " ; winend: " << winendpos; //cerr << " " << alignment.Name << " Chr " << alignment.RefID << " Startpos: " << alignment.Position << " Endpos: " << alignment.GetEndPosition() << " Length: " << alignment.Length << endl; //// while ((winstartpos < reflength)) { //// Check window end position if (winendpos > reflength) winendpos = reflength; // Reinitialized unsigned int NewReadMappedcount = 0; //// Save and Erase alignments that are outside of window (Deque?) 
if (!AlGroups.empty()) { minmaxRefSeqPos.at(0) = -1; minmaxRefSeqPos.at(1) = 0; //cerr << "#Start: Keep alignments with start position exceed the right end of the window/Region " << endl; vector<SalRealignInfo>::iterator Iter = AlGroups.begin(); while (Iter != AlGroups.end()) { // Erase alignment s if ((*Iter).al.GetEndPosition() < winstartpos) { //cerr << " ToWrite: " << (*Iter).second.size() << " ; " << (*Iter).al.Name << " ; " << (*Iter).al.Position << " < " << winstartpos << " : " << (*Iter).al.GetEndPosition() << endl; SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > ((*Iter).al.Position, (*Iter).al)); AlGroups.erase(Iter); //cerr << " ToWrite: DONE " << endl; } else { string referenceSequence = reference->getSubSequence(RefIDRedName[(*Iter).al.RefID], (*Iter).al.Position, 2*(*Iter).al.Length); if ((*Iter).HasRealign == true ) { (*Iter).currentReadPosition = 0; (*Iter).currentGenomeSeqPosition = 0; (*Iter).currentAlPosition = (*Iter).al.Position; (*Iter).cigarindex = 0; } (*Iter).CigarSoftclippingLength = 0; SalRealignInfo talr = (*Iter); //cerr << " ToKEEP: " << (*Iter).al.Name << " ; " << (*Iter).al.Position << " < " << winstartpos << " : " << (*Iter).al.GetEndPosition() << endl; RealignFunction.ParseAlignmentsAndExtractVariantsByBaseQualv(AlGroups, talr, Iter, (*Iter).al, referenceSequence, minmaxRefSeqPos, winstartpos, winendpos, (float) minbaseQ, false); ++Iter; //Increment iterator } } } // Write Sorted Alignments that are outside of window //cerr << "SortRealignedAlignmentsMultimap: " << SortRealignedAlignmentsMultimap.size() << " minmaxRefSeqPos.at(0)= " << minmaxRefSeqPos.at(0) << endl; if (!SortRealignedAlignmentsMultimap.empty()) // && (winWrite < winstartpos ) ) { //cerr << "#Start: Write alignments and delete alignments with start position exceed the right end of the window/Region " << endl; multimap<int, BamAlignment>::iterator sraIter = SortRealignedAlignmentsMultimap.begin(); while (sraIter != 
SortRealignedAlignmentsMultimap.end()) { //cerr << " (*sraIter).first= " << (*sraIter).first << " minmaxRefSeqPos.at(0)= " << minmaxRefSeqPos.at(0) << " winstartpos - ((windowlen - 1)*0.9)= " << winstartpos - ((windowlen - 1)*0.9) << endl; if (((float) (*sraIter).first < floor((float) (winstartpos - ((windowlen - 1)*0.9)))) && ((minmaxRefSeqPos.at(0) > 0) && ((*sraIter).first < minmaxRefSeqPos.at(0)))) { //writer.SaveAlignment((*sraIter).second); // Why sometimes, it doesn't work ????? if (!writer.SaveAlignment((*sraIter).second)) cerr << writer.GetErrorString() << endl; SortRealignedAlignmentsMultimap.erase(sraIter++); } else { ++sraIter; } } //cerr << "#Done: Write alignments and delete alignments with start position exceed the right end of the window/Region " << endl; } //cerr << " winstart: " << winstartpos << " ; winend: " << winendpos; //cerr << " " << alignment.Name << " Chr " << alignment.RefID << " Startpos: " << alignment.Position << " Endpos: " << alignment.GetEndPosition() << " Length: " << alignment.Length << endl; //cerr << ": " << alignment.RefID << " :" << RefIDRedName[alignment.RefID] << " : " << RefIDRedName[alignment.RefID] << endl; //cerr << "Start: Gather aligmenets that lie (fully or partially) within the window frame and group INDELs if there are ... 
" << endl; // Gather Reads within a window frame while ((IsNextAlignment) && (refid == alignment.RefID)) // Neeed more conditions { if (SetLowComplexityRegion == true) { string sequenceInWindow = reference->getSubSequence(RefIDRedName[alignment.RefID], winstartpos, (winendpos-winstartpos+1) ); if (IsWindowInRepeatRegion(sequenceInWindow) == true) { if ((IsWithinWindow(alignment, winstartpos, winendpos, AllowableBasesInWindow)) == 0) { TotalReads++; if (alignment.IsMapped()) { string referenceSequence = reference->getSubSequence(RefIDRedName[alignment.RefID], alignment.Position, 2*alignment.Length); vector<SalRealignInfo>::iterator tIter; SalRealignInfo alr; alr.al = alignment; alr.currentReadPosition = 0; alr.currentGenomeSeqPosition = 0; alr.currentAlPosition = alignment.Position; alr.cigarindex = 0; alr.HasRealign = false; alr.CigarSoftclippingLength = 0; string str = "ZZZZZZZZZZZZZZZZZ"; if (alignment.Name.find(str) != string::npos) { stringstream cigar; for (vector<CigarOp>::const_iterator cigarIter = alignment.CigarData.begin(); cigarIter != alignment.CigarData.end(); ++cigarIter) cigar << cigarIter->Length << cigarIter->Type; string cigarstr = cigar.str(); cerr << " TRACKING: " << alignment.RefID << " " << alignment.Name << " pos: " << alignment.Position << " cigar: " << cigarstr << endl; } RealignFunction.ParseAlignmentsAndExtractVariantsByBaseQualv(AlGroups, alr, tIter, alignment, referenceSequence, minmaxRefSeqPos, winstartpos, winendpos, (float) minbaseQ, true); NewReadMappedcount++; } else { SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > (alignment.Position, alignment)); cerr << "UNmapped : " << alignment.Name << endl; } } else if ((IsWithinWindow(alignment, winstartpos, winendpos, AllowableBasesInWindow)) == 1) { SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > (alignment.Position, alignment)); } else break; } else { if ((IsWithinWindow(alignment, winstartpos, winendpos, AllowableBasesInWindow)) < 2) 
SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > (alignment.Position, alignment)); else break; } } else // (SetLowComplexityRegion == false) { if ((IsWithinWindow(alignment, winstartpos, winendpos, AllowableBasesInWindow)) == 0) { TotalReads++; if (alignment.IsMapped()) { string referenceSequence = reference->getSubSequence(RefIDRedName[alignment.RefID], alignment.Position, 2 * alignment.Length); vector<SalRealignInfo>::iterator tIter; SalRealignInfo alr; alr.al = alignment; alr.currentReadPosition = 0; alr.currentGenomeSeqPosition = 0; alr.currentAlPosition = alignment.Position; alr.cigarindex = 0; alr.HasRealign = false; alr.CigarSoftclippingLength = 0; string str = "ZZZZZZZZZZZZZZZZZ"; if (alignment.Name.find(str) != string::npos) { stringstream cigar; for (vector<CigarOp>::const_iterator cigarIter = alignment.CigarData.begin(); cigarIter != alignment.CigarData.end(); ++cigarIter) cigar << cigarIter->Length << cigarIter->Type; string cigarstr = cigar.str(); cerr << " TRACKING: " << alignment.RefID << " " << alignment.Name << " pos: " << alignment.Position << " cigar: " << cigarstr << endl; } RealignFunction.ParseAlignmentsAndExtractVariantsByBaseQualv(AlGroups, alr, tIter, alignment, referenceSequence, minmaxRefSeqPos, winstartpos, winendpos, (float) minbaseQ, true); //cerr << " winstart: " << winstartpos << " ; winend: " << winendpos; //cerr << " INDEL: " << alignment.RefID << " " << alignment.Name << " pos: " << alignment.Position << " Length: " << alignment.Length << " CIGARstr: " << cigarstr << endl; NewReadMappedcount++; } else { SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > (alignment.Position, alignment)); cerr << "UNmapped : " << alignment.Name << endl; } } else if ((IsWithinWindow(alignment, winstartpos, winendpos, AllowableBasesInWindow)) == 1) { SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > (alignment.Position, alignment)); } else break; } ////Get next alignment IsNextAlignment = 
reader.GetNextAlignment(alignment); } //cerr << "Done: Gather aligmenets that lie (fully or partially) within the window frame and group INDELs if there are ... " << endl; //// Detector Corner bool ToRealign = MeetIndelDetectorThresholdv(AlGroups); cerr << "MeetIndelDetectorThresholdv(AlGroups).size()= " << AlGroups.size() << endl; // ************** if (ToRealign) { //cerr << " ToRealign: " << refdataIter->RefName << "\t" << reflength << "\t" << winstartpos << "\t" << winendpos << "\t" << AlGroups.size() << endl; //cerr << " minmaxRefSeqPos.at(1)= " << minmaxRefSeqPos.at(1) << " minmaxRefSeqPos.at(0)= " << minmaxRefSeqPos.at(0) << endl; ////// Perform Realign routines int TotalAlR = 0; // Total number of alignments to be realigned int NumAlR = 0; // Now many alignments are aligned TotalWindowDetected++; cerr << "#Start: Meet Threshold, Realigning ... " << endl; if (minmaxRefSeqPos.at(1) < winendpos) minmaxRefSeqPos.at(1) = winendpos; if (minmaxRefSeqPos.at(0) > winstartpos) minmaxRefSeqPos.at(0) = winstartpos; bool IsToRealign = RealignFunction.PruningByNaiveSelectionProcedureAndConstructHaplotypes2(winstartpos, winendpos, refid, refname, minmaxRefSeqPos, reference); if (IsToRealign == true) { RealignFunction.SelectHaplotypeCandidates_SmithWatermanBSv(AlGroups, minmaxRefSeqPos, SetLowComplexityRegionSWGapExt); minmaxRefSeqPos.at(0) = -1; minmaxRefSeqPos.at(1) = 0; int nextwinstartpos = winendpos + 1; int nextwinendpos = winstartpos + windowlen - 1; if (nextwinendpos > reflength) nextwinendpos = reflength; //cerr << " Before Realign : " << SortRealignedAlignmentsMultimap.size() << endl; RealignFunction.AdjustCigarsWRTChosenMultipleHaplotypesAndPrepareAlignmentsTobeWrittenOut(AlGroups, SortRealignedAlignmentsMultimap, reference, RefIDRedName, minmaxRefSeqPos, nextwinstartpos, nextwinendpos, minbaseQ, TotalAlR, NumAlR, ploidy); IsPassDetectorNoRealignment = false; // Set flag to false to deactivate write functions //cerr << " After Realign : " << 
SortRealignedAlignmentsMultimap.size() << endl; TotalReadsAligned += NumAlR; if (NumAlR > 0) // Realignment done windowrealigned++; } else cerr << "#Done: Meet Threshold, Realigning ... " << endl; } if (NewReadMappedcount > 0) TotalWindow++; RealignFunction.Clear(); //// Move the window frame winstartpos = winendpos + 1; winendpos = winstartpos + windowlen - 1; } //// Save and Erase remaining alignments that are outside of window (Deque?) if ((!AlGroups.empty())) { cerr << "#Start: Write Remaining alignments and delete all alignments" << endl; for (vector<SalRealignInfo>::iterator Iter = AlGroups.begin(); Iter != AlGroups.end(); ++Iter) { //cerr << " Remain alignment start: " << (*Iter).al.Name << " " << Iter->al.Position << " < " << winstartpos << " " << winendpos << endl; SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > ((*Iter).al.Position, (*Iter).al)); } cerr << "#Done: Write Remaining alignments and delete all alignments" << endl; } AlGroups.clear(); // Write Sorted remaining Alignments that are outside of window if (!SortRealignedAlignmentsMultimap.empty()) { for (multimap<int, BamAlignment>::iterator sraIter = SortRealignedAlignmentsMultimap.begin(); sraIter != SortRealignedAlignmentsMultimap.end(); ++sraIter) { //writer.SaveAlignment((*sraIter).second); if (!writer.SaveAlignment((*sraIter).second)) cerr << writer.GetErrorString() << endl; } SortRealignedAlignmentsMultimap.clear(); } } ++regionListIter; if ((*regionListIter).LeftRefID > refid) nextChrName = true; } //// If End of the chromosome position //// increament iterator ++refdataIter; ++refid; } reader.Close(); writer.Close(); cerr << "##-Completed- " << endl; cerr << " Total Reads processed = " << TotalReads << endl; cerr << " Total Reads Aligned = " << TotalReadsAligned << endl; cerr << " Total Window processed = " << TotalWindow << endl; cerr << " Total Window Detected = " << TotalWindowDetected << endl; cerr << " Total Windows Aligned = " << windowrealigned << endl; // Restore 
cerr's stream buffer before terminating if (cerrlog.is_open()) cerr.rdbuf(cerrsave); commandline.clear(); return 0; }
//{{{bool sort_inter_chrom_bam(string in_file_name, bool sort_inter_chrom_bam(string in_file_name, string out_file_name) { // open input BAM file BamReader reader; if ( !reader.Open(in_file_name) ) { cerr << "sort ERROR: could not open " << in_file_name << " for reading... Aborting." << endl; return false; } SamHeader header = reader.GetHeader(); if ( !header.HasVersion() ) header.Version = Constants::SAM_CURRENT_VERSION; string header_text = header.ToString(); RefVector ref = reader.GetReferenceData(); // set up alignments buffer BamAlignment al; vector<BamAlignment> buffer; buffer.reserve( (size_t)(SORT_DEFAULT_MAX_BUFFER_COUNT*1.1) ); bool bufferFull = false; int buff_count = 0; // iterate through file while ( reader.GetNextAlignment(al)) { // check buffer's usage bufferFull = ( buffer.size() >= SORT_DEFAULT_MAX_BUFFER_COUNT ); // store alignments until buffer is "full" if ( !bufferFull ) buffer.push_back(al); // if buffer is "full" else { // so create a sorted temp file with current buffer contents // then push "al" into fresh buffer create_sorted_temp_file(buffer, out_file_name, buff_count, header_text, ref); ++buff_count; buffer.push_back(al); } } // handle any leftover buffer contents if ( !buffer.empty() ) { create_sorted_temp_file(buffer, out_file_name, buff_count, header_text, ref); ++buff_count; } reader.Close(); return merge_sorted_files(out_file_name, buff_count, header_text, ref); /* for (int i = 0; i < buff_count; ++i) { stringstream temp_name; temp_name << out_file_name << i; } */ }
/*
    Convert paired BAM entries to BEDPE and intersect with "B".  Mates are
    expected to be adjacent (query-name sorted/grouped); lone mates are
    skipped with a warning.
*/
void BedIntersectPE::IntersectBamPE(string bamFile) {

    // load the "B" bed file into a map so
    // that we can easily compare "A" to it for overlaps
    _bedB->loadBedFileIntoMap();

    // open the BAM file
    BamReader reader;
    BamWriter writer;
    reader.Open(bamFile);

    // get header & reference information
    string bamHeader = reader.GetHeaderText();
    RefVector refs   = reader.GetReferenceData();

    // open a BAM output to stdout if we are writing BAM
    if (_bamOutput == true) {
        // set compression mode
        BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
        if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed;
        writer.SetCompressionMode(compressionMode);
        // open our BAM writer
        writer.Open("stdout", bamHeader, refs);
    }

    // track the previous and current sequence
    // names so that we can identify blocks of
    // alignments for a given read ID.
    string prevName, currName;
    prevName = currName = "";

    vector<BamAlignment> alignments;    // vector of BAM alignments for a given ID in a BAM file.
    alignments.reserve(100);

    _bedA->bedType = 10;                // it's a full BEDPE given it's BAM

    // rip through the BAM file and convert each mapped entry to BEDPE
    BamAlignment bam1, bam2;
    while (reader.GetNextAlignment(bam1)) {
        // fix: the mate read's success is now checked, so a trailing lone
        // record can no longer be paired against stale data in bam2
        if (!reader.GetNextAlignment(bam2))
            break;

        if (bam1.Name != bam2.Name) {
            // resynchronize: slide forward until two adjacent records share a name
            while (bam1.Name != bam2.Name) {
                if (bam1.IsPaired()) {
                    cerr << "*****WARNING: Query " << bam1.Name
                         << " is marked as paired, but it's mate does not occur"
                         << " next to it in your BAM file. Skipping. " << endl;
                }
                bam1 = bam2;
                if (!reader.GetNextAlignment(bam2))
                    break;   // EOF while resynchronizing
            }
        }
        // fix: require BOTH mates to be flagged as paired — the original
        // tested bam1.IsPaired() twice and never checked bam2
        else if (bam1.IsPaired() && bam2.IsPaired()) {
            ProcessBamBlock(bam1, bam2, refs, writer);
        }
    }
    // close up
    reader.Close();
    if (_bamOutput == true) {
        writer.Close();
    }
}
int main (int argc, char *argv[]) { bool produceUnCompressedBAM=false; bool verbose=false; bool ancientDNA=false; bool keepOrig=false; string adapter_F=options_adapter_F_BAM; string adapter_S=options_adapter_S_BAM; string adapter_chimera=options_adapter_chimera_BAM; string key=""; bool allowMissing=false; int trimCutoff=1; bool allowAligned=false; bool printLog=false; string logFileName; BamReader reader; BamWriter writer; string bamFile; string bamFileOUT=""; string key1; string key2; bool useDist=false; double location=-1.0; double scale =-1.0; bool fastqFormat=false; string fastqfile1 = ""; string fastqfile2 = ""; string fastqoutfile = ""; bool singleEndModeFQ=true; const string usage=string(string(argv[0])+ " [options] BAMfile"+"\n"+ "\nThis program takes an unaligned BAM where mates are consecutive\nor fastq files and trims and merges reads\n"+ "\n\tYou can specify a unaligned bam file or one or two fastq :\n"+ "\t\t"+"-fq1" +"\t\t"+"First fastq"+"\n"+ "\t\t"+"-fq2" +"\t\t"+"Second fastq file (for paired-end)"+"\n"+ "\t\t"+"-fqo" +"\t\t"+"Output fastq prefix"+"\n\n"+ //"\t"+"-p , --PIPE"+"\n\t\t"+"Read BAM from and write it to PIPE"+"\n"+ "\t"+"-o , --outfile" +"\t\t"+"Output (BAM format)."+"\n"+ "\t"+"-u " +"\t\t"+"Produce uncompressed bam (good for pipe)"+"\n"+ // "\t"+" , --outprefix" +"\n\t\t"+"Prefix for output files (default '"+outprefix+"')."+"\n"+ //"\t"+" , --SAM" +"\n\t\t"+"Output SAM not BAM."+"\n"+ "\t"+"--aligned" +"\t\t"+"Allow reads to be aligned (default "+boolStringify(allowAligned)+")"+"\n"+ "\t"+"-v , --verbose" +"\t\t"+"Turn all messages on (default "+boolStringify(verbose)+")"+"\n"+ "\t"+"--log [log file]" +"\t"+"Print a tally of merged reads to this log file (default only to stderr)"+"\n"+ "\n\t"+"Paired End merging/Single Read trimming options"+"\n"+ "\t\t"+"You can specify either:"+"\n"+ "\t\t\t"+"--ancientdna"+"\t\t\t"+"ancient DNA (default "+boolStringify(ancientDNA)+")"+"\n"+ "\t\t"+" "+"\t\t\t\t"+"this allows for partial 
overlap"+"\n"+ "\n\t\t"+"or if you know your size length distribution:"+"\n"+ "\t\t\t"+"--loc"+"\t\t\t\t"+"Location for lognormal dist. (default none)"+"\n"+ "\t\t\t"+"--scale"+"\t\t\t\t"+"Scale for lognormal dist. (default none)"+"\n"+ // "\t\t\t\t\t\t\tGood for merging ancient DNA reads into a single sequence\n\n" "\n\t\t"+"--keepOrig"+"\t\t\t\t"+"Write original reads if they are trimmed or merged (default "+boolStringify(keepOrig)+")"+"\n"+ "\t\t\t\t\t\t\tSuch reads will be marked as PCR duplicates\n\n" "\t\t"+"-f , --adapterFirstRead" +"\t\t\t"+"Adapter that is observed after the forward read (def. Multiplex: "+options_adapter_F_BAM.substr(0,30)+")"+"\n"+ "\t\t"+"-s , --adapterSecondRead" +"\t\t"+"Adapter that is observed after the reverse read (def. Multiplex: "+options_adapter_S_BAM.substr(0,30)+")"+"\n"+ "\t\t"+"-c , --FirstReadChimeraFilter" +"\t\t"+"If the forward read looks like this sequence, the cluster is filtered out.\n\t\t\t\t\t\t\tProvide several sequences separated by comma (def. Multiplex: "+options_adapter_chimera_BAM.substr(0,30)+")"+"\n"+ "\t\t"+"-k , --key"+"\t\t\t\t"+"Key sequence with which each sequence starts. Comma separate for forward and reverse reads. (default '"+key+"')"+"\n"+ "\t\t"+"-i , --allowMissing"+"\t\t\t"+"Allow one base in one key to be missing or wrong. 
(default "+boolStringify(allowMissing)+")"+"\n"+ "\t\t"+"-t , --trimCutoff"+"\t\t\t"+"Lowest number of adapter bases to be observed for single Read trimming (default "+stringify(trimCutoff)+")"); if( (argc== 1) || (argc== 2 && string(argv[1]) == "-h") || (argc== 2 && string(argv[1]) == "-help") || (argc== 2 && string(argv[1]) == "--help") ){ cout<<"Usage:"<<endl; cout<<""<<endl; cout<<usage<<endl; return 1; } for(int i=1;i<(argc-1);i++){ //all but the last arg if(strcmp(argv[i],"-fq1") == 0 ){ fastqfile1=string(argv[i+1]); fastqFormat=true; i++; continue; } if(strcmp(argv[i],"-fq2") == 0 ){ fastqfile2=string(argv[i+1]); fastqFormat=true; singleEndModeFQ=false; i++; continue; } if(strcmp(argv[i],"-fqo") == 0 ){ fastqoutfile=string(argv[i+1]); fastqFormat=true; i++; continue; } if(strcmp(argv[i],"--log") == 0 ){ logFileName =string(argv[i+1]); printLog=true; i++; continue; } if(strcmp(argv[i],"-p") == 0 || strcmp(argv[i],"--PIPE") == 0 ){ cerr<<"This version no longer works with pipe, exiting"<<endl; return 1; } if(strcmp(argv[i],"-u") == 0 ){ produceUnCompressedBAM=true; continue; } if(strcmp(argv[i],"--aligned") == 0 ){ allowAligned=true; continue; } if(strcmp(argv[i],"-o") == 0 || strcmp(argv[i],"--outfile") == 0 ){ bamFileOUT =string(argv[i+1]); i++; continue; } if(strcmp(argv[i],"-v") == 0 || strcmp(argv[i],"--verbose") == 0 ){ verbose=true; continue; } if(strcmp(argv[i],"--ancientdna") == 0 ){ ancientDNA=true; continue; } if(strcmp(argv[i],"--keepOrig") == 0 ){ keepOrig=true; continue; } if(strcmp(argv[i],"--loc") == 0 ){ location =destringify<double>(argv[i+1]); i++; continue; } if(strcmp(argv[i],"--scale") == 0 ){ scale =destringify<double>(argv[i+1]); i++; continue; } if(strcmp(argv[i],"-f") == 0 || strcmp(argv[i],"--adapterFirstRead") == 0 ){ adapter_F =string(argv[i+1]); i++; continue; } if(strcmp(argv[i],"-s") == 0 || strcmp(argv[i],"--adapterSecondRead") == 0 ){ adapter_S =string(argv[i+1]); i++; continue; } if(strcmp(argv[i],"-c") == 0 || 
strcmp(argv[i],"--FirstReadChimeraFilter") == 0 ){ adapter_chimera =string(argv[i+1]); i++; continue; } if(strcmp(argv[i],"-k") == 0 || strcmp(argv[i],"--keys") == 0 ){ key =string(argv[i+1]); i++; continue; } if(strcmp(argv[i],"-i") == 0 || strcmp(argv[i],"--allowMissing") == 0 ){ allowMissing=true; continue; } if(strcmp(argv[i],"-t") == 0 || strcmp(argv[i],"--trimCutoff") == 0 ){ trimCutoff=atoi(argv[i+1]); i++; continue; } cerr<<"Unknown option "<<argv[i] <<" exiting"<<endl; return 1; } bamFile=argv[argc-1]; if( (location != -1.0 && scale == -1.0) || (location == -1.0 && scale != -1.0) ){ cerr<<"Cannot specify --location without specifying --scale"<<endl; return 1; } if( (location != -1.0 && scale != -1.0) ){ useDist=true; if(ancientDNA){ cerr<<"Cannot specify --location/--scale and --ancientDNA"<<endl; return 1; } } MergeTrimReads mtr (adapter_F,adapter_S,adapter_chimera, key1,key2, trimCutoff,allowMissing,ancientDNA,location,scale,useDist); fqwriters onereadgroup; if(fastqFormat){ if( bamFileOUT != "" || produceUnCompressedBAM || allowAligned){ cerr<<"ERROR : Cannot specify options like -o, -u or --allowAligned for fastq"<<endl; return 1; } if(fastqfile1 == ""){ cerr<<"ERROR : Must specify as least the first file for fastq"<<endl; return 1; } FastQParser * fqp1; FastQParser * fqp2; if(singleEndModeFQ){ fqp1 = new FastQParser (fastqfile1); string outdirs = fastqoutfile+".fq.gz"; string outdirsf = fastqoutfile+".fail.fq.gz"; onereadgroup.single.open(outdirs.c_str(), ios::out); onereadgroup.singlef.open(outdirsf.c_str(), ios::out); if(!onereadgroup.single.good()){ cerr<<"Cannot write to file "<<outdirs<<endl; return 1; } if(!onereadgroup.singlef.good()){ cerr<<"Cannot write to file "<<outdirsf<<endl; return 1; } }else{ fqp1 = new FastQParser (fastqfile1); fqp2 = new FastQParser (fastqfile2); string outdirs = fastqoutfile+".fq.gz"; string outdir1 = fastqoutfile+"_r1.fq.gz"; string outdir2 = fastqoutfile+"_r2.fq.gz"; string outdirsf = fastqoutfile+".fail.fq.gz"; 
string outdir1f = fastqoutfile+"_r1.fail.fq.gz"; string outdir2f = fastqoutfile+"_r2.fail.fq.gz"; onereadgroup.single.open(outdirs.c_str(), ios::out); onereadgroup.pairr1.open(outdir1.c_str(), ios::out); onereadgroup.pairr2.open(outdir2.c_str(), ios::out); onereadgroup.singlef.open(outdirsf.c_str(), ios::out); onereadgroup.pairr1f.open(outdir1f.c_str(), ios::out); onereadgroup.pairr2f.open(outdir2f.c_str(), ios::out); if(!onereadgroup.single.good()){ cerr<<"Cannot write to file "<<outdirs<<endl; return 1; } if(!onereadgroup.pairr1.good()){ cerr<<"Cannot write to file "<<outdir1<<endl; return 1; } if(!onereadgroup.pairr2.good()){ cerr<<"Cannot write to file "<<outdir2<<endl; return 1; } if(!onereadgroup.singlef.good()){ cerr<<"Cannot write to file "<<outdirsf<<endl; return 1; } if(!onereadgroup.pairr1f.good()){ cerr<<"Cannot write to file "<<outdir1f<<endl; return 1; } if(!onereadgroup.pairr2f.good()){ cerr<<"Cannot write to file "<<outdir2f<<endl; return 1; } } unsigned int totalSeqs=0; while(fqp1->hasData()){ FastQObj * fo1=fqp1->getData(); vector<string> def1=allTokens( *(fo1->getID()), ' ' ); string def1s=def1[0]; FastQObj * fo2; string def2s; string ext2s; if(!singleEndModeFQ){ if(!fqp2->hasData()){ cerr << "ERROR: Discrepency between fastq files at record " << *(fo1->getID()) <<endl; return 1; } fo2=fqp2->getData(); vector<string> def2=allTokens( *(fo2->getID()), ' ' ); def2s=def2[0]; if(strEndsWith(def1s,"/1")){ def1s=def1s.substr(0,def1s.size()-2); } if(strEndsWith(def2s,"/2")){ def2s=def2s.substr(0,def2s.size()-2); } if(strBeginsWith(def1s,"@")){ def1s=def1s.substr(1,def1s.size()-1); } if(strBeginsWith(def2s,"@")){ def2s=def2s.substr(1,def2s.size()-1); } if(def1s != def2s){ cerr << "ERROR: Discrepency between fastq files, different names " << *(fo1->getID()) <<" and "<< *(fo2->getID()) <<endl; return 1; } merged result= mtr.process_PE(*(fo1->getSeq()),*(fo1->getQual()), *(fo2->getSeq()),*(fo2->getQual())); mtr.incrementCountall(); if(result.code != ' '){ 
//keys or chimeras if(result.code == 'K'){ mtr.incrementCountfkey(); }else{ if(result.code == 'D'){ mtr.incrementCountchimera(); }else{ cerr << "leehom: Wrong return code =\""<<result.code<<"\""<<endl; exit(1); } } onereadgroup.pairr2f<<"@"<<def2s<<"/2" <<endl <<*(fo2->getSeq())<<endl<<"+"<<endl <<*(fo2->getQual())<<endl; onereadgroup.pairr1f<<"@"<<def1s<<"/1" <<endl <<*(fo1->getSeq())<<endl<<"+"<<endl <<*(fo1->getQual())<<endl; continue; }else{ if(result.sequence != ""){ //new sequence onereadgroup.single<<"@"<<def1s<<"" <<endl << result.sequence<<endl<<"+"<<endl <<result.quality<<endl; if( result.sequence.length() > max(fo1->getSeq()->length(),fo2->getSeq()->length()) ){ mtr.incrementCountmergedoverlap(); }else{ mtr.incrementCountmerged(); } }else{ //keep as is mtr.incrementCountnothing(); onereadgroup.pairr2<<"@"<<def2s<<"/2" <<endl <<*(fo2->getSeq())<<endl<<"+"<<endl <<*(fo2->getQual())<<endl; onereadgroup.pairr1<<"@"<<def1s<<"/1" <<endl <<*(fo1->getSeq())<<endl<<"+"<<endl <<*(fo1->getQual())<<endl; } } }else{ merged result=mtr.process_SR(*(fo1->getSeq()),*(fo1->getQual())); mtr.incrementCountall(); if(result.code != ' '){ //either chimera or missing key if(result.code == 'K'){ mtr.incrementCountfkey(); }else{ if(result.code == 'D'){ mtr.incrementCountchimera(); }else{ cerr << "leehom: Wrong return code =\""<<result.code<<"\""<<endl; exit(1); } } onereadgroup.singlef<<""<<*(fo1->getID())<<"" <<endl << *(fo1->getSeq())<<endl<<"+"<<endl <<*(fo1->getQual())<<endl; continue; } if(result.sequence != ""){ //new sequence mtr.incrementCounttrimmed(); onereadgroup.single<<""<<*(fo1->getID())<<"" <<endl << result.sequence<<endl<<"+"<<endl <<result.quality<<endl; }else{ mtr.incrementCountnothing(); onereadgroup.single<<""<<*(fo1->getID())<<"" <<endl << *(fo1->getSeq())<<endl<<"+"<<endl <<*(fo1->getQual())<<endl; } } totalSeqs++; } delete fqp1; if(!singleEndModeFQ){ delete fqp2; } if(singleEndModeFQ){ onereadgroup.single.close(); onereadgroup.singlef.close(); }else{ 
onereadgroup.single.close(); onereadgroup.pairr1.close(); onereadgroup.pairr2.close(); onereadgroup.singlef.close(); onereadgroup.pairr1f.close(); onereadgroup.pairr2f.close(); } //fastq }else{ //else BAM // initMerge(); // set_adapter_sequences(adapter_F, // adapter_S, // adapter_chimera); // set_options(trimCutoff,allowMissing,mergeoverlap); if(key != ""){ size_t found=key.find(","); if (found == string::npos){ //single end reads key1=key; key2=""; } else{ //paired-end key1=key.substr(0,found); key2=key.substr(found+1,key.length()-found+1); } } if( bamFileOUT == "" ){ cerr<<"The output must be a be specified, exiting"<<endl; return 1; } if ( !reader.Open(bamFile) ) { cerr << "Could not open input BAM file "<<bamFile << endl; return 1; } SamHeader header = reader.GetHeader(); string pID = "mergeTrimReadsBAM"; string pName = "mergeTrimReadsBAM"; string pCommandLine = ""; for(int i=0;i<(argc);i++){ pCommandLine += (string(argv[i])+" "); } putProgramInHeader(&header,pID,pName,pCommandLine,returnGitHubVersion(string(argv[0]),"..")); const RefVector references = reader.GetReferenceData(); //we will not call bgzip with full compression, good for piping into another program to //lessen the load on the CPU if(produceUnCompressedBAM) writer.SetCompressionMode(BamWriter::Uncompressed); if ( !writer.Open(bamFileOUT,header,references) ) { cerr << "Could not open output BAM file "<<bamFileOUT << endl; return 1; } SamHeader sh=reader.GetHeader(); //Up to the user to be sure that a sequence is followed by his mate // if(!sh.HasSortOrder() || // sh.SortOrder != "queryname"){ // cerr << "Bamfile must be sorted by queryname" << endl; // return 1; // } BamAlignment al; BamAlignment al2; bool al2Null=true; while ( reader.GetNextAlignment(al) ) { if(al.IsMapped() || al.HasTag("NM") || al.HasTag("MD") ){ if(!allowAligned){ cerr << "Reads should not be aligned" << endl; return 1; }else{ //should we remove tags ? 
} } if(al.IsPaired() && al2Null ){ al2=al; al2Null=false; continue; }else{ if(al.IsPaired() && !al2Null){ bool result = mtr.processPair(al,al2); if( result ){//was merged BamAlignment orig; BamAlignment orig2; if(keepOrig){ orig2 = al2; orig = al; } writer.SaveAlignment(al); if(keepOrig){ orig.SetIsDuplicate(true); orig2.SetIsDuplicate(true); writer.SaveAlignment(orig2); writer.SaveAlignment(orig); } //the second record is empty }else{ //keep the sequences as pairs writer.SaveAlignment(al2); writer.SaveAlignment(al); } // // SINGLE END // }else{ BamAlignment orig; if(keepOrig){ orig =al; } mtr.processSingle(al); if(keepOrig){ //write duplicate if(orig.QueryBases.length() != al.QueryBases.length()){ orig.SetIsDuplicate(true); writer.SaveAlignment(orig); } } writer.SaveAlignment(al); } //end single end al2Null=true; }//second pair } //while al reader.Close(); writer.Close(); } //else BAM cerr <<mtr.reportSingleLine()<<endl; if(printLog){ ofstream fileLog; fileLog.open(logFileName.c_str()); if (fileLog.is_open()){ fileLog <<mtr.reportMultipleLines() <<endl; }else{ cerr << "Unable to print to file "<<logFileName<<endl; } fileLog.close(); } return 0; }
// Downsamples a coordinate-sorted, indexed BAM to per-region target depths.
// Usage (samtools-view-compatible flag set, most flags accepted but ignored):
//   -t <targets file>  regions + target depths (read via read_region_depth)
//   -o <out BAM>  -s <seed>  -q <minMapQ>  -f/-F <flag on/off>  -u (uncompressed)
// For each region it collects passing reads, keeps mates of previously
// sampled reads, then randomly samples pairs (preferred) and single reads
// until the target depth is met, writing kept reads to the output BAM.
int main(const int argc, char* const argv[]) {
    int c, min_mapQ=0, seed=chrono::system_clock::now().time_since_epoch().count();
    unsigned int flag_on=0, flag_off=0;
    string fn_tgt, fn_in, fn_out="", out_format="b";

    // NOTE: most single-letter options are accepted for samtools-view
    // command-line compatibility but intentionally do nothing here.
    while ((c = getopt(argc, argv, "SbBcCt:h1Ho:q:f:F:ul:r:?T:R:L:s:@:m:x:U:")) >= 0) {
        switch (c) {
            case 's': seed = atoi(optarg); break;              // RNG seed
            case 'm': break;
            case 'c': break;
            case 'S': break;
            case 'b': break;
            case 'C': break;
            case 'h': break;
            case 'H': break;
            case 'o': fn_out = optarg; break;                  // output BAM
            case 'U': break;
            case 'f': flag_on |= strtol(optarg, 0, 0); break;  // required flag bits
            case 'F': flag_off |= strtol(optarg, 0, 0); break; // forbidden flag bits
            case 'q': min_mapQ = atoi(optarg); break;          // min mapping quality
            case 'u': out_format = "u"; break;
            case '1': break;
            case 'l': break;
            case 'r': break;
            case 't': fn_tgt = optarg; break;                  // targets file
            case 'R': break;
            case '?': return usage();
            case 'T': break;
            case 'B': break;
            case '@': break;
            case 'x': break;
            default: return usage();
        }
    }

    // targets file and input BAM are mandatory
    if (fn_tgt.compare("") == 0) return usage();
    if (argc == optind) return usage();
    fn_in = argv[optind];

    BamReader reader;
    if (!reader.Open(fn_in)) {
        cerr << "ERROR: cannot open [" << fn_in << "] for reading\n";
        return 1;
    }
    // random access per region requires an index
    if (!reader.LocateIndex()) {
        cerr << "ERROR: cannot find BAM index for [" << fn_in << "]\n";
        return 1;
    }
    const SamHeader header = reader.GetHeader();
    if (header.SortOrder.compare("coordinate") != 0) {
        cerr << "ERROR: [" << fn_in << "] not sorted by coordinate\n";
        return 1;
    }
    const RefVector refseq = reader.GetReferenceData();

    // load regions and their source/target depths from the targets file
    vector<BamRegion> regions;
    vector<unsigned int> src_depths, tgt_depths;
    if (read_region_depth(fn_tgt.c_str(), reader, regions, src_depths, tgt_depths) != 0)
        return 1;

    BamWriter writer;
    if (!writer.Open(fn_out, header, refseq)) {
        cerr << "ERROR: cannot open [" << fn_out << "] for writing\n";
        return 1;
    }

    BamAlignment aln;
    vector<BamAlignment> reads;            // passing reads of the current region
    vector<string> paired, unpaired;       // candidate names with 2 vs 1 reads here
    unordered_map<int, int> kept;          // index into `reads` -> kept marker
    unordered_map<string, unsigned int> seen, sampled;  // name -> AlignmentFlag, across regions
    unordered_map<string, vector<int> > pool;           // name -> indices into `reads`

    for (size_t i=0; i<regions.size(); ++i) {
        // per-region state; the cross-region maps (seen/sampled) persist
        reads.clear(); paired.clear(); unpaired.clear(); kept.clear(); pool.clear();

        char region_string[256];
        sprintf(region_string, "%s:%d-%d", refseq[regions[i].LeftRefID].RefName.c_str(), regions[i].LeftPosition, regions[i].RightPosition);

        if (!reader.SetRegion(regions[i])) {
            cerr << "WARNING: failed to locate [" << region_string << "]\n";
            //cerr << "WARNING: failed to locate [" << refseq[regions[i].LeftRefID].RefName << ':' << regions[i].LeftPosition << '-' << regions[i].RightPosition << "]\n";
            continue;
        }

        // collect reads passing the flag and mapping-quality filters
        while (reader.GetNextAlignment(aln)) {
            if ((aln.AlignmentFlag & flag_on) == flag_on && !(aln.AlignmentFlag & flag_off) && aln.MapQuality >= min_mapQ)
                reads.push_back(aln);
        }
        if (reads.size() == 0) continue;

        unsigned int depth = 0;
        for (size_t k=0; k<reads.size(); ++k) {
            aln = reads[k];
            string rn = aln.Name;
            if (seen.find(rn) != seen.end()) { // if seen in previous regions
                if (sampled.find(rn) != sampled.end()) { // if self or mate sampled before, sample it
                    if (sampled[rn] != aln.AlignmentFlag) kept[k] = 1; // if mate sampled before, keep it
                    depth += get_overlap(aln, regions[i]);
                }
                if (seen[rn] != aln.AlignmentFlag) seen[rn] = aln.AlignmentFlag;
            }
            else { // if not seen in previous regions
                pool[rn].push_back(k);
            }
            if (depth > tgt_depths[i]) break;
        }

        if (depth < tgt_depths[i]) {
            // split fresh candidates by whether both mates landed in this region
            for (auto it=pool.begin(); it!=pool.end(); ++it) {
                if (it->second.size()>1) paired.push_back(it->first);
                else unpaired.push_back(it->first);
            }
            // deterministic shuffle (same seed per region) for reproducibility
            shuffle(paired.begin(), paired.end(), default_random_engine(seed));
            shuffle(unpaired.begin(), unpaired.end(), default_random_engine(seed));

            int n1=paired.size(), n2=unpaired.size(), k1, k2, k3;
            // fill up to target depth, preferring complete pairs over singles
            while (depth < tgt_depths[i] && n1+n2 > 0) {
                if (n1>0) {
                    k1 = pool[paired[--n1]][0];
                    k2 = pool[paired[n1]][1];
                    depth += get_overlap(reads[k1], regions[i]);
                    depth += get_overlap(reads[k2], regions[i]);
                    kept[k1] = 1;
                    kept[k2] = 1;
                    continue;
                }
                if (n2>0) {
                    k3 = pool[unpaired[--n2]][0];
                    depth += get_overlap(reads[k3], regions[i]);
                    kept[k3] = 1;
                    continue;
                }
            }
        }

        // remember every candidate name (last flag seen) for later regions
        for (auto it=pool.begin(); it!=pool.end(); ++it) {
            string rn = it->first;
            seen[rn] = reads[pool[rn].back()].AlignmentFlag;
        }
        // emit kept reads and record them so mates get kept in later regions
        for (auto it=kept.begin(); it!=kept.end(); ++it) {
            int k = it->first;
            string rn = reads[k].Name;
            sampled[rn] = reads[k].AlignmentFlag;
            writer.SaveAlignment(reads[k]);
        }

        cerr << "INFO: target=[" << tgt_depths[i] << "], actual=[" << depth << "], N(reads)=[" << reads.size() << "], N(kept)=[" << kept.size() << "] at [" << region_string << "]\n";
    }

    reader.Close();
    // NOTE(review): writer is never Close()d explicitly — presumably the
    // BamWriter destructor flushes on return; confirm, otherwise the last
    // BGZF block of the output may be lost.
    return 0;
}
// generates multiple sorted temp BAM files from single unsorted BAM file
//
// Buffers up to m_settings->MaxBufferCount alignments at a time; each time
// the buffer fills on a mapped read, the buffer is flushed to a sorted temp
// file via CreateSortedTempFile() (which is assumed to empty the buffer —
// unmapped reads are appended instead so they are never split off alone).
// Also stores the shared header text / reference data in m_headerText /
// m_references for the temp and final output files.
// Returns false only if the input BAM cannot be opened.
bool SortTool::SortToolPrivate::GenerateSortedRuns(void) {

    // open input BAM file
    BamReader reader;
    if ( !reader.Open(m_settings->InputBamFilename) ) {
        cerr << "bamtools sort ERROR: could not open " << m_settings->InputBamFilename
             << " for reading... Aborting." << endl;
        return false;
    }

    // get basic data that will be shared by all temp/output files
    SamHeader header = reader.GetHeader();
    // stamp the header with the sort order we are about to produce
    header.SortOrder = ( m_settings->IsSortingByName
                       ? Constants::SAM_HD_SORTORDER_QUERYNAME
                       : Constants::SAM_HD_SORTORDER_COORDINATE );
    m_headerText = header.ToString();
    m_references = reader.GetReferenceData();

    // set up alignments buffer (10% headroom so the overflow push never reallocates)
    BamAlignment al;
    vector<BamAlignment> buffer;
    buffer.reserve( (size_t)(m_settings->MaxBufferCount*1.1) );
    bool bufferFull = false;

    // if sorting by name, we need to generate full char data
    // so can't use GetNextAlignmentCore()
    if ( m_settings->IsSortingByName ) {

        // iterate through file
        while ( reader.GetNextAlignment(al)) {

            // check buffer's usage
            bufferFull = ( buffer.size() >= m_settings->MaxBufferCount );

            // store alignments until buffer is "full"
            if ( !bufferFull )
                buffer.push_back(al);

            // if buffer is "full"
            else {

                // push any unmapped reads into buffer,
                // don't want to split these into a separate temp file
                if ( !al.IsMapped() )
                    buffer.push_back(al);

                // "al" is mapped, so create a sorted temp file with current buffer contents
                // then push "al" into fresh buffer
                else {
                    CreateSortedTempFile(buffer);
                    buffer.push_back(al);
                }
            }
        }
    }

    // sorting by position, can take advantage of GNACore() speedup
    else {

        // iterate through file
        while ( reader.GetNextAlignmentCore(al) ) {

            // check buffer's usage
            bufferFull = ( buffer.size() >= m_settings->MaxBufferCount );

            // store alignments until buffer is "full"
            if ( !bufferFull )
                buffer.push_back(al);

            // if buffer is "full"
            else {

                // push any unmapped reads into buffer,
                // don't want to split these into a separate temp file
                if ( !al.IsMapped() )
                    buffer.push_back(al);

                // "al" is mapped, so create a sorted temp file with current buffer contents
                // then push "al" into fresh buffer
                else {
                    CreateSortedTempFile(buffer);
                    buffer.push_back(al);
                }
            }
        }
    }

    // handle any leftover buffer contents
    if ( !buffer.empty() )
        CreateSortedTempFile(buffer);

    // close reader & return success
    reader.Close();
    return true;
}
void TagBam::Tag() { // open the annotations files for processing; OpenAnnoFiles(); // open the BAM file BamReader reader; BamWriter writer; if (!reader.Open(_bamFile)) { cerr << "Failed to open BAM file " << _bamFile << endl; exit(1); } // get header & reference information string bamHeader = reader.GetHeaderText(); RefVector refs = reader.GetReferenceData(); // set compression mode BamWriter::CompressionMode compressionMode = BamWriter::Compressed; // if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed; writer.SetCompressionMode(compressionMode); // open our BAM writer writer.Open("stdout", bamHeader, refs); // rip through the BAM file and test for overlaps with each annotation file. BamAlignment al; vector<BED> hits; while (reader.GetNextAlignment(al)) { if (al.IsMapped() == true) { BED a; a.chrom = refs.at(al.RefID).RefName; a.start = al.Position; a.end = al.GetEndPosition(false, false); a.strand = "+"; if (al.IsReverseStrand()) a.strand = "-"; ostringstream annotations; // annotate the BAM file based on overlaps with the annotation files. for (size_t i = 0; i < _annoFiles.size(); ++i) { // grab the current annotation file. 
BedFile *anno = _annoFiles[i]; if (!_useNames && !_useScores && !_useIntervals) { // add the label for this annotation file to tag if there is overlap if (anno->anyHits(a.chrom, a.start, a.end, a.strand, _sameStrand, _diffStrand, _overlapFraction, false)) { annotations << _annoLabels[i] << ";"; } } // use the score field else if (!_useNames && _useScores && !_useIntervals) { anno->allHits(a.chrom, a.start, a.end, a.strand, hits, _sameStrand, _diffStrand, 0.0, false); for (size_t i = 0; i < hits.size(); ++i) { annotations << hits[i].score; if (i < hits.size() - 1) annotations << ","; } if (hits.size() > 0) annotations << ";"; hits.clear(); } // use the name field from the annotation files to populate tag else if (_useNames && !_useScores && !_useIntervals) { anno->allHits(a.chrom, a.start, a.end, a.strand, hits, _sameStrand, _diffStrand, 0.0, false); for (size_t j = 0; j < hits.size(); ++j) { annotations << hits[j].name; if (j < hits.size() - 1) annotations << ","; } if (hits.size() > 0) annotations << ";"; hits.clear(); } // use the full interval information annotation files to populate tag else if (!_useNames && !_useScores && _useIntervals) { anno->allHits(a.chrom, a.start, a.end, a.strand, hits, _sameStrand, _diffStrand, 0.0, false); for (size_t j = 0; j < hits.size(); ++j) { annotations << _annoLabels[i] << ":" << hits[j].chrom << ":" << hits[j].start << "-" << hits[j].end << "," << hits[j].name << "," << hits[j].score << "," << hits[j].strand; if (j < hits.size() - 1) annotations << ","; } if (hits.size() > 0) annotations << ";"; hits.clear(); } } // were there any overlaps with which to make a tag? if (annotations.str().size() > 0) { al.AddTag(_tag, "Z", annotations.str().substr(0, annotations.str().size() - 1)); // get rid of the last ";" } } writer.SaveAlignment(al); } reader.Close(); writer.Close(); // close the annotations files; CloseAnnoFiles(); }
int main(int argc, char* argv[]) { // validate argument count if( argc != 2 ) { cerr << "USAGE: " << argv[0] << " <input BAM file> " << endl; return EXIT_FAILURE; } string filename = argv[1]; //cerr << "Printing alignments from file: " << filename << endl; BamReader reader; if (!reader.Open(filename)) { cerr << "could not open filename " << filename << endl; return EXIT_FAILURE; } cerr << filename << ": Done opening" << endl; // Header can't be used to accurately determine sort order because samtools never // changes it; instead, check after loading each read as is done with "samtools index" // We don't need to load an index (right?) // if (!reader.LocateIndex()) { // const string index_filename = filename + ".bai"; // if (!reader.OpenIndex(index_filename)) { // cerr << "could not open index" << endl; // } // } const SamHeader header = reader.GetHeader(); cerr << filename << ": Done getting header" << endl; const RefVector refs = reader.GetReferenceData(); cerr << filename << ": Done getting reference data" << endl; BamWriter writer; if (! output_bam_filename.empty()) { if (! 
writer.Open(output_bam_filename, header, refs)) { cerr << "Could not open BAM output file " << output_bam_filename << endl; return EXIT_FAILURE; } cerr << filename << ": Done opening BAM output file " << output_bam_filename << endl; } alignmentMap read1Map; // a single map, for all reads awaiting their mate typedef map<string,int32_t> stringMap; typedef stringMap::iterator stringMapI; stringMap ref_mates; // alignmentMap read1Map, read2Map; BamAlignment full_al; int32_t count = 0; uint32_t max_reads_in_map = 0; int32_t n_reads_skipped_unmapped = 0; int32_t n_reads_skipped_mate_unmapped = 0; int32_t n_reads_skipped_wont_see_mate = 0; int32_t n_reads_skipped_mate_tail_est = 0; int32_t n_reads_skipped_ref_mate = 0; int32_t n_reads = 0; int32_t n_singleton_reads = 0; int32_t last_RefID = -1; int32_t last_Position = -1; cerr << filename << ": Looking for up to " << pairs_to_process << " link pairs," << " total tail = " << link_pair_total_tail << " critical tail = " << link_pair_crit_tail << ", must be on diff chromosome = " << link_pair_diff_chrom << endl; while (reader.GetNextAlignment(full_al) && (! pairs_to_process || count < pairs_to_process)) { BamAlignment al = full_al; //printAlignmentInfo(al, refs); //++count; ++n_reads; if (last_RefID < 0) last_RefID = al.RefID; if (last_Position < 0) last_Position = al.Position; if (al.RefID > last_RefID) { // We've moved to the next reference sequence // Clean up reads with mates expected here that haven't been seen if (debug_ref_mate) { cerr << "MISSED " << ref_mates.size() << " ref_mates on this reference " << last_RefID << " " << refs[last_RefID].RefName << endl; } for (stringMapI rmI = ref_mates.begin(); rmI != ref_mates.end(); ++rmI) { ++n_reads_skipped_ref_mate; read1Map.erase(read1Map.find(rmI->first)); ref_mates.erase(ref_mates.find(rmI->first)); } last_RefID = al.RefID; last_Position = al.Position; } else if (! 
isCoordinateSorted(al.RefID, al.Position, last_RefID, last_Position)) { cerr << filename << " is not sorted, " << al.Name << " out of position" << endl; return EXIT_FAILURE; } if (! al.IsMapped()) { ++n_reads_skipped_unmapped; continue; } if (! al.IsMateMapped()) { ++n_reads_skipped_mate_unmapped; continue; } alignmentMapI mI = read1Map.find(al.Name); if (mI == read1Map.end()) { // the read name has not been seen before if (al.MateRefID < al.RefID || (al.MateRefID == al.RefID && al.MatePosition < al.Position)) { // we should have seen its mate earlier, so skip it ++n_reads_skipped_wont_see_mate; continue; } // If the mate likely to also be a link pair candidate, add the read int32_t mate_tail_est = readTailS(al.IsMateMapped(), al.IsMateReverseStrand(), al.MatePosition, refs[al.MateRefID].RefLength, max_read_length); if (mate_tail_est <= mate_tail_est_crit) { // the mate tail estimate suggests it might be a link pair candidate read1Map[al.Name] = al; // add the read to the map } else { // the mate tail estimate appears too long for the mate to be a candidate ++n_reads_skipped_mate_tail_est; continue; } if (read1Map.size() > max_reads_in_map) max_reads_in_map = read1Map.size(); if (al.MateRefID == al.RefID && al.MatePosition >= al.Position) { // the mate is expected later on this contig ref_mates[al.Name] = al.MateRefID; } } else { // get the mate's alignment, and process the pair const BamAlignment& al_mate = mI->second; if (processReadPair(al, al_mate, refs, link_pair_total_tail, link_pair_crit_tail, link_pair_diff_chrom)) { ++count; // write to the new BAM file, if the string is not empty if (! 
output_bam_filename.empty()) { writer.SaveAlignment(al_mate); // the first one seen writer.SaveAlignment(al); // the second one seen } } read1Map.erase(mI); if (al.MateRefID == al.RefID) { stringMapI rmI = ref_mates.find(al.Name); if (rmI == ref_mates.end()) { cerr << "expected a ref_mate, couldn't find its name: " << al.Name << endl; return EXIT_FAILURE; } ref_mates.erase(rmI); } } } cerr << "===============================" << endl; cerr << read1Map.size() << " alignments left in read1Map" << endl; cerr << max_reads_in_map << " maximum number of reads in read1Map" << endl; cerr << count << " pairs processed" << endl; cerr << "===============================" << endl; cerr << n_reads << " total reads" << endl; cerr << n_singleton_reads << " singleton reads" << endl; cerr << n_reads_skipped_unmapped << " reads skipped because unmapped" << endl; cerr << n_reads_skipped_mate_unmapped << " reads skipped because mate unmapped" << endl; cerr << n_reads_skipped_wont_see_mate << " reads skipped because mate won't be seen" << endl; cerr << n_reads_skipped_mate_tail_est << " reads skipped because mate tail appears too long" << endl; cerr << n_reads_skipped_ref_mate << " reads skipped because mate not on reference" << endl; reader.Close(); if (! output_bam_filename.empty()) { writer.Close(); } return EXIT_SUCCESS; }
void BamToFastq::PairedFastqUseTags() { // open the 1st fastq file for writing ofstream fq1(_fastq1.c_str(), ios::out); if ( !fq1 ) { cerr << "Error: The first fastq file (" << _fastq1 << ") could not be opened. Exiting!" << endl; exit (1); } // open the 2nd fastq file for writing ofstream fq2(_fastq2.c_str(), ios::out); if ( !fq2 ) { cerr << "Error: The second fastq file (" << _fastq2 << ") could not be opened. Exiting!" << endl; exit (1); } // open the BAM file BamReader reader; reader.Open(_bamFile); // rip through the BAM file and convert each mapped entry to BEDPE BamAlignment bam1, bam2; while (reader.GetNextAlignment(bam1)) { reader.GetNextAlignment(bam2); if (bam1.Name != bam2.Name) { while (bam1.Name != bam2.Name) { if (bam1.IsPaired()) { cerr << "*****WARNING: Query " << bam1.Name << " is marked as paired, but it's mate does not occur" << " next to it in your BAM file. Skipping. " << endl; } bam1 = bam2; reader.GetNextAlignment(bam2); } } else if (bam1.IsPaired() && bam2.IsPaired()) { // assume the R2 and Q2 tags are on the + strand. string mateSequence, mateQualities; bam1.GetTag("R2", mateSequence); bam1.GetTag("Q2", mateQualities); string seq1 = bam1.QueryBases; string qual1 = bam1.Qualities; if (bam1.IsReverseStrand() == true) { reverseComplement(seq1); reverseSequence(qual1); } // since the info for both ends are contained in each BAM record, // we only need to process one of the two records (bam1) in order // to produce FASTQ entries for both ends. 
// NOTE: Assumes that R2 and Q2 have already been rev // and revcomped if necessary if (bam1.IsFirstMate() == true) { // end1 fq1 << "@" << bam1.Name << "/1" << endl; fq1 << seq1 << endl; fq1 << "+" << endl; fq1 << qual1 << endl; // end2 fq2 << "@" << bam1.Name << "/2" <<endl; fq2 << mateSequence << endl; fq2 << "+" << endl; fq2 << mateQualities << endl; } else { // end 2 fq2 << "@" << bam1.Name << "/2" <<endl; fq2 << seq1 << endl; fq2 << "+" << endl; fq2 << qual1 << endl; // end 1 fq1 << "@" << bam1.Name << "/1" <<endl; fq1 << mateSequence << endl; fq1 << "+" << endl; fq1 << mateQualities << endl; } } } reader.Close(); }
void BamToFastq::PairedFastq() { // open the 1st fastq file for writing ofstream fq1(_fastq1.c_str(), ios::out); if ( !fq1 ) { cerr << "Error: The first fastq file (" << _fastq1 << ") could not be opened. Exiting!" << endl; exit (1); } // open the 2nd fastq file for writing ofstream fq2(_fastq2.c_str(), ios::out); if ( !fq2 ) { cerr << "Error: The second fastq file (" << _fastq2 << ") could not be opened. Exiting!" << endl; exit (1); } // open the BAM file BamReader reader; reader.Open(_bamFile); // rip through the BAM file and convert each mapped entry to BEDPE BamAlignment bam1, bam2; bool shouldConsumeReads = true; while (true) { if (shouldConsumeReads) { if (!reader.GetNextAlignment(bam1) || !reader.GetNextAlignment(bam2)) break; } else { shouldConsumeReads = true; } if (bam1.Name != bam2.Name) { while (bam1.Name != bam2.Name) { if (bam1.IsPaired()) { cerr << "*****WARNING: Query " << bam1.Name << " is marked as paired, but its mate does not occur" << " next to it in your BAM file. Skipping. " << endl; } bam1 = bam2; if (!reader.GetNextAlignment(bam2)) break; shouldConsumeReads = false; } } else if (bam1.IsPaired() && bam2.IsPaired()) { // extract the sequence and qualities for the BAM "query" string seq1 = bam1.QueryBases; string qual1 = bam1.Qualities; string seq2 = bam2.QueryBases; string qual2 = bam2.Qualities; if (bam1.IsReverseStrand() == true) { reverseComplement(seq1); reverseSequence(qual1); } if (bam2.IsReverseStrand() == true) { reverseComplement(seq2); reverseSequence(qual2); } fq1 << "@" << bam1.Name << "/1" << endl; fq1 << seq1 << endl; fq1 << "+" << endl; fq1 << qual1 << endl; fq2 << "@" << bam2.Name << "/2" << endl; fq2 << seq2 << endl; fq2 << "+" << endl; fq2 << qual2 << endl; } } reader.Close(); }
int main (int argc, char *argv[]) { bool mapped =false; bool unmapped=false; const string usage=string(string(argv[0])+" [options] input.bam out.bam"+"\n\n"+ "This program takes a BAM file as input and produces\n"+ "another where the putative deaminated bases have\n"+ "have been cut\n"+ "\n"+ "Options:\n"); // "\t"+"-u , --unmapped" +"\n\t\t"+"For an unmapped bam file"+"\n"+ // "\t"+"-m , --mapped" +"\n\t\t"+"For an mapped bam file"+"\n"); if( (argc== 1) || (argc== 2 && string(argv[1]) == "-h") || (argc== 2 && string(argv[1]) == "-help") || (argc== 2 && string(argv[1]) == "--help") ){ cout<<"Usage:"<<endl; cout<<usage<<endl; cout<<""<<endl; return 1; } // for(int i=1;i<(argc-1);i++){ //all but the last arg // if(strcmp(argv[i],"-m") == 0 || strcmp(argv[i],"--mapped") == 0 ){ // mapped=true; // continue; // } // if(strcmp(argv[i],"-u") == 0 || strcmp(argv[i],"--unmapped") == 0 ){ // unmapped=true; // continue; // } // cerr<<"Unknown option "<<argv[i] <<" exiting"<<endl; // return 1; // } if(argc != 3){ cerr<<"Error: Must specify the input and output BAM files"; return 1; } string inbamFile =argv[argc-2]; string outbamFile=argv[argc-1]; // if(!mapped && !unmapped){ // cerr << "Please specify whether you reads are mapped or unmapped" << endl; // return 1; // } // if(mapped && unmapped){ // cerr << "Please specify either mapped or unmapped but not both" << endl; // return 1; // } BamReader reader; if ( !reader.Open(inbamFile) ) { cerr << "Could not open input BAM files." 
<< endl; return 1; } vector<RefData> testRefData=reader.GetReferenceData(); const SamHeader header = reader.GetHeader(); const RefVector references = reader.GetReferenceData(); BamWriter writer; if ( !writer.Open(outbamFile, header, references) ) { cerr << "Could not open output BAM file" << endl; return 1; } BamAlignment al; // BamAlignment al2; // bool al2Null=true; while ( reader.GetNextAlignment(al) ) { if(al.IsPaired() ){ if(al.IsFirstMate() ){ //5' end, need to check first base only if(al.IsReverseStrand()){ // if(!al.IsMapped()){ cerr << "Cannot have reverse complemented unmapped reads: " <<al.Name<< endl; //return 1; } int indexToCheck; //last indexToCheck=al.QueryBases.length()-1; if(toupper(al.QueryBases[indexToCheck]) == 'A'){ //al.Qualities[indexToCheck]=char(offset+baseQualForDeam); al.QueryBases = al.QueryBases.substr(0,indexToCheck); al.Qualities = al.Qualities.substr(0, indexToCheck); } }else{ int indexToCheck; //first base indexToCheck=0; if(toupper(al.QueryBases[indexToCheck]) == 'T'){ //al.Qualities[indexToCheck]=char(offset+baseQualForDeam); al.QueryBases = al.QueryBases.substr(indexToCheck+1); al.Qualities = al.Qualities.substr(indexToCheck+1); } } }else{ //3' end, need to check last two bases only if( al.IsSecondMate() ){ if(al.IsReverseStrand()){ // if(!al.IsMapped()){ cerr << "Cannot have reverse complemented unmapped reads: " <<al.Name<< endl; //return 1; } int indexToCheck; //second to last indexToCheck=al.QueryBases.length()-2; if(toupper(al.QueryBases[indexToCheck]) == 'T'){ //al.Qualities[indexToCheck]=char(offset+baseQualForDeam); al.QueryBases = al.QueryBases.substr(0,indexToCheck); al.Qualities = al.Qualities.substr(0, indexToCheck); }else{ //last indexToCheck=al.QueryBases.length()-1; if(toupper(al.QueryBases[indexToCheck]) == 'T'){ //al.Qualities[indexToCheck]=char(offset+baseQualForDeam); al.QueryBases = al.QueryBases.substr(0,indexToCheck); al.Qualities = al.Qualities.substr(0, indexToCheck); } } }else{ int indexToCheck; //second 
base indexToCheck=1; if(toupper(al.QueryBases[indexToCheck]) == 'A'){ //al.Qualities[indexToCheck]=char(offset+baseQualForDeam); al.QueryBases = al.QueryBases.substr(indexToCheck+1); al.Qualities = al.Qualities.substr(indexToCheck+1); }else{ //first base indexToCheck=0; if(toupper(al.QueryBases[indexToCheck]) == 'A'){ //al.Qualities[indexToCheck]=char(offset+baseQualForDeam); al.QueryBases = al.QueryBases.substr(indexToCheck+1); al.Qualities = al.Qualities.substr(indexToCheck+1); } } } }else{ cerr << "Wrong state" << endl; return 1; } } }//end of paired end else{//we consider single reads to have been sequenced from 5' to 3' if(al.IsReverseStrand()){ //need to consider if(!al.IsMapped()){ cerr << "Cannot have reverse complemented unmapped reads: " <<al.Name<< endl; //return 1; } int indexToCheck; //second base indexToCheck=1; if(toupper(al.QueryBases[indexToCheck]) == 'A'){ // al.Qualities[indexToCheck]=char(offset+baseQualForDeam); // cout<<"51 "<<al.QueryBases<<endl; // cout<<"51 "<<al.Qualities<<endl; al.QueryBases = al.QueryBases.substr(indexToCheck+1); al.Qualities = al.Qualities.substr(indexToCheck+1); // cout<<"52 "<<al.QueryBases<<endl; // cout<<"52 "<<al.Qualities<<endl; }else{ //first base indexToCheck=0; if(toupper(al.QueryBases[indexToCheck]) == 'A'){ // al.Qualities[indexToCheck]=char(offset+baseQualForDeam); // cout<<"61 "<<al.QueryBases<<endl; // cout<<"61 "<<al.Qualities<<endl; al.QueryBases = al.QueryBases.substr(indexToCheck+1); al.Qualities = al.Qualities.substr(indexToCheck+1); // cout<<"62 "<<al.QueryBases<<endl; // cout<<"62 "<<al.Qualities<<endl; } } //last indexToCheck=al.QueryBases.length()-1; if(toupper(al.QueryBases[indexToCheck]) == 'A'){ // al.Qualities[indexToCheck]=char(offset+baseQualForDeam); // cout<<"21 "<<al.QueryBases<<endl; // cout<<"21 "<<al.Qualities<<endl; al.QueryBases = al.QueryBases.substr(0,indexToCheck); al.Qualities = al.Qualities.substr(0, indexToCheck); // cout<<"22 "<<al.QueryBases<<endl; // cout<<"22 
"<<al.Qualities<<endl; } }else{ int indexToCheck; //first base indexToCheck=0; if(toupper(al.QueryBases[indexToCheck]) == 'T'){ // al.Qualities[indexToCheck]=char(offset+baseQualForDeam); // cout<<"11 "<<al.QueryBases<<endl; // cout<<"11 "<<al.Qualities<<endl; al.QueryBases = al.QueryBases.substr(indexToCheck+1); al.Qualities = al.Qualities.substr(indexToCheck+1); // cout<<"12 "<<al.QueryBases<<endl; // cout<<"12 "<<al.Qualities<<endl; } //second to last indexToCheck=al.QueryBases.length()-2; if(toupper(al.QueryBases[indexToCheck]) == 'T'){ // al.Qualities[indexToCheck]=char(offset+baseQualForDeam); // cout<<"31 "<<al.QueryBases<<endl; // cout<<"31 "<<al.Qualities<<endl; al.QueryBases = al.QueryBases.substr(0,indexToCheck); al.Qualities = al.Qualities.substr(0, indexToCheck); // cout<<"32 "<<al.QueryBases<<endl; // cout<<"32 "<<al.Qualities<<endl; }else{ //last indexToCheck=al.QueryBases.length()-1; if(toupper(al.QueryBases[indexToCheck]) == 'T'){ // al.Qualities[indexToCheck]=char(offset+baseQualForDeam); // cout<<"41 "<<al.QueryBases<<endl; // cout<<"41 "<<al.Qualities<<endl; al.QueryBases = al.QueryBases.substr(0,indexToCheck); al.Qualities = al.Qualities.substr(0, indexToCheck); // cout<<"42 "<<al.QueryBases<<endl; // cout<<"42 "<<al.Qualities<<endl; } } } }//end of single end writer.SaveAlignment(al); }// while ( reader.GetNextAlignment(al) ) { reader.Close(); writer.Close(); return 0; }
int main (int argc, char *argv[]) { int minBaseQuality = 0; string usage=string(""+string(argv[0])+" [in BAM file] [in VCF file] [chr name] [deam out BAM] [not deam out BAM]"+ "\nThis program divides aligned single end reads into potentially deaminated\n"+ "\nreads and the puts the rest into another bam file if the deaminated positions are not called as the alternative base in the VCF.\n"+ "\nThis is like filterDeaminatedVCF but it loads the VCF before then labels the reads instead of doing it on the fly\n"+ "\nwhich is good if you have many reads in the bam file.\n"+ "\nTip: if you do not need one of them, use /dev/null as your output\n"+ "\narguments:\n"+ "\t"+"--bq [base qual] : Minimum base quality to flag a deaminated site (Default: "+stringify(minBaseQuality)+")\n"+ "\t"+"--1000g [vcf file] : VCF file from 1000g to get the putative A and T positions in modern humans (Default: "+vcf1000g+")\n"+ "\n"); if(argc == 1 || argc < 4 || (argc == 2 && (string(argv[0]) == "-h" || string(argv[0]) == "--help") ) ){ cerr << "Usage "<<usage<<endl; return 1; } for(int i=1;i<(argc-2);i++){ if(string(argv[i]) == "--bq"){ minBaseQuality=destringify<int>(argv[i+1]); i++; continue; } if(string(argv[i]) == "--1000g"){ vcf1000g=string(argv[i+1]); i++; continue; } } unsigned int maxSizeChromosome=250000000;//larger than chr1 hg19 bool * hasCnoT; bool * hasGnoA; bool * thousandGenomesHasA; bool * thousandGenomesHasT; cerr<<"Trying to allocating memory"<<endl; try{ hasCnoT = new bool[ maxSizeChromosome ]; hasGnoA = new bool[ maxSizeChromosome ]; thousandGenomesHasA = new bool[ maxSizeChromosome ]; thousandGenomesHasT = new bool[ maxSizeChromosome ]; }catch(bad_alloc& exc){ cerr<<"ERROR: allocating memory failed"<<endl; return 1; } cerr<<"Success in allocating memory"<<endl; for(unsigned int i = 0;i<maxSizeChromosome;i++){ hasCnoT[i]=false; hasGnoA[i]=false; thousandGenomesHasA[i]=false; thousandGenomesHasT[i]=false; } string bamfiletopen = string( argv[ argc-5 ] ); string vcffiletopen 
= string( argv[ argc-4 ] ); string chrname = string( argv[ argc-3 ] ); string deambam = string( argv[ argc-2 ] ); string nondeambam = string( argv[ argc-1 ] ); cerr<<"Reading consensus VCF "<<vcffiletopen<<" ... "<<endl; VCFreader vcfr (vcffiletopen, // vcffiletopen+".tbi", // chrname, // 1, // maxSizeChromosome, 0); while(vcfr.hasData()){ SimpleVCF * toprint=vcfr.getData(); if(toprint->getRef().length() != 1 ) continue; //if the VCF has a at least one G but no A if( toprint->hasAtLeastOneG() && !toprint->hasAtLeastOneA() ){ hasGnoA[ toprint->getPosition() ] =true; } if( toprint->hasAtLeastOneC() && !toprint->hasAtLeastOneT() ){ hasCnoT[ toprint->getPosition() ] =true; } } cerr<<"done reading VCF"<<endl; cerr<<"Reading 1000g VCF :"<<vcf1000g<<" ..."<<endl; string line1000g; ifstream myFile1000g; myFile1000g.open(vcf1000g.c_str(), ios::in); if (myFile1000g.is_open()){ while ( getline (myFile1000g,line1000g)){ vector<string> fields=allTokens(line1000g,'\t'); //0 chr //1 pos //2 id //3 ref //4 alt //check if same chr if(fields[0] != chrname){ cerr <<"Error, wrong chromosome in 1000g file for line= "<<line1000g<<endl; return 1; } //skip indels if(fields[3].size() != 1 || fields[4].size() != 1 ) continue; char ref=toupper(fields[3][0]); char alt=toupper(fields[4][0]); unsigned int pos=destringify<unsigned int>( fields[1] ); thousandGenomesHasA[ pos ] = ( (ref=='A') || (alt=='A') ); thousandGenomesHasT[ pos ] = ( (ref=='T') || (alt=='T') ); } myFile1000g.close(); }else{ cerr <<"Unable to open file "<<vcf1000g<<endl; return 1; } cerr<<"done reading 1000g VCF"<<endl; BamReader reader; if ( !reader.Open(bamfiletopen) ) { cerr << "Could not open input BAM file"<< bamfiletopen << endl; return 1; } //positioning the bam file int refid=reader.GetReferenceID(chrname); if(refid < 0){ cerr << "Cannot retrieve the reference ID for "<< chrname << endl; return 1; } //cout<<"redif "<<refid<<endl; //setting the BAM reader at that position reader.SetRegion(refid, 0, refid, -1); 
vector<RefData> testRefData=reader.GetReferenceData(); const SamHeader header = reader.GetHeader(); const RefVector references = reader.GetReferenceData(); BamWriter writerDeam; if ( !writerDeam.Open(deambam, header, references) ) { cerr << "Could not open output BAM file" << endl; return 1; } BamWriter writerNoDeam; if ( !writerNoDeam.Open(nondeambam, header, references) ) { cerr << "Could not open output BAM file" << endl; return 1; } unsigned int totalReads =0; unsigned int deaminatedReads =0; unsigned int ndeaminatedReads =0; unsigned int skipped =0; //iterating over the alignments for these regions BamAlignment al; int i; while ( reader.GetNextAlignment(al) ) { // cerr<<al.Name<<endl; //skip unmapped if(!al.IsMapped()){ skipped++; continue; } //skip paired end ! if(al.IsPaired() ){ continue; // cerr<<"Paired end not yet coded"<<endl; // return 1; } string reconstructedReference = reconstructRef(&al); char refeBase; char readBase; bool isDeaminated; if(al.Qualities.size() != reconstructedReference.size()){ cerr<<"Quality line is not the same size as the reconstructed reference"<<endl; return 1; } isDeaminated=false; if(al.IsReverseStrand()){ //first base next to 3' i = 0 ; refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); if( readBase == 'A' && int(al.Qualities[i]-offset) >= minBaseQuality){ //isDeaminated=true; } if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } if( hasGnoA[al.Position+1] && !thousandGenomesHasA[al.Position+1] ) isDeaminated=true; // transformRef(&refeBase,&readBase); // vcfr.repositionIterator(chrname,al.Position+1,al.Position+1); // while(vcfr.hasData()){ // SimpleVCF * toprint=vcfr.getData(); // // cout<<*toprint<<endl; // //skip deletions in the alt // if(toprint->getRef().length() != 1 ) // continue; // if(toprint->getRef()[0] != refeBase){ // cerr<<reconstructedReference<<endl; // cerr<<al.Position<<endl; // cerr<<numberOfDeletions(&al)<<endl; // cerr<<"Problem1 position "<<*toprint<<" does 
not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; // exit(1); // } // //if the VCF has a at least one G but no A // if( toprint->hasAtLeastOneG() && // !toprint->hasAtLeastOneA() ){ // isDeaminated=true; // } // } } //second base next to 3' i = 1; refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //refeBase == 'G' && if( readBase == 'A' && int(al.Qualities[i]-offset) >= minBaseQuality){ //isDeaminated=true; } if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } if( hasGnoA[al.Position+2] && !thousandGenomesHasA[al.Position+2] ) isDeaminated=true; // transformRef(&refeBase,&readBase); // vcfr.repositionIterator(chrname,al.Position+2,al.Position+2); // while(vcfr.hasData()){ // SimpleVCF * toprint=vcfr.getData(); // // cout<<*toprint<<endl; // //skip deletions in the alt // if(toprint->getRef().length() != 1 ) // continue; // if(toprint->getRef()[0] != refeBase){ // cerr<<reconstructedReference<<endl; // cerr<<al.Position<<endl; // cerr<<numberOfDeletions(&al)<<endl; // cerr<<"Problem2 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; // exit(1); // } // //if the VCF has at least one G but no A // // if(toprint->hasAtLeastOneG() && // // toprint->getAlt().find("A") == string::npos){ // if( toprint->hasAtLeastOneG() && // !toprint->hasAtLeastOneA() ){ // isDeaminated=true; // } // } } //last base next to 5' i = (al.QueryBases.length()-1) ; refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //refeBase == 'G' && if( readBase == 'A' && int(al.Qualities[i]-offset) >= minBaseQuality){ //isDeaminated=true; } if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } transformRef(&refeBase,&readBase); int lengthMatches=countMatchesRecons(reconstructedReference,0); int positionJump = al.Position+lengthMatches+numberOfDeletions(&al); if(hasGnoA[positionJump] && !thousandGenomesHasA[positionJump] ) isDeaminated=true; // 
vcfr.repositionIterator(chrname,positionJump,positionJump); // while(vcfr.hasData()){ // SimpleVCF * toprint=vcfr.getData(); // //skip deletions in the alt // if(toprint->getRef().length() != 1 ) // continue; // if(toprint->getRef()[0] != refeBase){ // cerr<<reconstructedReference<<endl; // cerr<<al.Position<<endl; // cerr<<lengthMatches<<endl; // cerr<<numberOfDeletions(&al)<<endl; // cerr<<positionJump<<endl; // cerr<<"Problem3 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; // exit(1); // } // //if the VCF has at least one G but no A // if( toprint->hasAtLeastOneG() && // !toprint->hasAtLeastOneA() ){ // isDeaminated=true; // } // } } }else{ //first base next to 5' i = 0; refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //refeBase == 'C' if( readBase == 'T' && int(al.Qualities[i]-offset) >= minBaseQuality){ if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } // transformRef(&refeBase,&readBase); if(hasCnoT[al.Position+1] && !thousandGenomesHasT[al.Position+1] ) isDeaminated=true; // vcfr.repositionIterator(chrname,al.Position+1,al.Position+1); // while(vcfr.hasData()){ // SimpleVCF * toprint=vcfr.getData(); // //cout<<*toprint<<endl; // //skip deletions in the alt // if(toprint->getRef().length() != 1 ) // continue; // if(toprint->getRef()[0] != refeBase){ // cerr<<reconstructedReference<<endl; // cerr<<al.Position<<endl; // cerr<<numberOfDeletions(&al)<<endl; // cerr<<"Problem4 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; // exit(1); // } // //if the VCF has at least one C but no T // if( toprint->hasAtLeastOneC() && // !toprint->hasAtLeastOneT() ){ // isDeaminated=true; // } // } //cout<<al.Position+ } //second last base next to 3' i = (al.QueryBases.length()-2); refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //refeBase == 'C' && if( readBase == 'T' && 
int(al.Qualities[i]-offset) >= minBaseQuality){ if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } //transformRef(&refeBase,&readBase); int lengthMatches=countMatchesRecons(reconstructedReference,1); int positionJump = al.Position+lengthMatches+numberOfDeletions(&al); if(hasCnoT[positionJump] && !thousandGenomesHasT[positionJump] ) isDeaminated=true; // vcfr.repositionIterator(chrname,positionJump,positionJump); // while(vcfr.hasData()){ // SimpleVCF * toprint=vcfr.getData(); // //skip deletions in the alt // if(toprint->getRef().length() != 1 ) // continue; // if(toprint->getRef()[0] != refeBase){ // cerr<<reconstructedReference<<endl; // cerr<<al.Position<<endl; // cerr<<lengthMatches<<endl; // cerr<<numberOfDeletions(&al)<<endl; // cerr<<positionJump<<endl; // cerr<<"Problem5 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; // exit(1); // } // if( toprint->hasAtLeastOneC() && // !toprint->hasAtLeastOneT() ){ // isDeaminated=true; // } // } } //last base next to 3' i = (al.QueryBases.length()-1); refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //&& refeBase == 'C' if( readBase == 'T' && int(al.Qualities[i]-offset) >= minBaseQuality){ if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } transformRef(&refeBase,&readBase); int lengthMatches=countMatchesRecons(reconstructedReference,0); int positionJump = al.Position+lengthMatches+numberOfDeletions(&al); if(hasCnoT[positionJump] && !thousandGenomesHasT[positionJump] ) isDeaminated=true; // vcfr.repositionIterator(chrname,positionJump,positionJump); // while(vcfr.hasData()){ // SimpleVCF * toprint=vcfr.getData(); // //skip deletions in the alt // if(toprint->getRef().length() != 1 ) // continue; // if(toprint->getRef()[0] != refeBase){ // cerr<<reconstructedReference<<endl; // cerr<<al.Position<<endl; // cerr<<lengthMatches<<endl; // cerr<<numberOfDeletions(&al)<<endl; // cerr<<positionJump<<endl; // 
cerr<<"Problem6 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; // exit(1); // } // if( toprint->hasAtLeastOneC() && // !toprint->hasAtLeastOneT() ){ // isDeaminated=true; // } // } } } totalReads++; if(isDeaminated){ deaminatedReads++; writerDeam.SaveAlignment(al); }else{ ndeaminatedReads++; writerNoDeam.SaveAlignment(al); } }//end for each read reader.Close(); writerDeam.Close(); writerNoDeam.Close(); delete(hasCnoT); delete(hasGnoA); cerr<<"Program finished sucessfully, out of "<<totalReads<<" mapped reads (skipped: "<<skipped<<" reads) we flagged "<<deaminatedReads<<" as deaminated and "<<ndeaminatedReads<<" as not deaminated"<<endl; return 0; }
// close requested BAM files bool BamMultiReaderPrivate::CloseFiles(const std::vector<std::string>& filenames) { bool errorsEncountered = false; m_errorString.clear(); // iterate over filenames std::vector<std::string>::const_iterator filesIter = filenames.begin(); std::vector<std::string>::const_iterator filesEnd = filenames.end(); for (; filesIter != filesEnd; ++filesIter) { const std::string& filename = (*filesIter); if (filename.empty()) continue; // iterate over readers std::vector<MergeItem>::iterator readerIter = m_readers.begin(); std::vector<MergeItem>::iterator readerEnd = m_readers.end(); for (; readerIter != readerEnd; ++readerIter) { MergeItem& item = (*readerIter); BamReader* reader = item.Reader; if (reader == 0) continue; // if reader matches requested filename if (reader->GetFilename() == filename) { // remove reader's entry from alignment cache m_alignmentCache->Remove(reader); // clean up reader & its alignment if (!reader->Close()) { m_errorString.append(1, '\t'); m_errorString.append(reader->GetErrorString()); m_errorString.append(1, '\n'); errorsEncountered = true; } delete reader; reader = 0; // delete reader's alignment entry BamAlignment* alignment = item.Alignment; delete alignment; alignment = 0; // remove reader from reader list m_readers.erase(readerIter); // on match, just go on to next filename // (no need to keep looking and item iterator is invalid now anyway) break; } } } // make sure we clean up properly if all readers were closed if (m_readers.empty()) { // clean up merger if (m_alignmentCache) { m_alignmentCache->Clear(); delete m_alignmentCache; m_alignmentCache = 0; } // reset merge flags m_hasUserMergeOrder = false; m_mergeOrder = BamMultiReader::RoundRobinMerge; } // return whether all readers closed OK return !errorsEncountered; }
//{{{ bool merge_sorted_files(string out_file_name, bool merge_sorted_files(string out_file_name, int buff_count, string header_text, RefVector &ref) { map<string,BamReader*> bam_readers; priority_queue< BamAlignment, vector<BamAlignment>, inter_chrom_rev_sort > q; for (int i = 0; i < buff_count; ++i) { stringstream temp_name; temp_name << out_file_name << i; BamReader *reader = new BamReader(); if ( !reader->Open(temp_name.str()) ) { cerr << "sort ERROR: could not open " << temp_name.str() << " for reading... Aborting." << endl; return false; } bam_readers[temp_name.str()] = reader; // place an item from each bam onto the q BamAlignment al; if (reader->GetNextAlignment(al)) q.push(al); } BamWriter merged_writer; if ( !merged_writer.Open(out_file_name, header_text, ref) ) { cerr << "sort ERROR: could not open " << out_file_name << " for writing." << endl; return false; } while (!q.empty()) { BamAlignment al = q.top(); q.pop(); merged_writer.SaveAlignment(al); BamReader *reader = bam_readers[al.Filename]; BamAlignment new_al; if (reader->GetNextAlignment(new_al)) q.push(new_al); } merged_writer.Close(); //close and remove temp files map<string,BamReader*>::iterator it; for (it = bam_readers.begin(); it != bam_readers.end(); ++it) { BamReader *reader = it->second; reader->Close(); delete reader; remove(it->first.c_str()); } return true; }
// Intersect a BAM file against the "B" BED file.  Depending on the flags,
// either writes passing alignments as BAM (stdout) or reports BED overlaps.
void BedIntersect::IntersectBam(string bamFile) {

    // load the "B" bed file into a map so
    // that we can easily compare "A" to it for overlaps
    _bedB = new BedFile(_bedBFile);
    _bedB->loadBedFileIntoMap();

    // create a dummy BED A file for printing purposes if not
    // using BAM output.
    if (_bamOutput == false) {
        _bedA = new BedFile(_bedAFile);
        _bedA->bedType = 12;
    }

    // open the BAM file
    BamReader reader;
    BamWriter writer;
    reader.Open(bamFile);

    // get header & reference information
    string bamHeader = reader.GetHeaderText();
    RefVector refs   = reader.GetReferenceData();

    // open a BAM output to stdout if we are writing BAM
    if (_bamOutput == true) {
        // set compression mode
        BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
        if ( _isUncompressedBam )
            compressionMode = BamWriter::Uncompressed;
        writer.SetCompressionMode(compressionMode);
        // open our BAM writer
        writer.Open("stdout", bamHeader, refs);
    }

    vector<BED> hits;
    hits.reserve(100);

    BamAlignment bam;
    while (reader.GetNextAlignment(bam)) {

        // save an unaligned read if -v
        // NOTE(review): this writes via `writer` even when _bamOutput is
        // false (writer never opened) — preserved as-is; confirm -v is only
        // meaningful with BAM output.
        if (!bam.IsMapped()) {
            if (_noHit == true)
                writer.SaveAlignment(bam);
            continue;
        }

        // break alignment into discrete blocks
        bedVector bed_blocks;
        string chrom = refs.at(bam.RefID).RefName;
        GetBamBlocks(bam, chrom, bed_blocks, false, true);

        // create a basic BED entry from the BAM alignment
        BED bed;
        MakeBedFromBam(bam, chrom, bed_blocks, bed);

        bool overlapsFound = false;
        if (_obeySplits == true) {
            // simplified from ((bam&&splits)||(!bam&&splits)):
            // find the hits that overlap with the full span of the blocked BED
            _bedB->allHits(bed.chrom, bed.start, bed.end, bed.strand,
                           hits, _sameStrand, _diffStrand,
                           _overlapFraction, _reciprocal);
            // find the overlaps between the blocks in A and B
            overlapsFound = FindBlockedOverlaps(bed, bed_blocks, hits, _bamOutput);
        }
        else if (_bamOutput == true) {
            // whole-span test is sufficient for BAM output without splits
            overlapsFound = _bedB->anyHits(bed.chrom, bed.start, bed.end, bed.strand,
                                           _sameStrand, _diffStrand,
                                           _overlapFraction, _reciprocal);
        }
        else {
            // !_bamOutput && !_obeySplits — the only remaining case
            FindOverlaps(bed, hits);
        }

        // save the BAM alignment if overlap reqs. were met
        if (_bamOutput == true) {
            if ((overlapsFound == true) && (_noHit == false))
                writer.SaveAlignment(bam);
            else if ((overlapsFound == false) && (_noHit == true))
                writer.SaveAlignment(bam);
        }
        hits.clear();
    }

    // close the relevant BAM files.
    reader.Close();
    if (_bamOutput == true) {
        writer.Close();
    }
}
int main (int argc, char *argv[]) { int minBaseQuality = 0; string usage=string(""+string(argv[0])+" [in BAM file] [in VCF file] [chr name] [deam out BAM] [not deam out BAM]"+ "\nThis program divides aligned single end reads into potentially deaminated\n"+ "\nreads and the puts the rest into another bam file if the deaminated positions are not called as the alternative base in the VCF.\n"+ "\nTip: if you do not need one of them, use /dev/null as your output\n"+ "arguments:\n"+ "\t"+"--bq [base qual] : Minimum base quality to flag a deaminated site (Default: "+stringify(minBaseQuality)+")\n"+ "\n"); if(argc == 1 || argc < 4 || (argc == 2 && (string(argv[0]) == "-h" || string(argv[0]) == "--help") ) ){ cerr << "Usage "<<usage<<endl; return 1; } for(int i=1;i<(argc-2);i++){ if(string(argv[i]) == "--bq"){ minBaseQuality=destringify<int>(argv[i+1]); i++; continue; } } string bamfiletopen = string( argv[ argc-5 ] ); string vcffiletopen = string( argv[ argc-4 ] ); string chrname = string( argv[ argc-3 ] ); string deambam = string( argv[ argc-2 ] ); string nondeambam = string( argv[ argc-1 ] ); //dummy reader, will need to reposition anyway VCFreader vcfr (vcffiletopen, vcffiletopen+".tbi", chrname, 1, 1, 0); BamReader reader; if ( !reader.Open(bamfiletopen) ) { cerr << "Could not open input BAM file"<< bamfiletopen << endl; return 1; } // if ( !reader.LocateIndex() ) { // cerr << "The index for the BAM file cannot be located" << endl; // return 1; // } // if ( !reader.HasIndex() ) { // cerr << "The BAM file has not been indexed." 
<< endl; // return 1; // } //positioning the bam file int refid=reader.GetReferenceID(chrname); if(refid < 0){ cerr << "Cannot retrieve the reference ID for "<< chrname << endl; return 1; } //cout<<"redif "<<refid<<endl; //setting the BAM reader at that position reader.SetRegion(refid, 0, refid, -1); vector<RefData> testRefData=reader.GetReferenceData(); const SamHeader header = reader.GetHeader(); const RefVector references = reader.GetReferenceData(); BamWriter writerDeam; if ( !writerDeam.Open(deambam, header, references) ) { cerr << "Could not open output BAM file" << endl; return 1; } BamWriter writerNoDeam; if ( !writerNoDeam.Open(nondeambam, header, references) ) { cerr << "Could not open output BAM file" << endl; return 1; } unsigned int totalReads =0; unsigned int deaminatedReads =0; unsigned int ndeaminatedReads =0; unsigned int skipped =0; //iterating over the alignments for these regions BamAlignment al; int i; while ( reader.GetNextAlignment(al) ) { // cerr<<al.Name<<endl; //skip unmapped if(!al.IsMapped()){ skipped++; continue; } //skip paired end ! 
if(al.IsPaired() ){ continue; // cerr<<"Paired end not yet coded"<<endl; // return 1; } string reconstructedReference = reconstructRef(&al); char refeBase; char readBase; bool isDeaminated; if(al.Qualities.size() != reconstructedReference.size()){ cerr<<"Quality line is not the same size as the reconstructed reference"<<endl; return 1; } isDeaminated=false; if(al.IsReverseStrand()){ //first base next to 3' i = 0 ; refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); if( readBase == 'A' && int(al.Qualities[i]-offset) >= minBaseQuality){ //isDeaminated=true; } if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } transformRef(&refeBase,&readBase); vcfr.repositionIterator(chrname,al.Position+1,al.Position+1); while(vcfr.hasData()){ SimpleVCF * toprint=vcfr.getData(); // cout<<*toprint<<endl; //skip deletions in the alt if(toprint->getRef().length() != 1 ) continue; if(toprint->getRef()[0] != refeBase){ cerr<<reconstructedReference<<endl; cerr<<al.Position<<endl; cerr<<numberOfDeletions(&al)<<endl; cerr<<"Problem1 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; exit(1); } //if the VCF has a at least one G but no A if( toprint->hasAtLeastOneG() && !toprint->hasAtLeastOneA() ){ isDeaminated=true; } } } //second base next to 3' i = 1; refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //refeBase == 'G' && if( readBase == 'A' && int(al.Qualities[i]-offset) >= minBaseQuality){ //isDeaminated=true; } if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } transformRef(&refeBase,&readBase); vcfr.repositionIterator(chrname,al.Position+2,al.Position+2); while(vcfr.hasData()){ SimpleVCF * toprint=vcfr.getData(); // cout<<*toprint<<endl; //skip deletions in the alt if(toprint->getRef().length() != 1 ) continue; if(toprint->getRef()[0] != refeBase){ cerr<<reconstructedReference<<endl; cerr<<al.Position<<endl; cerr<<numberOfDeletions(&al)<<endl; 
cerr<<"Problem2 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; exit(1); } //if the VCF has at least one G but no A // if(toprint->hasAtLeastOneG() && // toprint->getAlt().find("A") == string::npos){ if( toprint->hasAtLeastOneG() && !toprint->hasAtLeastOneA() ){ isDeaminated=true; } } } //last base next to 5' i = (al.QueryBases.length()-1) ; refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //refeBase == 'G' && if( readBase == 'A' && int(al.Qualities[i]-offset) >= minBaseQuality){ //isDeaminated=true; } if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } transformRef(&refeBase,&readBase); int lengthMatches=countMatchesRecons(reconstructedReference,0); int positionJump = al.Position+lengthMatches+numberOfDeletions(&al); vcfr.repositionIterator(chrname,positionJump,positionJump); while(vcfr.hasData()){ SimpleVCF * toprint=vcfr.getData(); //skip deletions in the alt if(toprint->getRef().length() != 1 ) continue; if(toprint->getRef()[0] != refeBase){ cerr<<reconstructedReference<<endl; cerr<<al.Position<<endl; cerr<<lengthMatches<<endl; cerr<<numberOfDeletions(&al)<<endl; cerr<<positionJump<<endl; cerr<<"Problem3 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; exit(1); } //if the VCF has at least one G but no A if( toprint->hasAtLeastOneG() && !toprint->hasAtLeastOneA() ){ isDeaminated=true; } } } }else{ //first base next to 5' i = 0; refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //refeBase == 'C' if( readBase == 'T' && int(al.Qualities[i]-offset) >= minBaseQuality){ if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } transformRef(&refeBase,&readBase); vcfr.repositionIterator(chrname,al.Position+1,al.Position+1); while(vcfr.hasData()){ SimpleVCF * toprint=vcfr.getData(); //cout<<*toprint<<endl; //skip deletions in the alt if(toprint->getRef().length() != 1 ) continue; 
if(toprint->getRef()[0] != refeBase){ cerr<<reconstructedReference<<endl; cerr<<al.Position<<endl; cerr<<numberOfDeletions(&al)<<endl; cerr<<"Problem4 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; exit(1); } //if the VCF has at least one C but no T if( toprint->hasAtLeastOneC() && !toprint->hasAtLeastOneT() ){ isDeaminated=true; } } //cout<<al.Position+ } //second last base next to 3' i = (al.QueryBases.length()-2); refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //refeBase == 'C' && if( readBase == 'T' && int(al.Qualities[i]-offset) >= minBaseQuality){ if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } transformRef(&refeBase,&readBase); int lengthMatches=countMatchesRecons(reconstructedReference,1); int positionJump = al.Position+lengthMatches+numberOfDeletions(&al); vcfr.repositionIterator(chrname,positionJump,positionJump); while(vcfr.hasData()){ SimpleVCF * toprint=vcfr.getData(); //skip deletions in the alt if(toprint->getRef().length() != 1 ) continue; if(toprint->getRef()[0] != refeBase){ cerr<<reconstructedReference<<endl; cerr<<al.Position<<endl; cerr<<lengthMatches<<endl; cerr<<numberOfDeletions(&al)<<endl; cerr<<positionJump<<endl; cerr<<"Problem5 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; exit(1); } if( toprint->hasAtLeastOneC() && !toprint->hasAtLeastOneT() ){ isDeaminated=true; } } } //last base next to 3' i = (al.QueryBases.length()-1); refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //&& refeBase == 'C' if( readBase == 'T' && int(al.Qualities[i]-offset) >= minBaseQuality){ if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } transformRef(&refeBase,&readBase); int lengthMatches=countMatchesRecons(reconstructedReference,0); int positionJump = al.Position+lengthMatches+numberOfDeletions(&al); 
vcfr.repositionIterator(chrname,positionJump,positionJump); while(vcfr.hasData()){ SimpleVCF * toprint=vcfr.getData(); //skip deletions in the alt if(toprint->getRef().length() != 1 ) continue; if(toprint->getRef()[0] != refeBase){ cerr<<reconstructedReference<<endl; cerr<<al.Position<<endl; cerr<<lengthMatches<<endl; cerr<<numberOfDeletions(&al)<<endl; cerr<<positionJump<<endl; cerr<<"Problem6 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; exit(1); } if( toprint->hasAtLeastOneC() && !toprint->hasAtLeastOneT() ){ isDeaminated=true; } } } } totalReads++; if(isDeaminated){ deaminatedReads++; writerDeam.SaveAlignment(al); }else{ ndeaminatedReads++; writerNoDeam.SaveAlignment(al); } }//end for each read reader.Close(); writerDeam.Close(); writerNoDeam.Close(); cerr<<"Program finished sucessfully, out of "<<totalReads<<" mapped reads (skipped: "<<skipped<<" reads) we flagged "<<deaminatedReads<<" as deaminated and "<<ndeaminatedReads<<" as not deaminated"<<endl; return 0; }
// close the BAM files void BamMultiReader::Close(void) { for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) { BamReader* reader = it->first; reader->Close(); // close the reader } }
int main (int argc, char *argv[]) { if( (argc!= 3) || (argc== 2 && string(argv[1]) == "-h") || (argc== 2 && string(argv[1]) == "-help") || (argc== 2 && string(argv[1]) == "--help") ){ cerr<<"Usage:splitByRG [in bam] [out prefix]"<<endl<<"this program creates one bam file per RG in the with the outprefix\nFor example splitByRG in.bam out will create\nout.rg1.bam\nout.rg2.bam\n"<<endl; return 1; } string bamfiletopen = string(argv[1]); // if(!strEndsWith(bamfiletopen,".bam")){ // } string bamDirOutPrefix = string(argv[2]); map<string,BamWriter *> rg2BamWriter; // if(!isDirectory(bamDirOut)){ // cerr<<"ERROR: the out directory does not exist"<<endl; // return 1; // } BamReader reader; if ( !reader.Open(bamfiletopen) ) { cerr << "Could not open input BAM files." << endl; return 1; } SamHeader header = reader.GetHeader(); const RefVector references = reader.GetReferenceData(); vector<RefData> refData=reader.GetReferenceData(); string pID = "splitByRG"; string pName = "splitByRG"; string pCommandLine = ""; for(int i=0;i<(argc);i++){ pCommandLine += (string(argv[i])+" "); } putProgramInHeader(&header,pID,pName,pCommandLine,returnGitHubVersion(string(argv[0]),"..")); SamReadGroupDictionary srgd=header.ReadGroups; for(SamReadGroupConstIterator srgci=srgd.ConstBegin(); srgci<srgd.ConstEnd(); srgci++){ //cout<<*srgci<<endl; const SamReadGroup rg = (*srgci); //cout<<rg.ID<<endl; rg2BamWriter[rg.ID] = new BamWriter(); rg2BamWriter[rg.ID]->Open(bamDirOutPrefix+"."+rg.ID+".bam",header,references); } BamAlignment al; unsigned int total=0; while ( reader.GetNextAlignment(al) ) { // al.SetIsFailedQC(false); // writer.SaveAlignment(al); // if(al.IsMapped () ){ // if(rg2BamWriter.find(refData[al.RefID].RefName) == rg2BamWriter.end()){ //new // rg2BamWriter[refData[al.RefID].RefName] = new BamWriter(); // if ( !rg2BamWriter[refData[al.RefID].RefName]->Open(bamDirOutPrefix+"."+refData[al.RefID].RefName+".bam",header,references) ) { // cerr << "Could not open output BAM file "<< 
bamDirOutPrefix<<"."<<refData[al.RefID].RefName<<".bam" << endl; // return 1; // } // }else{ // rg2BamWriter[refData[al.RefID].RefName]->SaveAlignment(al); // } // }else{ // unmapped.SaveAlignment(al); // } if(al.HasTag("RG")){ string rgTag; al.GetTag("RG",rgTag); //cout<<rgTag<<endl; if(rg2BamWriter.find(rgTag) == rg2BamWriter.end()){ //new cerr<<"Found new RG "<<rgTag<<endl; rg2BamWriter[rgTag] = new BamWriter(); if ( !rg2BamWriter[rgTag]->Open(bamDirOutPrefix+"."+rgTag+".bam",header,references) ) { cerr << "Could not open output BAM file "<< bamDirOutPrefix<<"."<<rgTag<<".bam" << endl; return 1; } rg2BamWriter[rgTag]->SaveAlignment(al); }else{ rg2BamWriter[rgTag]->SaveAlignment(al); } }else{ string rgTag="unknown"; //cout<<rgTag<<endl; if(rg2BamWriter.find(rgTag) == rg2BamWriter.end()){ //new cerr<<"Found new RG "<<rgTag<<endl; rg2BamWriter[rgTag] = new BamWriter(); if ( !rg2BamWriter[rgTag]->Open(bamDirOutPrefix+"."+rgTag+".bam",header,references) ) { cerr << "Could not open output BAM file "<< bamDirOutPrefix<<"."<<rgTag<<".bam" << endl; return 1; } rg2BamWriter[rgTag]->SaveAlignment(al); }else{ rg2BamWriter[rgTag]->SaveAlignment(al); } // cerr << "Cannot get RG tag for " << al.Name<<endl; // return 1; } total++; } //while al reader.Close(); // writer.Close(); // unmapped.Close(); map<string,BamWriter *>::iterator rg2BamWriterIt; for (rg2BamWriterIt =rg2BamWriter.begin(); rg2BamWriterIt!=rg2BamWriter.end(); rg2BamWriterIt++){ rg2BamWriterIt->second->Close(); } cerr<<"Wrote succesfully "<<total<<" reads"<<endl; return 0; }
int main (int argc, char *argv[]) { if( (argc== 1) || (argc== 2 && string(argv[1]) == "-h") || (argc== 2 && string(argv[1]) == "-help") || (argc== 2 && string(argv[1]) == "--help") ){ cout<<"Usage:"<<endl; cout<<""<<endl; cout<<"plotQualScore input.bam"<<endl; return 1; } string bamfiletopen = string(argv[1]); BamReader reader; if ( !reader.Open(bamfiletopen) ) { cerr << "Could not open input BAM files." << endl; return 1; } // if ( !reader.LocateIndex() ){ // cerr << "warning: cannot locate index for file " << bamfiletopen<<endl; // //return 1; // } BamAlignment al; BamAlignment al2; bool unsurePEorSE=true; bool pe=true; int strLength=-1; int vecLengthToUse=-1; map<short,unsigned long> ** counterA = 0; map<short,unsigned long> ** counterC = 0; map<short,unsigned long> ** counterG = 0; map<short,unsigned long> ** counterT = 0; int lengthIndex1=0; int lengthIndex2=0; string seqInd1; string seqInd2; string qualInd1; string qualInd2; int offsetInd2; while ( reader.GetNextAlignment(al) ) { if(unsurePEorSE){ strLength=al.QueryBases.length(); if(al.IsPaired()){ pe=true; vecLengthToUse=2*strLength; }else{ pe=false; vecLengthToUse=strLength; } string index1; string index2; if(al.HasTag("XI")){ al.GetTag("XI",index1); vecLengthToUse+=index1.length(); lengthIndex1=index1.length(); } if(al.HasTag("XJ")){ al.GetTag("XJ",index2); vecLengthToUse+=index2.length(); lengthIndex2=index2.length(); } counterA = new map<short,unsigned long> * [vecLengthToUse]; counterC = new map<short,unsigned long> * [vecLengthToUse]; counterG = new map<short,unsigned long> * [vecLengthToUse]; counterT = new map<short,unsigned long> * [vecLengthToUse]; for(int i=0;i<vecLengthToUse;i++){ counterA[i]=new map<short,unsigned long> (); counterC[i]=new map<short,unsigned long> (); counterG[i]=new map<short,unsigned long> (); counterT[i]=new map<short,unsigned long> (); for(short k=minQualScore;k<=maxQualScore;k++){ (*counterA[i])[k]=0; (*counterC[i])[k]=0; (*counterG[i])[k]=0; (*counterT[i])[k]=0; } } 
unsurePEorSE=false; }else{ if(pe && !al.IsPaired()){ cerr << "Cannot have unpaired reads in PE mode" << endl; return 1; } if(!pe && al.IsPaired()){ cerr << "Cannot have unpaired reads in SE mode" << endl; return 1; } } if(al.QueryBases.length() != al.Qualities.length()){ cerr << "Cannot have different lengths for sequence and quality" << endl; return 1; } if(int(al.QueryBases.length()) != strLength){ cerr << "Cannot have different lengths for sequence and quality" << endl; return 1; } if(pe){ if(al.IsFirstMate()){ reader.GetNextAlignment(al2); if(al2.QueryBases.length() != al2.Qualities.length()){ cerr << "Cannot have different lengths for sequence and quality" << endl; return 1; } }else{ cerr << "First read should be the first mate" << endl; return 1; } } //cycle for(unsigned int i=0;i<al.QueryBases.length();i++){ short x=(short(al.Qualities[i])-qualOffset); if(al.QueryBases[i] == 'A'){ (*counterA[i])[x]++; } if(al.QueryBases[i] == 'C'){ (*counterC[i])[x]++; } if(al.QueryBases[i] == 'G'){ (*counterG[i])[x]++; } if(al.QueryBases[i] == 'T'){ (*counterT[i])[x]++; } } //The indices for al and al2 should hopefully be the same if(lengthIndex1>0){ al.GetTag("XI",seqInd1); al.GetTag("YI",qualInd1); int j; for(int i=0;i<lengthIndex1;i++){ j=i+al.QueryBases.length(); short x=(short(qualInd1[i])-qualOffset); if(seqInd1[i] == 'A'){ (*counterA[j])[x]++; } if(seqInd1[i] == 'C'){ (*counterC[j])[x]++; } if(seqInd1[i] == 'G'){ (*counterG[j])[x]++; } if(seqInd1[i] == 'T'){ (*counterT[j])[x]++; } } } if(pe){ offsetInd2=al.QueryBases.length()+lengthIndex1+al2.QueryBases.length(); int j; for(unsigned int i=0;i<al2.QueryBases.length();i++){ j=i+al.QueryBases.length()+lengthIndex1; short x=(short(al2.Qualities[i])-qualOffset); if(al2.QueryBases[i] == 'A'){ (*counterA[j])[x]++; } if(al2.QueryBases[i] == 'C'){ (*counterC[j])[x]++; } if(al2.QueryBases[i] == 'G'){ (*counterG[j])[x]++; } if(al2.QueryBases[i] == 'T'){ (*counterT[j])[x]++; } } }else{ 
offsetInd2=al.QueryBases.length()+lengthIndex1; } //The indices for al and al2 should hopefully be the same if(lengthIndex2>0){ al.GetTag("XJ",seqInd2); al.GetTag("YJ",qualInd2); int j; for(int i=0;i<lengthIndex2;i++){ j=offsetInd2+i; short x=(short(qualInd2[i])-qualOffset); if(seqInd2[i] == 'A'){ (*counterA[j])[x]++; } if(seqInd2[i] == 'C'){ (*counterC[j])[x]++; } if(seqInd2[i] == 'G'){ (*counterG[j])[x]++; } if(seqInd2[i] == 'T'){ (*counterT[j])[x]++; } } } } reader.Close(); cout<<"cycle\t"<<"nuc\t"; for(short k=minQualScore;k<maxQualScore;k++){ cout<<k<<"\t"; } cout<<maxQualScore<<endl; for(int i=0;i<vecLengthToUse;i++){ cout<<(i+1)<<"\t"; cout<<"A\t"; for(short k=minQualScore;k<maxQualScore;k++){ cout<<(*counterA[i])[k]<<"\t"; } cout<<(*counterA[i])[maxQualScore]<<endl; cout<<(i+1)<<"\t"; cout<<"C\t"; for(short k=minQualScore;k<maxQualScore;k++){ cout<<(*counterC[i])[k]<<"\t"; } cout<<(*counterC[i])[maxQualScore]<<endl; cout<<(i+1)<<"\t"; cout<<"G\t"; for(short k=minQualScore;k<maxQualScore;k++){ cout<<(*counterG[i])[k]<<"\t"; } cout<<(*counterG[i])[maxQualScore]<<endl; cout<<(i+1)<<"\t"; cout<<"T\t"; for(short k=minQualScore;k<maxQualScore;k++){ cout<<(*counterT[i])[k]<<"\t"; } cout<<(*counterT[i])[maxQualScore]<<endl; } return 0; }