int main (int argc, char *argv[]) { if( (argc== 1) || (argc== 2 && string(argv[1]) == "-h") || (argc== 2 && string(argv[1]) == "-help") || (argc== 2 && string(argv[1]) == "--help") ){ cout<<"Usage:"<<endl; cout<<""<<endl; cout<<"plotQualScore input.bam"<<endl; return 1; } string bamfiletopen = string(argv[1]); BamReader reader; if ( !reader.Open(bamfiletopen) ) { cerr << "Could not open input BAM files." << endl; return 1; } // if ( !reader.LocateIndex() ){ // cerr << "warning: cannot locate index for file " << bamfiletopen<<endl; // //return 1; // } BamAlignment al; BamAlignment al2; bool unsurePEorSE=true; bool pe=true; int strLength=-1; int vecLengthToUse=-1; map<short,unsigned long> ** counterA = 0; map<short,unsigned long> ** counterC = 0; map<short,unsigned long> ** counterG = 0; map<short,unsigned long> ** counterT = 0; int lengthIndex1=0; int lengthIndex2=0; string seqInd1; string seqInd2; string qualInd1; string qualInd2; int offsetInd2; while ( reader.GetNextAlignment(al) ) { if(unsurePEorSE){ strLength=al.QueryBases.length(); if(al.IsPaired()){ pe=true; vecLengthToUse=2*strLength; }else{ pe=false; vecLengthToUse=strLength; } string index1; string index2; if(al.HasTag("XI")){ al.GetTag("XI",index1); vecLengthToUse+=index1.length(); lengthIndex1=index1.length(); } if(al.HasTag("XJ")){ al.GetTag("XJ",index2); vecLengthToUse+=index2.length(); lengthIndex2=index2.length(); } counterA = new map<short,unsigned long> * [vecLengthToUse]; counterC = new map<short,unsigned long> * [vecLengthToUse]; counterG = new map<short,unsigned long> * [vecLengthToUse]; counterT = new map<short,unsigned long> * [vecLengthToUse]; for(int i=0;i<vecLengthToUse;i++){ counterA[i]=new map<short,unsigned long> (); counterC[i]=new map<short,unsigned long> (); counterG[i]=new map<short,unsigned long> (); counterT[i]=new map<short,unsigned long> (); for(short k=minQualScore;k<=maxQualScore;k++){ (*counterA[i])[k]=0; (*counterC[i])[k]=0; (*counterG[i])[k]=0; (*counterT[i])[k]=0; } } 
unsurePEorSE=false; }else{ if(pe && !al.IsPaired()){ cerr << "Cannot have unpaired reads in PE mode" << endl; return 1; } if(!pe && al.IsPaired()){ cerr << "Cannot have unpaired reads in SE mode" << endl; return 1; } } if(al.QueryBases.length() != al.Qualities.length()){ cerr << "Cannot have different lengths for sequence and quality" << endl; return 1; } if(int(al.QueryBases.length()) != strLength){ cerr << "Cannot have different lengths for sequence and quality" << endl; return 1; } if(pe){ if(al.IsFirstMate()){ reader.GetNextAlignment(al2); if(al2.QueryBases.length() != al2.Qualities.length()){ cerr << "Cannot have different lengths for sequence and quality" << endl; return 1; } }else{ cerr << "First read should be the first mate" << endl; return 1; } } //cycle for(unsigned int i=0;i<al.QueryBases.length();i++){ short x=(short(al.Qualities[i])-qualOffset); if(al.QueryBases[i] == 'A'){ (*counterA[i])[x]++; } if(al.QueryBases[i] == 'C'){ (*counterC[i])[x]++; } if(al.QueryBases[i] == 'G'){ (*counterG[i])[x]++; } if(al.QueryBases[i] == 'T'){ (*counterT[i])[x]++; } } //The indices for al and al2 should hopefully be the same if(lengthIndex1>0){ al.GetTag("XI",seqInd1); al.GetTag("YI",qualInd1); int j; for(int i=0;i<lengthIndex1;i++){ j=i+al.QueryBases.length(); short x=(short(qualInd1[i])-qualOffset); if(seqInd1[i] == 'A'){ (*counterA[j])[x]++; } if(seqInd1[i] == 'C'){ (*counterC[j])[x]++; } if(seqInd1[i] == 'G'){ (*counterG[j])[x]++; } if(seqInd1[i] == 'T'){ (*counterT[j])[x]++; } } } if(pe){ offsetInd2=al.QueryBases.length()+lengthIndex1+al2.QueryBases.length(); int j; for(unsigned int i=0;i<al2.QueryBases.length();i++){ j=i+al.QueryBases.length()+lengthIndex1; short x=(short(al2.Qualities[i])-qualOffset); if(al2.QueryBases[i] == 'A'){ (*counterA[j])[x]++; } if(al2.QueryBases[i] == 'C'){ (*counterC[j])[x]++; } if(al2.QueryBases[i] == 'G'){ (*counterG[j])[x]++; } if(al2.QueryBases[i] == 'T'){ (*counterT[j])[x]++; } } }else{ 
offsetInd2=al.QueryBases.length()+lengthIndex1; } //The indices for al and al2 should hopefully be the same if(lengthIndex2>0){ al.GetTag("XJ",seqInd2); al.GetTag("YJ",qualInd2); int j; for(int i=0;i<lengthIndex2;i++){ j=offsetInd2+i; short x=(short(qualInd2[i])-qualOffset); if(seqInd2[i] == 'A'){ (*counterA[j])[x]++; } if(seqInd2[i] == 'C'){ (*counterC[j])[x]++; } if(seqInd2[i] == 'G'){ (*counterG[j])[x]++; } if(seqInd2[i] == 'T'){ (*counterT[j])[x]++; } } } } reader.Close(); cout<<"cycle\t"<<"nuc\t"; for(short k=minQualScore;k<maxQualScore;k++){ cout<<k<<"\t"; } cout<<maxQualScore<<endl; for(int i=0;i<vecLengthToUse;i++){ cout<<(i+1)<<"\t"; cout<<"A\t"; for(short k=minQualScore;k<maxQualScore;k++){ cout<<(*counterA[i])[k]<<"\t"; } cout<<(*counterA[i])[maxQualScore]<<endl; cout<<(i+1)<<"\t"; cout<<"C\t"; for(short k=minQualScore;k<maxQualScore;k++){ cout<<(*counterC[i])[k]<<"\t"; } cout<<(*counterC[i])[maxQualScore]<<endl; cout<<(i+1)<<"\t"; cout<<"G\t"; for(short k=minQualScore;k<maxQualScore;k++){ cout<<(*counterG[i])[k]<<"\t"; } cout<<(*counterG[i])[maxQualScore]<<endl; cout<<(i+1)<<"\t"; cout<<"T\t"; for(short k=minQualScore;k<maxQualScore;k++){ cout<<(*counterT[i])[k]<<"\t"; } cout<<(*counterT[i])[maxQualScore]<<endl; } return 0; }
int main (int argc, char *argv[]) { string usage=string(""+string(argv[0])+" [in BAM file]"+ "\nThis program reads a BAM file and computes the error rate for each cycle\n"+ // "\nreads and the puts the rest into another bam file.\n"+ // "\nTip: if you do not need one of them, use /dev/null as your output\n"+ // "arguments:\n"+ // "\t"+"--bq [base qual] : Minimum base quality to flag a deaminated site (Default: "+stringify(minBaseQuality)+")\n"+ "\n"); if(argc == 1 || (argc == 2 && (string(argv[0]) == "-h" || string(argv[0]) == "--help") ) ){ cerr << "Usage "<<usage<<endl; return 1; } // for(int i=1;i<(argc-2);i++){ // if(string(argv[i]) == "--bq"){ // minBaseQuality=destringify<int>(argv[i+1]); // i++; // continue; // } // } string bamfiletopen = string( argv[ argc-1 ] ); // string deambam = string( argv[ argc-2 ] ); // string nondeambam = string( argv[ argc-1 ] ); BamReader reader; if ( !reader.Open(bamfiletopen) ) { cerr << "Could not open input BAM file"<< bamfiletopen << endl; return 1; } //iterating over the alignments for these regions BamAlignment al; bool pairedEnd=false; bool firstRead=true; while ( reader.GetNextAlignment(al) ) { if(firstRead){ //reads are either all paired end or single end, I don't allow a mix numberOfCycles=al.QueryBases.size(); // cout<<"numberOfCycles "<<numberOfCycles<<endl; if(al.IsPaired() ){ pairedEnd=true; matches = vector<unsigned int> (2*numberOfCycles,0); mismatches = vector<unsigned int> (2*numberOfCycles,0); typesOfMismatches = vector< vector<unsigned int> >(); for(int i=0;i<12;i++) typesOfMismatches.push_back( vector<unsigned int> (2*numberOfCycles,0) ); }else{ matches = vector<unsigned int> ( numberOfCycles,0); mismatches = vector<unsigned int> ( numberOfCycles,0); typesOfMismatches = vector< vector<unsigned int> >(); for(int i=0;i<12;i++) typesOfMismatches.push_back( vector<unsigned int> ( numberOfCycles,0) ); } firstRead=false; } if( ( pairedEnd && !al.IsPaired()) || ( !pairedEnd && al.IsPaired()) ){ cerr<<"Read 
"<<al.Name<<" is wrong, cannot have a mixture of paired and unpaired read for this program"<<endl; return 1; } //skip unmapped if(!al.IsMapped()) continue; if(numberOfCycles!=int(al.QueryBases.size())){ cerr<<"The length of read "<<al.Name<<" is wrong, should be "<<numberOfCycles<<"bp"<<endl; return 1; } string reconstructedReference = reconstructRef(&al); if(al.Qualities.size() != reconstructedReference.size()){ cerr<<"Quality line is not the same size as the reconstructed reference"<<endl; return 1; } if( pairedEnd ){ if( al.IsFirstMate() ){ //start cycle 0 if( al.IsReverseStrand() ){ increaseCounters(al,reconstructedReference,numberOfCycles-1,-1); //start cycle numberOfCycles-1 }else{ increaseCounters(al,reconstructedReference,0 , 1); //start cycle 0 } }else{ if( al.IsSecondMate() ){ if( al.IsReverseStrand() ){ increaseCounters(al,reconstructedReference,2*numberOfCycles-1,-1); //start cycle 2*numberOfCycles-1 }else{ increaseCounters(al,reconstructedReference,numberOfCycles , 1); //start cycle numberOfCycles } }else{ cerr<<"Reads "<<al.Name<<" must be either first or second mate"<<endl; return 1; } } }else{ //single end if( al.IsReverseStrand() ){ increaseCounters(al,reconstructedReference,numberOfCycles-1,-1); //start cycle numberOfCycles-1 }else{ increaseCounters(al,reconstructedReference,0 , 1); //start cycle 0 } } }//end while each read reader.Close(); cout<<"cycle\tmatches\tmismatches\tmismatches%\tA>C\tA>C%\tA>G\tA>G%\tA>T\tA>T%\tC>A\tC>A%\tC>G\tC>G%\tC>T\tC>T%\tG>A\tG>A%\tG>C\tG>C%\tG>T\tG>T%\tT>A\tT>A%\tT>C\tT>C%\tT>G\tT>G%"<<endl; for(unsigned int i=0;i<matches.size();i++){ cout<<(i+1); if( (matches[i]+mismatches[i]!=0) ) cout<<"\t"<<matches[i]<<"\t"<<mismatches[i]<<"\t"<< 100.0*(double(mismatches[i])/double(matches[i]+mismatches[i])) ; else cout<<"\t"<<matches[i]<<"\t"<<mismatches[i]<<"\tNA"; for(int j=0;j<12;j++){ cout<<"\t"<<typesOfMismatches[j][i]; if( (matches[i]+mismatches[i]!=0) ) 
cout<<"\t"<<100.0*double(typesOfMismatches[j][i])/double(matches[i]+mismatches[i]); else cout<<"\tNA"; } cout<<endl; } return 0; }
int main (int argc, char *argv[]) { bool mapped =false; bool unmapped=false; const string usage=string(string(argv[0])+" [options] input.bam out.bam"+"\n\n"+ "This program takes a BAM file as input and produces\n"+ "another where the putative deaminated bases have\n"+ "have been cut\n"+ "\n"+ "Options:\n"); // "\t"+"-u , --unmapped" +"\n\t\t"+"For an unmapped bam file"+"\n"+ // "\t"+"-m , --mapped" +"\n\t\t"+"For an mapped bam file"+"\n"); if( (argc== 1) || (argc== 2 && string(argv[1]) == "-h") || (argc== 2 && string(argv[1]) == "-help") || (argc== 2 && string(argv[1]) == "--help") ){ cout<<"Usage:"<<endl; cout<<usage<<endl; cout<<""<<endl; return 1; } // for(int i=1;i<(argc-1);i++){ //all but the last arg // if(strcmp(argv[i],"-m") == 0 || strcmp(argv[i],"--mapped") == 0 ){ // mapped=true; // continue; // } // if(strcmp(argv[i],"-u") == 0 || strcmp(argv[i],"--unmapped") == 0 ){ // unmapped=true; // continue; // } // cerr<<"Unknown option "<<argv[i] <<" exiting"<<endl; // return 1; // } if(argc != 3){ cerr<<"Error: Must specify the input and output BAM files"; return 1; } string inbamFile =argv[argc-2]; string outbamFile=argv[argc-1]; // if(!mapped && !unmapped){ // cerr << "Please specify whether you reads are mapped or unmapped" << endl; // return 1; // } // if(mapped && unmapped){ // cerr << "Please specify either mapped or unmapped but not both" << endl; // return 1; // } BamReader reader; if ( !reader.Open(inbamFile) ) { cerr << "Could not open input BAM files." 
<< endl; return 1; } vector<RefData> testRefData=reader.GetReferenceData(); const SamHeader header = reader.GetHeader(); const RefVector references = reader.GetReferenceData(); BamWriter writer; if ( !writer.Open(outbamFile, header, references) ) { cerr << "Could not open output BAM file" << endl; return 1; } BamAlignment al; // BamAlignment al2; // bool al2Null=true; while ( reader.GetNextAlignment(al) ) { if(al.IsPaired() ){ if(al.IsFirstMate() ){ //5' end, need to check first base only if(al.IsReverseStrand()){ // if(!al.IsMapped()){ cerr << "Cannot have reverse complemented unmapped reads: " <<al.Name<< endl; //return 1; } int indexToCheck; //last indexToCheck=al.QueryBases.length()-1; if(toupper(al.QueryBases[indexToCheck]) == 'A'){ //al.Qualities[indexToCheck]=char(offset+baseQualForDeam); al.QueryBases = al.QueryBases.substr(0,indexToCheck); al.Qualities = al.Qualities.substr(0, indexToCheck); } }else{ int indexToCheck; //first base indexToCheck=0; if(toupper(al.QueryBases[indexToCheck]) == 'T'){ //al.Qualities[indexToCheck]=char(offset+baseQualForDeam); al.QueryBases = al.QueryBases.substr(indexToCheck+1); al.Qualities = al.Qualities.substr(indexToCheck+1); } } }else{ //3' end, need to check last two bases only if( al.IsSecondMate() ){ if(al.IsReverseStrand()){ // if(!al.IsMapped()){ cerr << "Cannot have reverse complemented unmapped reads: " <<al.Name<< endl; //return 1; } int indexToCheck; //second to last indexToCheck=al.QueryBases.length()-2; if(toupper(al.QueryBases[indexToCheck]) == 'T'){ //al.Qualities[indexToCheck]=char(offset+baseQualForDeam); al.QueryBases = al.QueryBases.substr(0,indexToCheck); al.Qualities = al.Qualities.substr(0, indexToCheck); }else{ //last indexToCheck=al.QueryBases.length()-1; if(toupper(al.QueryBases[indexToCheck]) == 'T'){ //al.Qualities[indexToCheck]=char(offset+baseQualForDeam); al.QueryBases = al.QueryBases.substr(0,indexToCheck); al.Qualities = al.Qualities.substr(0, indexToCheck); } } }else{ int indexToCheck; //second 
base indexToCheck=1; if(toupper(al.QueryBases[indexToCheck]) == 'A'){ //al.Qualities[indexToCheck]=char(offset+baseQualForDeam); al.QueryBases = al.QueryBases.substr(indexToCheck+1); al.Qualities = al.Qualities.substr(indexToCheck+1); }else{ //first base indexToCheck=0; if(toupper(al.QueryBases[indexToCheck]) == 'A'){ //al.Qualities[indexToCheck]=char(offset+baseQualForDeam); al.QueryBases = al.QueryBases.substr(indexToCheck+1); al.Qualities = al.Qualities.substr(indexToCheck+1); } } } }else{ cerr << "Wrong state" << endl; return 1; } } }//end of paired end else{//we consider single reads to have been sequenced from 5' to 3' if(al.IsReverseStrand()){ //need to consider if(!al.IsMapped()){ cerr << "Cannot have reverse complemented unmapped reads: " <<al.Name<< endl; //return 1; } int indexToCheck; //second base indexToCheck=1; if(toupper(al.QueryBases[indexToCheck]) == 'A'){ // al.Qualities[indexToCheck]=char(offset+baseQualForDeam); // cout<<"51 "<<al.QueryBases<<endl; // cout<<"51 "<<al.Qualities<<endl; al.QueryBases = al.QueryBases.substr(indexToCheck+1); al.Qualities = al.Qualities.substr(indexToCheck+1); // cout<<"52 "<<al.QueryBases<<endl; // cout<<"52 "<<al.Qualities<<endl; }else{ //first base indexToCheck=0; if(toupper(al.QueryBases[indexToCheck]) == 'A'){ // al.Qualities[indexToCheck]=char(offset+baseQualForDeam); // cout<<"61 "<<al.QueryBases<<endl; // cout<<"61 "<<al.Qualities<<endl; al.QueryBases = al.QueryBases.substr(indexToCheck+1); al.Qualities = al.Qualities.substr(indexToCheck+1); // cout<<"62 "<<al.QueryBases<<endl; // cout<<"62 "<<al.Qualities<<endl; } } //last indexToCheck=al.QueryBases.length()-1; if(toupper(al.QueryBases[indexToCheck]) == 'A'){ // al.Qualities[indexToCheck]=char(offset+baseQualForDeam); // cout<<"21 "<<al.QueryBases<<endl; // cout<<"21 "<<al.Qualities<<endl; al.QueryBases = al.QueryBases.substr(0,indexToCheck); al.Qualities = al.Qualities.substr(0, indexToCheck); // cout<<"22 "<<al.QueryBases<<endl; // cout<<"22 
"<<al.Qualities<<endl; } }else{ int indexToCheck; //first base indexToCheck=0; if(toupper(al.QueryBases[indexToCheck]) == 'T'){ // al.Qualities[indexToCheck]=char(offset+baseQualForDeam); // cout<<"11 "<<al.QueryBases<<endl; // cout<<"11 "<<al.Qualities<<endl; al.QueryBases = al.QueryBases.substr(indexToCheck+1); al.Qualities = al.Qualities.substr(indexToCheck+1); // cout<<"12 "<<al.QueryBases<<endl; // cout<<"12 "<<al.Qualities<<endl; } //second to last indexToCheck=al.QueryBases.length()-2; if(toupper(al.QueryBases[indexToCheck]) == 'T'){ // al.Qualities[indexToCheck]=char(offset+baseQualForDeam); // cout<<"31 "<<al.QueryBases<<endl; // cout<<"31 "<<al.Qualities<<endl; al.QueryBases = al.QueryBases.substr(0,indexToCheck); al.Qualities = al.Qualities.substr(0, indexToCheck); // cout<<"32 "<<al.QueryBases<<endl; // cout<<"32 "<<al.Qualities<<endl; }else{ //last indexToCheck=al.QueryBases.length()-1; if(toupper(al.QueryBases[indexToCheck]) == 'T'){ // al.Qualities[indexToCheck]=char(offset+baseQualForDeam); // cout<<"41 "<<al.QueryBases<<endl; // cout<<"41 "<<al.Qualities<<endl; al.QueryBases = al.QueryBases.substr(0,indexToCheck); al.Qualities = al.Qualities.substr(0, indexToCheck); // cout<<"42 "<<al.QueryBases<<endl; // cout<<"42 "<<al.Qualities<<endl; } } } }//end of single end writer.SaveAlignment(al); }// while ( reader.GetNextAlignment(al) ) { reader.Close(); writer.Close(); return 0; }
virtual void main() { //init QTextStream out(stdout); BamReader reader; NGSHelper::openBAM(reader, getInfile("in")); FastqOutfileStream out1(getOutfile("out1"), false); FastqOutfileStream out2(getOutfile("out2"), false); long long c_unpaired = 0; long long c_paired = 0; int max_cached = 0; //iterate through reads BamAlignment al; QHash<QByteArray, BamAlignment> al_cache; while (reader.GetNextAlignment(al)) { //skip secondary alinments if(!al.IsPrimaryAlignment()) continue; //skip unpaired if(!al.IsPaired()) { ++c_unpaired; continue; } QByteArray name(al.Name.data()); //TODO use QByteArray::fromStdString (when upgraded to Qt5.4) //store cached read when we encounter the mate if (al_cache.contains(name)) { BamAlignment mate = al_cache.take(name); //out << name << " [AL] First: " << al.IsFirstMate() << " Reverse: " << al.IsReverseStrand() << " Seq: " << al.QueryBases.data() << endl; //out << name << " [MA] First: " << mate.IsFirstMate() << " Reverse: " << mate.IsReverseStrand() << " Seq: " << mate.QueryBases.data() << endl; if (al.IsFirstMate()) { write(out1, al, al.IsReverseStrand()); write(out2, mate, mate.IsReverseStrand()); } else { write(out1, mate, mate.IsReverseStrand()); write(out2, al, al.IsReverseStrand()); } ++c_paired; } //cache read for later retrieval else { al_cache.insert(name, al); } max_cached = std::max(max_cached, al_cache.size()); } reader.Close(); out1.close(); out2.close(); //write debug output out << "Pair reads (written) : " << c_paired << endl; out << "Unpaired reads (skipped) : " << c_unpaired << endl; out << "Unmatched paired reads (skipped): " << al_cache.size() << endl; out << endl; out << "Maximum cached reads : " << max_cached << endl; }
// Runs the window-overlap search for every alignment of a BAM file against
// the "B" BED file.
//
// Each mapped alignment is converted into a BED record (name suffixed with
// /1 or /2 for mates, score = mapping quality, strand from the reverse flag).
// With BAM output enabled, alignments are written to stdout when they have
// window overlaps, or when they have none if _noHit is set (unmapped reads
// count as having none). Otherwise the overlaps are handled by
// FindWindowOverlaps().
//
// Exits with status 1 when the BAM file cannot be opened.
void BedWindow::WindowIntersectBam(string bamFile) {

    // load the "B" bed file into a map so
    // that we can easily compare "A" to it for overlaps
    _bedB->loadBedFileIntoMap();

    // open the BAM file
    BamReader reader;
    BamWriter writer;
    if (!reader.Open(bamFile)) {
        cerr << "Failed to open BAM file " << bamFile << endl;
        exit(1);
    }

    // get header & reference information
    string bamHeader = reader.GetHeaderText();
    RefVector refs   = reader.GetReferenceData();

    // open a BAM output to stdout if we are writing BAM
    if (_bamOutput == true) {
        // set compression mode
        BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
        if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed;
        writer.SetCompressionMode(compressionMode);
        // open our BAM writer
        writer.Open("stdout", bamHeader, refs);
    }

    vector<BED> hits;   // vector of potential hits
    hits.reserve(100);  // reserve some space

    _bedA->bedType = 6;
    BamAlignment bam;

    while (reader.GetNextAlignment(bam)) {

        if (!bam.IsMapped()) {
            // An unmapped read cannot overlap anything, so it only belongs in
            // the "no hit" output. The writer is open only in BAM-output mode.
            if (_bamOutput == true && _noHit == true) {
                writer.SaveAlignment(bam);
            }
            continue;
        }

        BED a;
        a.chrom = refs.at(bam.RefID).RefName;
        a.start = bam.Position;
        a.end   = bam.GetEndPosition(false, false);

        // build the name field from the BAM alignment.
        a.name = bam.Name;
        if (bam.IsFirstMate())  a.name += "/1";
        if (bam.IsSecondMate()) a.name += "/2";

        a.score  = ToString(bam.MapQuality);
        a.strand = "+";
        if (bam.IsReverseStrand()) a.strand = "-";

        if (_bamOutput == true) {
            const bool overlapsFound = FindOneOrMoreWindowOverlaps(a);
            // keep hits normally, keep misses when reporting non-overlapping reads
            if (overlapsFound == true) {
                if (_noHit == false) writer.SaveAlignment(bam);
            }
            else {
                if (_noHit == true) writer.SaveAlignment(bam);
            }
        }
        else {
            FindWindowOverlaps(a, hits);
            hits.clear();
        }
    }

    // close the relevant BAM files.
    reader.Close();
    if (_bamOutput == true) {
        writer.Close();
    }
}
// Computes genome coverage from the mapped alignments of a BAM file.
//
// The BAM reference list is turned into the genome definition, then every
// mapped alignment (optionally filtered by strand) adds coverage according
// to the active mode: paired-ChIP fragments, fixed fragment size, full
// alignment blocks, or only the 5' / 3' end position. Results for the last
// chromosome, empty chromosomes and the overall summary are reported after
// the file has been read.
//
// Exits with status 1 when the BAM file cannot be opened.
void BedGenomeCoverage::CoverageBam(string bamFile) {

    ResetChromCoverage();

    // open the BAM file
    BamReader reader;
    if (!reader.Open(bamFile)) {
        cerr << "Failed to open BAM file " << bamFile << endl;
        exit(1);
    }

    // header text and reference sequences of the BAM file
    string header  = reader.GetHeaderText();
    RefVector refs = reader.GetReferenceData();

    // the BAM references act as the "genome file"
    _genome = new GenomeFile(refs);

    BamAlignment bam;
    while (reader.GetNextAlignment(bam)) {

        // unaligned reads contribute nothing
        if (!bam.IsMapped()) {
            continue;
        }

        // with dUTP libraries the second mate of a mapped pair reports the
        // opposite strand
        bool effectiveReverse = bam.IsReverseStrand();
        if (_dUTP && bam.IsPaired() && bam.IsMateMapped() && bam.IsSecondMate()) {
            effectiveReverse = !bam.IsReverseStrand();
        }

        // honour the strand requested by the user, if any
        if (_filterByStrand == true) {
            const bool wantReverse = (_requestedStrand == "-");
            if (wantReverse != effectiveReverse) {
                continue;
            }
        }

        // chromosome name and inclusive start/end of the alignment
        string chrom(refs.at(bam.RefID).RefName);
        CHRPOS start = bam.Position;
        CHRPOS end   = bam.GetEndPosition(false, false) - 1;

        // moving on to another chromosome?
        if (chrom != _currChromName) {
            StartNewChrom(chrom);
        }

        if (_pair_chip_) {
            // paired reads must form a proper pair with a mapped mate
            if (bam.IsPaired() && !(bam.IsProperPair() && bam.IsMateMapped())) {
                continue;
            }

            // expected layout: left read on the forward strand, right read on
            // the reverse strand; anything else is skipped
            const bool leftReadIsReverse  = (bam.Position < bam.MatePosition) && bam.IsReverseStrand();
            if (leftReadIsReverse) {
                continue;
            }
            const bool leftMateIsReverse  = (bam.MatePosition < bam.Position) && bam.IsMateReverseStrand();
            if (leftMateIsReverse) {
                continue;
            }

            // only the first mate adds the fragment, extended towards its mate
            if (bam.IsFirstMate() && bam.IsReverseStrand()) {
                // mate lies to the left
                AddCoverage(bam.MatePosition, end);
            }
            else if (bam.IsFirstMate() && bam.IsMateReverseStrand()) {
                // mate lies to the right
                AddCoverage(start, start + abs(bam.InsertSize) - 1);
            }
        }
        else if (_haveSize) {
            // extend every read to the fixed fragment size
            if (!bam.IsReverseStrand()) {
                AddCoverage(start, start + _fragmentSize - 1);
            }
            else if (end < _fragmentSize) {
                // fragment would start before the chromosome start
                AddCoverage(0, end);
            }
            else {
                AddCoverage(end + 1 - _fragmentSize, end);
            }
        }
        else if (!_only_5p_end && !_only_3p_end) {
            // Blocks are always split on "D" CIGAR ops; with -split they are
            // split on "N" ops as well.
            const bool splitOnN = _obeySplits;
            bedVector bedBlocks;
            GetBamBlocks(bam, refs.at(bam.RefID).RefName, bedBlocks, true, splitOnN);
            AddBlockedCoverage(bedBlocks);
        }
        else if (_only_5p_end) {
            CHRPOS pos = bam.IsReverseStrand() ? end : start;
            AddCoverage(pos, pos);
        }
        else if (_only_3p_end) {
            CHRPOS pos = bam.IsReverseStrand() ? start : end;
            AddCoverage(pos, pos);
        }
    }

    // close the BAM
    reader.Close();

    // flush the chromosome that was being processed last
    ReportChromCoverage(_currChromCoverage, _currChromSize, _currChromName, _currChromDepthHist);

    // chromosomes without any alignment
    PrintEmptyChromosomes();

    // overall coverage, if requested
    PrintFinalCoverage();
}
// Intersects every mapped alignment of a BAM file with the "B" BED file.
//
// Each mapped alignment is converted into a BED record (name suffixed with
// /1 or /2 for mates, score = mapping quality, strand from the reverse flag).
// When _obeySplits is set, the alignment is broken into discrete blocks and
// overlaps are searched per block instead of over the whole alignment span.
// With BAM output enabled, alignments are written to stdout when an overlap
// was found, or when none was found if _noHit is set; otherwise overlaps are
// handled by FindOverlaps().
//
// Exits with status 1 when the BAM file cannot be opened.
void BedIntersect::IntersectBam(string bamFile) {

    // load the "B" bed file into a map so
    // that we can easily compare "A" to it for overlaps
    _bedB->loadBedFileIntoMap();

    // open the BAM file
    BamReader reader;
    BamWriter writer;
    if (!reader.Open(bamFile)) {
        cerr << "Failed to open BAM file " << bamFile << endl;
        exit(1);
    }

    // get header & reference information
    string header  = reader.GetHeaderText();
    RefVector refs = reader.GetReferenceData();

    // open a BAM output to stdout if we are writing BAM
    if (_bamOutput == true) {
        writer.Open("stdout", header, refs, _isUncompressedBam);
    }

    vector<BED> hits;
    hits.reserve(100);  // reserve some space

    _bedA->bedType = 6;
    BamAlignment bam;

    while (reader.GetNextAlignment(bam)) {

        if (!bam.IsMapped()) {
            continue;
        }

        BED a;
        a.chrom = refs.at(bam.RefID).RefName;
        a.start = bam.Position;
        a.end   = bam.GetEndPosition(false);

        // build the name field from the BAM alignment.
        a.name = bam.Name;
        if (bam.IsFirstMate())  a.name += "/1";
        if (bam.IsSecondMate()) a.name += "/2";

        a.score  = ToString(bam.MapQuality);
        a.strand = "+";
        if (bam.IsReverseStrand()) a.strand = "-";

        if (_bamOutput == true) {
            bool overlapsFound = false;

            if (_obeySplits == false) {
                // treat the BAM alignment as a single "block"
                overlapsFound = FindOneOrMoreOverlap(a);
            }
            else {
                // split the BAM alignment into discrete blocks and
                // look for overlaps only within each block.
                // we don't want to split on "D" ops, hence the "false"
                bedVector bedBlocks;
                getBamBlocks(bam, refs, bedBlocks, false);

                for (vector<BED>::iterator blockItr = bedBlocks.begin();
                     blockItr != bedBlocks.end(); ++blockItr) {
                    // test the block itself, not the whole alignment span;
                    // otherwise gaps between blocks would count as overlaps
                    if (FindOneOrMoreOverlap(*blockItr) == true) {
                        overlapsFound = true;
                    }
                }
            }

            // keep hits normally, keep misses when reporting non-overlapping reads
            if (overlapsFound == true) {
                if (_noHit == false) writer.SaveAlignment(bam);
            }
            else {
                if (_noHit == true) writer.SaveAlignment(bam);
            }
        }
        else {
            if (_obeySplits == false) {
                // treat the BAM alignment as a single BED "block"
                FindOverlaps(a, hits);
                hits.clear();
            }
            else {
                // split the BAM alignment into discrete BED blocks and
                // look for overlaps only within each block.
                bedVector bedBlocks;
                getBamBlocks(bam, refs, bedBlocks, false);

                for (vector<BED>::const_iterator blockItr = bedBlocks.begin();
                     blockItr != bedBlocks.end(); ++blockItr) {
                    FindOverlaps(*blockItr, hits);
                    hits.clear();
                }
            }
        }
    }

    // close the relevant BAM files.
    reader.Close();
    if (_bamOutput == true) {
        writer.Close();
    }
}
/* Input file format: BAM file */ void Distinguish( string mappingFile, map< string, bool > & hapSNP, map< string, string > & refSeq, string outfilePrefix ) { // [WARNING] If refSeq is not empty than it's BS data, otherwirse is resequencing data. string outfile1 = outfilePrefix + ".1.bam"; string outfile2 = outfilePrefix + ".2.bam"; string outHomoF = outfilePrefix + ".h**o.bam"; string outUdeci = outfilePrefix + ".ambiguity.bam"; // The ambiguity reads or the reads which has't any SNP. BamReader h_I; // bam input file handle if ( !h_I.Open( mappingFile ) ) cerr << "[ERROR]: " << h_I.GetErrorString() << endl; // "header" and "references" from BAM files, these are required by BamWriter const SamHeader header = h_I.GetHeader(); const RefVector references = h_I.GetReferenceData(); BamWriter h_O1, h_O2, h_U, h_H; if ( !h_O1.Open( outfile1, header, references ) ) { cerr << "Cannot open output BAM file: " << outfile1 << endl; exit(1); } if ( !h_O2.Open( outfile2, header, references ) ) { cerr << "Cannot open output BAM file: " << outfile2 << endl; exit(1); } if ( !h_U.Open ( outUdeci, header, references ) ) { cerr << "Cannot open output BAM file: " << outUdeci << endl; exit(1); } if ( !h_H.Open ( outHomoF, header, references ) ) { cerr << "Cannot open output BAM file: " << outHomoF << endl; exit(1); } int readsNumberRecord(0); SamLine samline; // Samline class SamExt sam; BamAlignment al; map< string, pair<BamAlignment, SamExt> > firstMateAl; // record the first mate reads alignment, HIstory problem to be like this struct!! string refstr; // Just For BS data. bool isC2T ( false ); // Just For BS data. while ( h_I.GetNextAlignment( al ) ) { ++readsNumberRecord; if ( readsNumberRecord % 1000000 == 0 ) cerr << "Have been dealed " << readsNumberRecord << " lines. 
" << local_time (); if ( !al.IsMapped() ) continue; //if ( al.InsertSize == 0 || al.RefID != al.MateRefID ) continue; samline._RID = al.Name; samline._Flag= al.AlignmentFlag; samline._ref_id = h_I.GetReferenceData()[al.RefID].RefName; samline._position = al.Position + 1; // Position (0-base starts in BamTools), but I need 1-base starts samline._mapQ = al.MapQuality; // MateRefID == -1 means mate read is unmapping samline._XorD = ( al.MateRefID > -1 ) ? h_I.GetReferenceData()[al.MateRefID].RefName : "*"; samline._coor = al.MatePosition + 1; // Position (0-base starts in BamTools), but I need 1-base starts samline._seq = al.QueryBases; samline._insert_size = abs (al.InsertSize); if ( samline._ref_id.compare( "BIG_ID_CAT" ) == 0 ) continue; // Ignore "BIG_ID_CAT" // get cigar; samline._cigar = itoa(al.CigarData[0].Length); samline._cigar.append( 1, al.CigarData[0].Type ); for ( size_t i(1); i < al.CigarData.size(); ++i ) { samline._cigar += itoa(al.CigarData[i].Length); samline._cigar.append( 1, al.CigarData[i].Type ); } sam.assign( &samline ); /*********************************** For BS Data *********************************************/ if ( !refSeq.empty() ) { // If the data is BS data, we should modify the QueryBases. if ( !refSeq.count( samline._ref_id ) ) { cerr << "[ERROR]There's no such reference in the reference file. 
" << samline._ref_id << endl; exit(1); } if ( al.IsFirstMate() && !al.IsReverseStrand() ) { isC2T = true; } else if ( al.IsFirstMate() && al.IsReverseStrand() ) { isC2T = false; } else if ( al.IsSecondMate() && !al.IsReverseStrand() ) { isC2T = false; } else if ( al.IsSecondMate() && al.IsReverseStrand() ) { isC2T = true; } else { cerr << "[ERROR MATCH] " << endl; exit(1); } refstr.assign( refSeq[samline._ref_id], sam.ref_start() - 1, sam.ref_end() - sam.ref_start() + 1 ); modifyBSreadBases( samline._ref_id, sam.ref_start (), sam.read_start(), sam.cigar_seq(), refstr, sam._seq, hapSNP, isC2T ); } /********************************** End For BS Data *******************************************/ // Consider the mate pair reads if ( !firstMateAl.count(al.Name) && (al.MateRefID > -1) ) { firstMateAl[al.Name] = std::make_pair( al, sam ); } else { // Consider the mate pair reads if ( !firstMateAl.count(al.Name) ) { switch ( Decide( sam, hapSNP ) ) { case 1 : h_O1.SaveAlignment( al ); break; // Hap1 case 2 : h_O2.SaveAlignment( al ); break; // Hap2 case 0 : h_U.SaveAlignment ( al ); break; // Ambiguity default: // This alignment didn't contain any hete SNP. 
h_H.SaveAlignment ( al ); // Homozygous reads } } else { int mark1 = Decide( firstMateAl[al.Name].second, hapSNP ); int mark2 = Decide( sam, hapSNP ); if ( mark1 == 1 && mark2 == 1 ) { h_O1.SaveAlignment( firstMateAl[al.Name].first ); h_O1.SaveAlignment( al ); } else if ( (mark1 == 1 && mark2 == 0) || (mark1 == 0 && mark2 == 1) ) { h_O1.SaveAlignment( firstMateAl[al.Name].first ); h_O1.SaveAlignment( al ); } else if ( (mark1 == 1 && mark2 == -1) || (mark1 == -1 && mark2 == 1) ) { h_O1.SaveAlignment( firstMateAl[al.Name].first ); h_O1.SaveAlignment( al ); } else if ( mark1 == 2 && mark2 == 2 ) { h_O2.SaveAlignment( firstMateAl[al.Name].first ); h_O2.SaveAlignment( al ); } else if ( (mark1 == 2 && mark2 == 0 ) || (mark1 == 0 && mark2 == 2) ) { h_O2.SaveAlignment( firstMateAl[al.Name].first ); h_O2.SaveAlignment( al ); } else if ( (mark1 == 2 && mark2 == -1) || (mark1 == -1 && mark2 == 2) ) { h_O2.SaveAlignment( firstMateAl[al.Name].first ); h_O2.SaveAlignment( al ); } else if ( mark1 == -1 && mark2 == -1 ) { h_H.SaveAlignment ( firstMateAl[al.Name].first ); h_H.SaveAlignment ( al ); } else { h_U.SaveAlignment ( firstMateAl[al.Name].first ); h_U.SaveAlignment ( al ); } firstMateAl.erase( al.Name ); } } } cerr << "------------ Remaind size: " << firstMateAl.size() << endl; for (map< string, pair<BamAlignment, SamExt> >::iterator it( firstMateAl.begin() ); it != firstMateAl.end(); ++it ) { switch ( Decide( it->second.second, hapSNP ) ) { case 1 : h_O1.SaveAlignment( it->second.first ); break; // Hap1 case 2 : h_O2.SaveAlignment( it->second.first ); break; // Hap2 case 0 : h_U.SaveAlignment ( it->second.first ); break; // Ambiguity default: // This alignment didn't contain any hete SNP. 
h_H.SaveAlignment ( it->second.first ); // Homozygous reads } } h_I.Close(); h_U.Close(); h_H.Close(); h_O1.Close(); h_O2.Close(); cerr << ">>>>>>>>>>>>> All Done <<<<<<<<<<<<<<" << endl; cerr << "Write to output file: " << outfile1 << endl; cerr << "Write to output file: " << outfile2 << endl; cerr << "Write to output file: " << outHomoF << endl; cerr << "Write to output file: " << outUdeci << endl; return; }
int main (int argc, char *argv[]) { // bool mapped =false; // bool unmapped=false; int bpToDecrease5=1; int bpToDecrease3=2; const string usage=string(string(argv[0])+" [options] input.bam out.bam"+"\n\n"+ "\tThis program takes a BAM file as input and produces\n"+ "\tanother where the putative deaminated bases have\n"+ "\ta base quality score of "+intStringify(baseQualForDeam)+"\n"+ "\tgiven an "+intStringify(offset)+" offset \n"+ "\n"+ "\tOptions:\n"+ "\t\t"+"-n5" +"\t\t\t"+"Decrease the nth bases surrounding the 5' ends (Default:"+stringify(bpToDecrease5)+") "+"\n"+ "\t\t"+"-n3" +"\t\t\t"+"Decrease the nth bases surrounding the 3' ends (Default:"+stringify(bpToDecrease3)+") "+"\n" ); // "\t"+"-u , --unmapped" +"\n\t\t"+"For an unmapped bam file"+"\n"+ // "\t"+"-m , --mapped" +"\n\t\t"+"For an mapped bam file"+"\n"); if( (argc== 1) || (argc== 2 && string(argv[1]) == "-h") || (argc== 2 && string(argv[1]) == "-help") || (argc== 2 && string(argv[1]) == "--help") ){ cout<<"Usage:"<<endl; cout<<usage<<endl; cout<<""<<endl; return 1; } for(int i=1;i<(argc-2);i++){ //all but the last arg if( string(argv[i]) == "-n5" ){ bpToDecrease5 = destringify<int>(argv[i+1]); i++; continue; } if( string(argv[i]) == "-n3" ){ bpToDecrease3 = destringify<int>(argv[i+1]); i++; continue; } cerr<<"Unknown option "<<argv[i] <<" exiting"<<endl; return 1; } if(argc < 3){ cerr<<"Error: Must specify the input and output BAM files"; return 1; } string inbamFile =argv[argc-2]; string outbamFile=argv[argc-1]; if(inbamFile == outbamFile){ cerr<<"Input and output files are the same"<<endl; return 1; } // if(!mapped && !unmapped){ // cerr << "Please specify whether you reads are mapped or unmapped" << endl; // return 1; // } // if(mapped && unmapped){ // cerr << "Please specify either mapped or unmapped but not both" << endl; // return 1; // } BamReader reader; if ( !reader.Open(inbamFile) ) { cerr << "Could not open input BAM files." 
<< endl; return 1; } vector<RefData> testRefData=reader.GetReferenceData(); SamHeader header = reader.GetHeader(); string pID = "decrQualDeaminated"; string pName = "decrQualDeaminated"; string pCommandLine = ""; for(int i=0;i<(argc);i++){ pCommandLine += (string(argv[i])+" "); } putProgramInHeader(&header,pID,pName,pCommandLine); const RefVector references = reader.GetReferenceData(); BamWriter writer; if ( !writer.Open(outbamFile, header, references) ) { cerr << "Could not open output BAM file" << endl; return 1; } BamAlignment al; // BamAlignment al2; // bool al2Null=true; while ( reader.GetNextAlignment(al) ) { if(al.IsPaired() ){ if(al.IsFirstMate() ){ //5' end, need to check first base only if(al.IsReverseStrand()){ // if(!al.IsMapped()){ cerr << "Cannot have reverse complemented unmapped reads :" <<al.Name<< endl; //return 1; } int indexToCheck; //5' of first mate reversed indexToCheck=al.QueryBases.length()-1; for(int i=0;i<bpToDecrease5;i++){ if(toupper(al.QueryBases[indexToCheck]) == 'A'){ al.Qualities[indexToCheck]=char(offset+baseQualForDeam); } indexToCheck=max(indexToCheck-1,0); } }else{ int indexToCheck; //5' of first mate indexToCheck=0; for(int i=0;i<bpToDecrease5;i++){ //first base if(toupper(al.QueryBases[indexToCheck]) == 'T'){ al.Qualities[indexToCheck]=char(offset+baseQualForDeam); } indexToCheck=min(indexToCheck+1,int(al.Qualities.size())); } } }else{ //3' end, need to check last two bases only if( al.IsSecondMate() ){ if(al.IsReverseStrand()){ // if(!al.IsMapped()){ cerr << "Cannot have reverse complemented unmapped reads :" <<al.Name<< endl; //return 1; } int indexToCheck; //3' of second mate reversed indexToCheck=al.QueryBases.length()-1; for(int i=0;i<bpToDecrease3;i++){ if(toupper(al.QueryBases[indexToCheck]) == 'T'){ al.Qualities[indexToCheck]=char(offset+baseQualForDeam); } indexToCheck=max(indexToCheck-1,0); } }else{ int indexToCheck; //3' of second mate forward indexToCheck=0; for(int i=0;i<bpToDecrease3;i++){ //first base 
if(toupper(al.QueryBases[indexToCheck]) == 'A'){ al.Qualities[indexToCheck]=char(offset+baseQualForDeam); } indexToCheck=min(indexToCheck+1,int(al.Qualities.size())); } } }else{ cerr << "Wrong state" << endl; return 1; } } }//end of paired end else{//we consider single reads to have been sequenced from 5' to 3' if(al.IsReverseStrand()){ //need to consider if(!al.IsMapped()){ cerr << "Cannot have reverse complemented unmapped reads :" <<al.Name<< endl; //return 1; } int indexToCheck; //5' of single read reversed indexToCheck=al.QueryBases.length()-1; for(int i=0;i<bpToDecrease5;i++){ if(toupper(al.QueryBases[indexToCheck]) == 'A'){ al.Qualities[indexToCheck]=char(offset+baseQualForDeam); } indexToCheck=max(indexToCheck-1,0); } //3' of single read reversed indexToCheck=0; for(int i=0;i<bpToDecrease3;i++){ //first base if(toupper(al.QueryBases[indexToCheck]) == 'A'){ al.Qualities[indexToCheck]=char(offset+baseQualForDeam); } indexToCheck=min(indexToCheck+1,int(al.Qualities.size())); } }else{ int indexToCheck; //5' of single read indexToCheck=0; for(int i=0;i<bpToDecrease5;i++){ //first base if(toupper(al.QueryBases[indexToCheck]) == 'T'){ al.Qualities[indexToCheck]=char(offset+baseQualForDeam); } indexToCheck=min(indexToCheck+1,int(al.Qualities.size())); } //3' of single read indexToCheck=al.QueryBases.length()-1; for(int i=0;i<bpToDecrease3;i++){ if(toupper(al.QueryBases[indexToCheck]) == 'T'){ al.Qualities[indexToCheck]=char(offset+baseQualForDeam); } indexToCheck=max(indexToCheck-1,0); } } }//end of single end writer.SaveAlignment(al); }// while ( reader.GetNextAlignment(al) ) { reader.Close(); writer.Close(); cerr<<"Program terminated gracefully"<<endl; return 0; }
int main (int argc, char *argv[]) { if(argc != 4){ cerr<<"This program strips the mapping information and cuts sequences"<<endl; cerr<<"Usage "<<argv[0]<<" [bam file in] [bam file out] [distribution]"<<endl; cerr<<"The distribution is one per line"<<endl; return 1; } string bamfiletopen = string(argv[1]); string bamfiletwrite = string(argv[2]); string fileDist = string(argv[3]); igzstream myFile; string line; vector<int> distToUse; myFile.open(fileDist.c_str(), ios::in); if (myFile.good()){ while ( getline (myFile,line)){ distToUse.push_back( destringify<int>(line) ); } myFile.close(); }else{ cerr << "Unable to open file "<<fileDist<<endl; return 1; } cerr<<"Read "<<distToUse.size()<<" data points "<<endl; cerr<<"Reading "<<bamfiletopen<<" writing to "<<bamfiletwrite<<endl; BamReader reader; BamWriter writer; if ( !reader.Open(bamfiletopen) ) { cerr << "Could not open input BAM files." << endl; return 1; } SamHeader myHeader=reader.GetHeader(); SamProgram sp; string pID = "removeTagsMapping"; string pName = "removeTagsMapping"; string pCommandLine = ""; for(int i=0;i<(argc);i++){ pCommandLine += (string(argv[i])+" "); } putProgramInHeader(&myHeader,pID,pName,pCommandLine,returnGitHubVersion(string(argv[0]),".")); //no @SQ myHeader.Sequences.Clear(); vector< RefData > emptyRefVector; if( !writer.Open(bamfiletwrite,myHeader,emptyRefVector ) ) { cerr << "Could not open output BAM file "<<bamfiletwrite << endl; return 1; } unsigned int readsTotal=0; BamAlignment al; while ( reader.GetNextAlignment(al) ) { //deleting tag data al.TagData=""; //reset the flag // if(al.IsPaired()){ // if(al.IsFirstMate()){ // al.AlignmentFlag = flagFirstPair; // }else{ // al.AlignmentFlag = flagSecondPair; // } // }else{ // } if(al.IsPaired()){ if(al.IsFirstMate()){ al.Name = al.Name+"/1"; }else{ al.Name = al.Name+"/2"; } } al.AlignmentFlag = flagSingleReads; //no ref or positon al.RefID=-1; al.MateRefID=-1; al.Position=-1; al.MatePosition=-1; //no insert size al.InsertSize=0; //no cigar 
al.CigarData.clear(); //no mapping quality al.MapQuality=0; int length = distToUse[ randomInt(0,distToUse.size()-1) ]; length = MIN( length, al.Length); al.QueryBases = al.QueryBases.substr(0,length); al.Qualities = al.Qualities.substr( 0,length); writer.SaveAlignment(al); readsTotal++; } reader.Close(); writer.Close(); cerr<<"Program "<<argv[0]<<" terminated gracefully, looked at "<<readsTotal<<endl; return 0; }