// print BamAlignment in FASTQ format // N.B. - uses QueryBases NOT AlignedBases void ConvertTool::ConvertToolPrivate::PrintFastq(const BamAlignment& a) { // @BamAlignment.Name // BamAlignment.QueryBases // + // BamAlignment.Qualities // // N.B. - QueryBases are reverse-complemented (& Qualities reversed) if aligned to reverse strand . // Name is appended "/1" or "/2" if paired-end, to reflect which mate this entry is. // handle paired-end alignments string name = a.Name; if ( a.IsPaired() ) name.append( (a.IsFirstMate() ? "/1" : "/2") ); // handle reverse strand alignment - bases & qualities string qualities = a.Qualities; string sequence = a.QueryBases; if ( a.IsReverseStrand() ) { Utilities::Reverse(qualities); Utilities::ReverseComplement(sequence); } // write to output stream m_out << "@" << name << endl << sequence << endl << "+" << endl << qualities << endl; }
int DataStatisticsTool::Execute() { // iterate over reads in BAM file(s) BamAlignment alignObj; while(bamReader.GetNextAlignment(alignObj)) { if (alignObj.IsDuplicate()) continue; if (alignObj.IsFailedQC()) continue; if (!alignObj.IsMapped()) continue; if (!alignObj.IsPrimaryAlignment()) continue; if (alignObj.IsPaired() && !alignObj.IsProperPair()) continue; if (alignObj.IsPaired() && !alignObj.IsMateMapped()) continue; if (!alignObj.HasTag("MD")) continue; // // debug // GenericBamAlignmentTools::printBamAlignmentCigar(alignObj); // GenericBamAlignmentTools::printBamAlignmentMD(alignObj); // shift InDel GenericBamAlignmentTools::leftShiftInDel(alignObj); // // debug // GenericBamAlignmentTools::printBamAlignmentCigar(alignObj); // GenericBamAlignmentTools::printBamAlignmentMD(alignObj); // get the alignment sequences string alignRead; string alignGenome; GenericBamAlignmentTools::getAlignmentSequences(alignObj, alignRead, alignGenome); // update the statistics statistics.update(alignRead, alignGenome); } // print to screen cout << statistics << endl; // statistics.printMatchMismatch(); // close BAM reader bamReader.Close(); // close Fasta genomeFasta.Close(); return 1; }
// use current input alignment to update BAM file alignment stats void StatsTool::StatsToolPrivate::ProcessAlignment(const BamAlignment& al) { // increment total alignment counter ++numReads; // check the paired-independent flags if ( al.IsDuplicate() ) ++numDuplicates; if ( al.IsFailedQC() ) ++numFailedQC; if ( al.IsMapped() ) ++numMapped; // check forward/reverse strand if ( al.IsReverseStrand() ) ++numReverseStrand; else ++numForwardStrand; // if alignment is paired-end if ( al.IsPaired() ) { // increment PE counter ++numPaired; // increment first mate/second mate counters if ( al.IsFirstMate() ) ++numFirstMate; if ( al.IsSecondMate() ) ++numSecondMate; // if alignment is mapped, check mate status if ( al.IsMapped() ) { // if mate mapped if ( al.IsMateMapped() ) ++numBothMatesMapped; // else singleton else ++numSingletons; } // check for explicit proper pair flag if ( al.IsProperPair() ) ++numProperPair; // store insert size for first mate if ( settings->IsShowingInsertSizeSummary && al.IsFirstMate() && (al.InsertSize != 0) ) { int insertSize = abs(al.InsertSize); insertSizes.push_back( insertSize ); } } }
// print BamAlignment in SAM format void ConvertTool::ConvertToolPrivate::PrintSam(const BamAlignment& a) { // tab-delimited // <QNAME> <FLAG> <RNAME> <POS> <MAPQ> <CIGAR> <MRNM> <MPOS> <ISIZE> <SEQ> <QUAL> [ <TAG>:<VTYPE>:<VALUE> [...] ] // write name & alignment flag m_out << a.Name << "\t" << a.AlignmentFlag << "\t"; // write reference name if ( (a.RefID >= 0) && (a.RefID < (int)m_references.size()) ) m_out << m_references[a.RefID].RefName << "\t"; else m_out << "*\t"; // write position & map quality m_out << a.Position+1 << "\t" << a.MapQuality << "\t"; // write CIGAR const vector<CigarOp>& cigarData = a.CigarData; if ( cigarData.empty() ) m_out << "*\t"; else { vector<CigarOp>::const_iterator cigarIter = cigarData.begin(); vector<CigarOp>::const_iterator cigarEnd = cigarData.end(); for ( ; cigarIter != cigarEnd; ++cigarIter ) { const CigarOp& op = (*cigarIter); m_out << op.Length << op.Type; } m_out << "\t"; } // write mate reference name, mate position, & insert size if ( a.IsPaired() && (a.MateRefID >= 0) && (a.MateRefID < (int)m_references.size()) ) { if ( a.MateRefID == a.RefID ) m_out << "=\t"; else m_out << m_references[a.MateRefID].RefName << "\t"; m_out << a.MatePosition+1 << "\t" << a.InsertSize << "\t"; } else m_out << "*\t0\t0\t"; // write sequence if ( a.QueryBases.empty() ) m_out << "*\t"; else m_out << a.QueryBases << "\t"; // write qualities if ( a.Qualities.empty() || (a.Qualities.at(0) == (char)0xFF) ) m_out << "*"; else m_out << a.Qualities; // write tag data const char* tagData = a.TagData.c_str(); const size_t tagDataLength = a.TagData.length(); size_t index = 0; while ( index < tagDataLength ) { // write tag name string tagName = a.TagData.substr(index, 2); m_out << "\t" << tagName << ":"; index += 2; // get data type char type = a.TagData.at(index); ++index; switch ( type ) { case (Constants::BAM_TAG_TYPE_ASCII) : m_out << "A:" << tagData[index]; ++index; break; case (Constants::BAM_TAG_TYPE_INT8) : case (Constants::BAM_TAG_TYPE_UINT8) : 
m_out << "i:" << (int)tagData[index]; ++index; break; case (Constants::BAM_TAG_TYPE_INT16) : m_out << "i:" << BamTools::UnpackSignedShort(&tagData[index]); index += sizeof(int16_t); break; case (Constants::BAM_TAG_TYPE_UINT16) : m_out << "i:" << BamTools::UnpackUnsignedShort(&tagData[index]); index += sizeof(uint16_t); break; case (Constants::BAM_TAG_TYPE_INT32) : m_out << "i:" << BamTools::UnpackSignedInt(&tagData[index]); index += sizeof(int32_t); break; case (Constants::BAM_TAG_TYPE_UINT32) : m_out << "i:" << BamTools::UnpackUnsignedInt(&tagData[index]); index += sizeof(uint32_t); break; case (Constants::BAM_TAG_TYPE_FLOAT) : m_out << "f:" << BamTools::UnpackFloat(&tagData[index]); index += sizeof(float); break; case (Constants::BAM_TAG_TYPE_HEX) : case (Constants::BAM_TAG_TYPE_STRING) : m_out << type << ":"; while (tagData[index]) { m_out << tagData[index]; ++index; } ++index; break; } if ( tagData[index] == '\0') break; } m_out << endl; }
// print BamAlignment in JSON format void ConvertTool::ConvertToolPrivate::PrintJson(const BamAlignment& a) { // write name & alignment flag m_out << "{\"name\":\"" << a.Name << "\",\"alignmentFlag\":\"" << a.AlignmentFlag << "\","; // write reference name if ( (a.RefID >= 0) && (a.RefID < (int)m_references.size()) ) m_out << "\"reference\":\"" << m_references[a.RefID].RefName << "\","; // write position & map quality m_out << "\"position\":" << a.Position+1 << ",\"mapQuality\":" << a.MapQuality << ","; // write CIGAR const vector<CigarOp>& cigarData = a.CigarData; if ( !cigarData.empty() ) { m_out << "\"cigar\":["; vector<CigarOp>::const_iterator cigarBegin = cigarData.begin(); vector<CigarOp>::const_iterator cigarIter = cigarBegin; vector<CigarOp>::const_iterator cigarEnd = cigarData.end(); for ( ; cigarIter != cigarEnd; ++cigarIter ) { const CigarOp& op = (*cigarIter); if (cigarIter != cigarBegin) m_out << ","; m_out << "\"" << op.Length << op.Type << "\""; } m_out << "],"; } // write mate reference name, mate position, & insert size if ( a.IsPaired() && (a.MateRefID >= 0) && (a.MateRefID < (int)m_references.size()) ) { m_out << "\"mate\":{" << "\"reference\":\"" << m_references[a.MateRefID].RefName << "\"," << "\"position\":" << a.MatePosition+1 << ",\"insertSize\":" << a.InsertSize << "},"; } // write sequence if ( !a.QueryBases.empty() ) m_out << "\"queryBases\":\"" << a.QueryBases << "\","; // write qualities if ( !a.Qualities.empty() && a.Qualities.at(0) != (char)0xFF ) { string::const_iterator s = a.Qualities.begin(); m_out << "\"qualities\":[" << static_cast<short>(*s) - 33; ++s; for ( ; s != a.Qualities.end(); ++s ) m_out << "," << static_cast<short>(*s) - 33; m_out << "],"; } // write alignment's source BAM file m_out << "\"filename\":" << a.Filename << ","; // write tag data const char* tagData = a.TagData.c_str(); const size_t tagDataLength = a.TagData.length(); size_t index = 0; if ( index < tagDataLength ) { m_out << "\"tags\":{"; while ( index < 
tagDataLength ) { if ( index > 0 ) m_out << ","; // write tag name m_out << "\"" << a.TagData.substr(index, 2) << "\":"; index += 2; // get data type char type = a.TagData.at(index); ++index; switch ( type ) { case (Constants::BAM_TAG_TYPE_ASCII) : m_out << "\"" << tagData[index] << "\""; ++index; break; case (Constants::BAM_TAG_TYPE_INT8) : case (Constants::BAM_TAG_TYPE_UINT8) : m_out << (int)tagData[index]; ++index; break; case (Constants::BAM_TAG_TYPE_INT16) : m_out << BamTools::UnpackSignedShort(&tagData[index]); index += sizeof(int16_t); break; case (Constants::BAM_TAG_TYPE_UINT16) : m_out << BamTools::UnpackUnsignedShort(&tagData[index]); index += sizeof(uint16_t); break; case (Constants::BAM_TAG_TYPE_INT32) : m_out << BamTools::UnpackSignedInt(&tagData[index]); index += sizeof(int32_t); break; case (Constants::BAM_TAG_TYPE_UINT32) : m_out << BamTools::UnpackUnsignedInt(&tagData[index]); index += sizeof(uint32_t); break; case (Constants::BAM_TAG_TYPE_FLOAT) : m_out << BamTools::UnpackFloat(&tagData[index]); index += sizeof(float); break; case (Constants::BAM_TAG_TYPE_HEX) : case (Constants::BAM_TAG_TYPE_STRING) : m_out << "\""; while (tagData[index]) { if (tagData[index] == '\"') m_out << "\\\""; // escape for json else m_out << tagData[index]; ++index; } m_out << "\""; ++index; break; } if ( tagData[index] == '\0') break; } m_out << "}"; } m_out << "}" << endl; }
int main (int argc, char *argv[]) { int minBaseQuality = 0; string usage=string(""+string(argv[0])+" [in BAM file] [in VCF file] [chr name] [deam out BAM] [not deam out BAM]"+ "\nThis program divides aligned single end reads into potentially deaminated\n"+ "\nreads and the puts the rest into another bam file if the deaminated positions are not called as the alternative base in the VCF.\n"+ "\nTip: if you do not need one of them, use /dev/null as your output\n"+ "arguments:\n"+ "\t"+"--bq [base qual] : Minimum base quality to flag a deaminated site (Default: "+stringify(minBaseQuality)+")\n"+ "\n"); if(argc == 1 || argc < 4 || (argc == 2 && (string(argv[0]) == "-h" || string(argv[0]) == "--help") ) ){ cerr << "Usage "<<usage<<endl; return 1; } for(int i=1;i<(argc-2);i++){ if(string(argv[i]) == "--bq"){ minBaseQuality=destringify<int>(argv[i+1]); i++; continue; } } string bamfiletopen = string( argv[ argc-5 ] ); string vcffiletopen = string( argv[ argc-4 ] ); string chrname = string( argv[ argc-3 ] ); string deambam = string( argv[ argc-2 ] ); string nondeambam = string( argv[ argc-1 ] ); //dummy reader, will need to reposition anyway VCFreader vcfr (vcffiletopen, vcffiletopen+".tbi", chrname, 1, 1, 0); BamReader reader; if ( !reader.Open(bamfiletopen) ) { cerr << "Could not open input BAM file"<< bamfiletopen << endl; return 1; } // if ( !reader.LocateIndex() ) { // cerr << "The index for the BAM file cannot be located" << endl; // return 1; // } // if ( !reader.HasIndex() ) { // cerr << "The BAM file has not been indexed." 
<< endl; // return 1; // } //positioning the bam file int refid=reader.GetReferenceID(chrname); if(refid < 0){ cerr << "Cannot retrieve the reference ID for "<< chrname << endl; return 1; } //cout<<"redif "<<refid<<endl; //setting the BAM reader at that position reader.SetRegion(refid, 0, refid, -1); vector<RefData> testRefData=reader.GetReferenceData(); const SamHeader header = reader.GetHeader(); const RefVector references = reader.GetReferenceData(); BamWriter writerDeam; if ( !writerDeam.Open(deambam, header, references) ) { cerr << "Could not open output BAM file" << endl; return 1; } BamWriter writerNoDeam; if ( !writerNoDeam.Open(nondeambam, header, references) ) { cerr << "Could not open output BAM file" << endl; return 1; } unsigned int totalReads =0; unsigned int deaminatedReads =0; unsigned int ndeaminatedReads =0; unsigned int skipped =0; //iterating over the alignments for these regions BamAlignment al; int i; while ( reader.GetNextAlignment(al) ) { // cerr<<al.Name<<endl; //skip unmapped if(!al.IsMapped()){ skipped++; continue; } //skip paired end ! 
if(al.IsPaired() ){ continue; // cerr<<"Paired end not yet coded"<<endl; // return 1; } string reconstructedReference = reconstructRef(&al); char refeBase; char readBase; bool isDeaminated; if(al.Qualities.size() != reconstructedReference.size()){ cerr<<"Quality line is not the same size as the reconstructed reference"<<endl; return 1; } isDeaminated=false; if(al.IsReverseStrand()){ //first base next to 3' i = 0 ; refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); if( readBase == 'A' && int(al.Qualities[i]-offset) >= minBaseQuality){ //isDeaminated=true; } if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } transformRef(&refeBase,&readBase); vcfr.repositionIterator(chrname,al.Position+1,al.Position+1); while(vcfr.hasData()){ SimpleVCF * toprint=vcfr.getData(); // cout<<*toprint<<endl; //skip deletions in the alt if(toprint->getRef().length() != 1 ) continue; if(toprint->getRef()[0] != refeBase){ cerr<<reconstructedReference<<endl; cerr<<al.Position<<endl; cerr<<numberOfDeletions(&al)<<endl; cerr<<"Problem1 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; exit(1); } //if the VCF has a at least one G but no A if( toprint->hasAtLeastOneG() && !toprint->hasAtLeastOneA() ){ isDeaminated=true; } } } //second base next to 3' i = 1; refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //refeBase == 'G' && if( readBase == 'A' && int(al.Qualities[i]-offset) >= minBaseQuality){ //isDeaminated=true; } if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } transformRef(&refeBase,&readBase); vcfr.repositionIterator(chrname,al.Position+2,al.Position+2); while(vcfr.hasData()){ SimpleVCF * toprint=vcfr.getData(); // cout<<*toprint<<endl; //skip deletions in the alt if(toprint->getRef().length() != 1 ) continue; if(toprint->getRef()[0] != refeBase){ cerr<<reconstructedReference<<endl; cerr<<al.Position<<endl; cerr<<numberOfDeletions(&al)<<endl; 
cerr<<"Problem2 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; exit(1); } //if the VCF has at least one G but no A // if(toprint->hasAtLeastOneG() && // toprint->getAlt().find("A") == string::npos){ if( toprint->hasAtLeastOneG() && !toprint->hasAtLeastOneA() ){ isDeaminated=true; } } } //last base next to 5' i = (al.QueryBases.length()-1) ; refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //refeBase == 'G' && if( readBase == 'A' && int(al.Qualities[i]-offset) >= minBaseQuality){ //isDeaminated=true; } if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } transformRef(&refeBase,&readBase); int lengthMatches=countMatchesRecons(reconstructedReference,0); int positionJump = al.Position+lengthMatches+numberOfDeletions(&al); vcfr.repositionIterator(chrname,positionJump,positionJump); while(vcfr.hasData()){ SimpleVCF * toprint=vcfr.getData(); //skip deletions in the alt if(toprint->getRef().length() != 1 ) continue; if(toprint->getRef()[0] != refeBase){ cerr<<reconstructedReference<<endl; cerr<<al.Position<<endl; cerr<<lengthMatches<<endl; cerr<<numberOfDeletions(&al)<<endl; cerr<<positionJump<<endl; cerr<<"Problem3 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; exit(1); } //if the VCF has at least one G but no A if( toprint->hasAtLeastOneG() && !toprint->hasAtLeastOneA() ){ isDeaminated=true; } } } }else{ //first base next to 5' i = 0; refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //refeBase == 'C' if( readBase == 'T' && int(al.Qualities[i]-offset) >= minBaseQuality){ if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } transformRef(&refeBase,&readBase); vcfr.repositionIterator(chrname,al.Position+1,al.Position+1); while(vcfr.hasData()){ SimpleVCF * toprint=vcfr.getData(); //cout<<*toprint<<endl; //skip deletions in the alt if(toprint->getRef().length() != 1 ) continue; 
if(toprint->getRef()[0] != refeBase){ cerr<<reconstructedReference<<endl; cerr<<al.Position<<endl; cerr<<numberOfDeletions(&al)<<endl; cerr<<"Problem4 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; exit(1); } //if the VCF has at least one C but no T if( toprint->hasAtLeastOneC() && !toprint->hasAtLeastOneT() ){ isDeaminated=true; } } //cout<<al.Position+ } //second last base next to 3' i = (al.QueryBases.length()-2); refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //refeBase == 'C' && if( readBase == 'T' && int(al.Qualities[i]-offset) >= minBaseQuality){ if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } transformRef(&refeBase,&readBase); int lengthMatches=countMatchesRecons(reconstructedReference,1); int positionJump = al.Position+lengthMatches+numberOfDeletions(&al); vcfr.repositionIterator(chrname,positionJump,positionJump); while(vcfr.hasData()){ SimpleVCF * toprint=vcfr.getData(); //skip deletions in the alt if(toprint->getRef().length() != 1 ) continue; if(toprint->getRef()[0] != refeBase){ cerr<<reconstructedReference<<endl; cerr<<al.Position<<endl; cerr<<lengthMatches<<endl; cerr<<numberOfDeletions(&al)<<endl; cerr<<positionJump<<endl; cerr<<"Problem5 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; exit(1); } if( toprint->hasAtLeastOneC() && !toprint->hasAtLeastOneT() ){ isDeaminated=true; } } } //last base next to 3' i = (al.QueryBases.length()-1); refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //&& refeBase == 'C' if( readBase == 'T' && int(al.Qualities[i]-offset) >= minBaseQuality){ if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } transformRef(&refeBase,&readBase); int lengthMatches=countMatchesRecons(reconstructedReference,0); int positionJump = al.Position+lengthMatches+numberOfDeletions(&al); 
vcfr.repositionIterator(chrname,positionJump,positionJump); while(vcfr.hasData()){ SimpleVCF * toprint=vcfr.getData(); //skip deletions in the alt if(toprint->getRef().length() != 1 ) continue; if(toprint->getRef()[0] != refeBase){ cerr<<reconstructedReference<<endl; cerr<<al.Position<<endl; cerr<<lengthMatches<<endl; cerr<<numberOfDeletions(&al)<<endl; cerr<<positionJump<<endl; cerr<<"Problem6 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; exit(1); } if( toprint->hasAtLeastOneC() && !toprint->hasAtLeastOneT() ){ isDeaminated=true; } } } } totalReads++; if(isDeaminated){ deaminatedReads++; writerDeam.SaveAlignment(al); }else{ ndeaminatedReads++; writerNoDeam.SaveAlignment(al); } }//end for each read reader.Close(); writerDeam.Close(); writerNoDeam.Close(); cerr<<"Program finished sucessfully, out of "<<totalReads<<" mapped reads (skipped: "<<skipped<<" reads) we flagged "<<deaminatedReads<<" as deaminated and "<<ndeaminatedReads<<" as not deaminated"<<endl; return 0; }
bool check(const PropertyFilter& filter, const BamAlignment& al) { bool keepAlignment = true; const PropertyMap& properties = filter.Properties; PropertyMap::const_iterator propertyIter = properties.begin(); PropertyMap::const_iterator propertyEnd = properties.end(); for ( ; propertyIter != propertyEnd; ++propertyIter ) { // check alignment data field depending on propertyName const string& propertyName = (*propertyIter).first; const PropertyFilterValue& valueFilter = (*propertyIter).second; if ( propertyName == ALIGNMENTFLAG_PROPERTY ) keepAlignment &= valueFilter.check(al.AlignmentFlag); else if ( propertyName == CIGAR_PROPERTY ) { stringstream cigarSs; const vector<CigarOp>& cigarData = al.CigarData; if ( !cigarData.empty() ) { vector<CigarOp>::const_iterator cigarBegin = cigarData.begin(); vector<CigarOp>::const_iterator cigarIter = cigarBegin; vector<CigarOp>::const_iterator cigarEnd = cigarData.end(); for ( ; cigarIter != cigarEnd; ++cigarIter ) { const CigarOp& op = (*cigarIter); cigarSs << op.Length << op.Type; } keepAlignment &= valueFilter.check(cigarSs.str()); } } else if ( propertyName == INSERTSIZE_PROPERTY ) keepAlignment &= valueFilter.check(al.InsertSize); else if ( propertyName == ISDUPLICATE_PROPERTY ) keepAlignment &= valueFilter.check(al.IsDuplicate()); else if ( propertyName == ISFAILEDQC_PROPERTY ) keepAlignment &= valueFilter.check(al.IsFailedQC()); else if ( propertyName == ISFIRSTMATE_PROPERTY ) keepAlignment &= valueFilter.check(al.IsFirstMate()); else if ( propertyName == ISMAPPED_PROPERTY ) keepAlignment &= valueFilter.check(al.IsMapped()); else if ( propertyName == ISMATEMAPPED_PROPERTY ) keepAlignment &= valueFilter.check(al.IsMateMapped()); else if ( propertyName == ISMATEREVERSESTRAND_PROPERTY ) keepAlignment &= valueFilter.check(al.IsMateReverseStrand()); else if ( propertyName == ISPAIRED_PROPERTY ) keepAlignment &= valueFilter.check(al.IsPaired()); else if ( propertyName == ISPRIMARYALIGNMENT_PROPERTY ) keepAlignment &= 
valueFilter.check(al.IsPrimaryAlignment()); else if ( propertyName == ISPROPERPAIR_PROPERTY ) keepAlignment &= valueFilter.check(al.IsProperPair()); else if ( propertyName == ISREVERSESTRAND_PROPERTY ) keepAlignment &= valueFilter.check(al.IsReverseStrand()); else if ( propertyName == ISSECONDMATE_PROPERTY ) keepAlignment &= valueFilter.check(al.IsSecondMate()); else if ( propertyName == ISSINGLETON_PROPERTY ) { const bool isSingleton = al.IsPaired() && al.IsMapped() && !al.IsMateMapped(); keepAlignment &= valueFilter.check(isSingleton); } else if ( propertyName == MAPQUALITY_PROPERTY ) keepAlignment &= valueFilter.check(al.MapQuality); else if ( propertyName == MATEPOSITION_PROPERTY ) keepAlignment &= ( al.IsPaired() && al.IsMateMapped() && valueFilter.check(al.MateRefID) ); else if ( propertyName == MATEREFERENCE_PROPERTY ) { if ( !al.IsPaired() || !al.IsMateMapped() ) return false; BAMTOOLS_ASSERT_MESSAGE( (al.MateRefID>=0 && (al.MateRefID<(int)filterToolReferences.size())), "Invalid MateRefID"); const string& refName = filterToolReferences.at(al.MateRefID).RefName; keepAlignment &= valueFilter.check(refName); } else if ( propertyName == NAME_PROPERTY ) keepAlignment &= valueFilter.check(al.Name); else if ( propertyName == POSITION_PROPERTY ) keepAlignment &= valueFilter.check(al.Position); else if ( propertyName == QUERYBASES_PROPERTY ) keepAlignment &= valueFilter.check(al.QueryBases); else if ( propertyName == REFERENCE_PROPERTY ) { BAMTOOLS_ASSERT_MESSAGE( (al.RefID>=0 && (al.RefID<(int)filterToolReferences.size())), "Invalid RefID"); const string& refName = filterToolReferences.at(al.RefID).RefName; keepAlignment &= valueFilter.check(refName); } else if ( propertyName == TAG_PROPERTY ) keepAlignment &= checkAlignmentTag(valueFilter, al); else BAMTOOLS_ASSERT_UNREACHABLE; // if alignment fails at ANY point, just quit and return false if ( !keepAlignment ) return false; } BAMTOOLS_ASSERT_MESSAGE( keepAlignment, "Error in BamAlignmentChecker... 
keepAlignment should be true here"); return keepAlignment; }
int main (int argc, char *argv[]) { if( (argc== 1) || (argc== 2 && string(argv[1]) == "-h") || (argc== 2 && string(argv[1]) == "-help") || (argc== 2 && string(argv[1]) == "--help") ){ cout<<"Usage:"<<endl; cout<<""<<endl; cout<<"plotQualScore input.bam"<<endl; return 1; } string bamfiletopen = string(argv[1]); BamReader reader; if ( !reader.Open(bamfiletopen) ) { cerr << "Could not open input BAM files." << endl; return 1; } // if ( !reader.LocateIndex() ){ // cerr << "warning: cannot locate index for file " << bamfiletopen<<endl; // //return 1; // } BamAlignment al; BamAlignment al2; bool unsurePEorSE=true; bool pe=true; int strLength=-1; int vecLengthToUse=-1; map<short,unsigned long> ** counterA = 0; map<short,unsigned long> ** counterC = 0; map<short,unsigned long> ** counterG = 0; map<short,unsigned long> ** counterT = 0; int lengthIndex1=0; int lengthIndex2=0; string seqInd1; string seqInd2; string qualInd1; string qualInd2; int offsetInd2; while ( reader.GetNextAlignment(al) ) { if(unsurePEorSE){ strLength=al.QueryBases.length(); if(al.IsPaired()){ pe=true; vecLengthToUse=2*strLength; }else{ pe=false; vecLengthToUse=strLength; } string index1; string index2; if(al.HasTag("XI")){ al.GetTag("XI",index1); vecLengthToUse+=index1.length(); lengthIndex1=index1.length(); } if(al.HasTag("XJ")){ al.GetTag("XJ",index2); vecLengthToUse+=index2.length(); lengthIndex2=index2.length(); } counterA = new map<short,unsigned long> * [vecLengthToUse]; counterC = new map<short,unsigned long> * [vecLengthToUse]; counterG = new map<short,unsigned long> * [vecLengthToUse]; counterT = new map<short,unsigned long> * [vecLengthToUse]; for(int i=0;i<vecLengthToUse;i++){ counterA[i]=new map<short,unsigned long> (); counterC[i]=new map<short,unsigned long> (); counterG[i]=new map<short,unsigned long> (); counterT[i]=new map<short,unsigned long> (); for(short k=minQualScore;k<=maxQualScore;k++){ (*counterA[i])[k]=0; (*counterC[i])[k]=0; (*counterG[i])[k]=0; (*counterT[i])[k]=0; } } 
unsurePEorSE=false; }else{ if(pe && !al.IsPaired()){ cerr << "Cannot have unpaired reads in PE mode" << endl; return 1; } if(!pe && al.IsPaired()){ cerr << "Cannot have unpaired reads in SE mode" << endl; return 1; } } if(al.QueryBases.length() != al.Qualities.length()){ cerr << "Cannot have different lengths for sequence and quality" << endl; return 1; } if(int(al.QueryBases.length()) != strLength){ cerr << "Cannot have different lengths for sequence and quality" << endl; return 1; } if(pe){ if(al.IsFirstMate()){ reader.GetNextAlignment(al2); if(al2.QueryBases.length() != al2.Qualities.length()){ cerr << "Cannot have different lengths for sequence and quality" << endl; return 1; } }else{ cerr << "First read should be the first mate" << endl; return 1; } } //cycle for(unsigned int i=0;i<al.QueryBases.length();i++){ short x=(short(al.Qualities[i])-qualOffset); if(al.QueryBases[i] == 'A'){ (*counterA[i])[x]++; } if(al.QueryBases[i] == 'C'){ (*counterC[i])[x]++; } if(al.QueryBases[i] == 'G'){ (*counterG[i])[x]++; } if(al.QueryBases[i] == 'T'){ (*counterT[i])[x]++; } } //The indices for al and al2 should hopefully be the same if(lengthIndex1>0){ al.GetTag("XI",seqInd1); al.GetTag("YI",qualInd1); int j; for(int i=0;i<lengthIndex1;i++){ j=i+al.QueryBases.length(); short x=(short(qualInd1[i])-qualOffset); if(seqInd1[i] == 'A'){ (*counterA[j])[x]++; } if(seqInd1[i] == 'C'){ (*counterC[j])[x]++; } if(seqInd1[i] == 'G'){ (*counterG[j])[x]++; } if(seqInd1[i] == 'T'){ (*counterT[j])[x]++; } } } if(pe){ offsetInd2=al.QueryBases.length()+lengthIndex1+al2.QueryBases.length(); int j; for(unsigned int i=0;i<al2.QueryBases.length();i++){ j=i+al.QueryBases.length()+lengthIndex1; short x=(short(al2.Qualities[i])-qualOffset); if(al2.QueryBases[i] == 'A'){ (*counterA[j])[x]++; } if(al2.QueryBases[i] == 'C'){ (*counterC[j])[x]++; } if(al2.QueryBases[i] == 'G'){ (*counterG[j])[x]++; } if(al2.QueryBases[i] == 'T'){ (*counterT[j])[x]++; } } }else{ 
offsetInd2=al.QueryBases.length()+lengthIndex1; } //The indices for al and al2 should hopefully be the same if(lengthIndex2>0){ al.GetTag("XJ",seqInd2); al.GetTag("YJ",qualInd2); int j; for(int i=0;i<lengthIndex2;i++){ j=offsetInd2+i; short x=(short(qualInd2[i])-qualOffset); if(seqInd2[i] == 'A'){ (*counterA[j])[x]++; } if(seqInd2[i] == 'C'){ (*counterC[j])[x]++; } if(seqInd2[i] == 'G'){ (*counterG[j])[x]++; } if(seqInd2[i] == 'T'){ (*counterT[j])[x]++; } } } } reader.Close(); cout<<"cycle\t"<<"nuc\t"; for(short k=minQualScore;k<maxQualScore;k++){ cout<<k<<"\t"; } cout<<maxQualScore<<endl; for(int i=0;i<vecLengthToUse;i++){ cout<<(i+1)<<"\t"; cout<<"A\t"; for(short k=minQualScore;k<maxQualScore;k++){ cout<<(*counterA[i])[k]<<"\t"; } cout<<(*counterA[i])[maxQualScore]<<endl; cout<<(i+1)<<"\t"; cout<<"C\t"; for(short k=minQualScore;k<maxQualScore;k++){ cout<<(*counterC[i])[k]<<"\t"; } cout<<(*counterC[i])[maxQualScore]<<endl; cout<<(i+1)<<"\t"; cout<<"G\t"; for(short k=minQualScore;k<maxQualScore;k++){ cout<<(*counterG[i])[k]<<"\t"; } cout<<(*counterG[i])[maxQualScore]<<endl; cout<<(i+1)<<"\t"; cout<<"T\t"; for(short k=minQualScore;k<maxQualScore;k++){ cout<<(*counterT[i])[k]<<"\t"; } cout<<(*counterT[i])[maxQualScore]<<endl; } return 0; }
int main (int argc, char *argv[]) { int minBaseQuality = 0; string usage=string(""+string(argv[0])+" [in BAM file] [in VCF file] [chr name] [deam out BAM] [not deam out BAM]"+ "\nThis program divides aligned single end reads into potentially deaminated\n"+ "\nreads and the puts the rest into another bam file if the deaminated positions are not called as the alternative base in the VCF.\n"+ "\nThis is like filterDeaminatedVCF but it loads the VCF before then labels the reads instead of doing it on the fly\n"+ "\nwhich is good if you have many reads in the bam file.\n"+ "\nTip: if you do not need one of them, use /dev/null as your output\n"+ "\narguments:\n"+ "\t"+"--bq [base qual] : Minimum base quality to flag a deaminated site (Default: "+stringify(minBaseQuality)+")\n"+ "\t"+"--1000g [vcf file] : VCF file from 1000g to get the putative A and T positions in modern humans (Default: "+vcf1000g+")\n"+ "\n"); if(argc == 1 || argc < 4 || (argc == 2 && (string(argv[0]) == "-h" || string(argv[0]) == "--help") ) ){ cerr << "Usage "<<usage<<endl; return 1; } for(int i=1;i<(argc-2);i++){ if(string(argv[i]) == "--bq"){ minBaseQuality=destringify<int>(argv[i+1]); i++; continue; } if(string(argv[i]) == "--1000g"){ vcf1000g=string(argv[i+1]); i++; continue; } } unsigned int maxSizeChromosome=250000000;//larger than chr1 hg19 bool * hasCnoT; bool * hasGnoA; bool * thousandGenomesHasA; bool * thousandGenomesHasT; cerr<<"Trying to allocating memory"<<endl; try{ hasCnoT = new bool[ maxSizeChromosome ]; hasGnoA = new bool[ maxSizeChromosome ]; thousandGenomesHasA = new bool[ maxSizeChromosome ]; thousandGenomesHasT = new bool[ maxSizeChromosome ]; }catch(bad_alloc& exc){ cerr<<"ERROR: allocating memory failed"<<endl; return 1; } cerr<<"Success in allocating memory"<<endl; for(unsigned int i = 0;i<maxSizeChromosome;i++){ hasCnoT[i]=false; hasGnoA[i]=false; thousandGenomesHasA[i]=false; thousandGenomesHasT[i]=false; } string bamfiletopen = string( argv[ argc-5 ] ); string vcffiletopen 
= string( argv[ argc-4 ] ); string chrname = string( argv[ argc-3 ] ); string deambam = string( argv[ argc-2 ] ); string nondeambam = string( argv[ argc-1 ] ); cerr<<"Reading consensus VCF "<<vcffiletopen<<" ... "<<endl; VCFreader vcfr (vcffiletopen, // vcffiletopen+".tbi", // chrname, // 1, // maxSizeChromosome, 0); while(vcfr.hasData()){ SimpleVCF * toprint=vcfr.getData(); if(toprint->getRef().length() != 1 ) continue; //if the VCF has a at least one G but no A if( toprint->hasAtLeastOneG() && !toprint->hasAtLeastOneA() ){ hasGnoA[ toprint->getPosition() ] =true; } if( toprint->hasAtLeastOneC() && !toprint->hasAtLeastOneT() ){ hasCnoT[ toprint->getPosition() ] =true; } } cerr<<"done reading VCF"<<endl; cerr<<"Reading 1000g VCF :"<<vcf1000g<<" ..."<<endl; string line1000g; ifstream myFile1000g; myFile1000g.open(vcf1000g.c_str(), ios::in); if (myFile1000g.is_open()){ while ( getline (myFile1000g,line1000g)){ vector<string> fields=allTokens(line1000g,'\t'); //0 chr //1 pos //2 id //3 ref //4 alt //check if same chr if(fields[0] != chrname){ cerr <<"Error, wrong chromosome in 1000g file for line= "<<line1000g<<endl; return 1; } //skip indels if(fields[3].size() != 1 || fields[4].size() != 1 ) continue; char ref=toupper(fields[3][0]); char alt=toupper(fields[4][0]); unsigned int pos=destringify<unsigned int>( fields[1] ); thousandGenomesHasA[ pos ] = ( (ref=='A') || (alt=='A') ); thousandGenomesHasT[ pos ] = ( (ref=='T') || (alt=='T') ); } myFile1000g.close(); }else{ cerr <<"Unable to open file "<<vcf1000g<<endl; return 1; } cerr<<"done reading 1000g VCF"<<endl; BamReader reader; if ( !reader.Open(bamfiletopen) ) { cerr << "Could not open input BAM file"<< bamfiletopen << endl; return 1; } //positioning the bam file int refid=reader.GetReferenceID(chrname); if(refid < 0){ cerr << "Cannot retrieve the reference ID for "<< chrname << endl; return 1; } //cout<<"redif "<<refid<<endl; //setting the BAM reader at that position reader.SetRegion(refid, 0, refid, -1); 
vector<RefData> testRefData=reader.GetReferenceData(); const SamHeader header = reader.GetHeader(); const RefVector references = reader.GetReferenceData(); BamWriter writerDeam; if ( !writerDeam.Open(deambam, header, references) ) { cerr << "Could not open output BAM file" << endl; return 1; } BamWriter writerNoDeam; if ( !writerNoDeam.Open(nondeambam, header, references) ) { cerr << "Could not open output BAM file" << endl; return 1; } unsigned int totalReads =0; unsigned int deaminatedReads =0; unsigned int ndeaminatedReads =0; unsigned int skipped =0; //iterating over the alignments for these regions BamAlignment al; int i; while ( reader.GetNextAlignment(al) ) { // cerr<<al.Name<<endl; //skip unmapped if(!al.IsMapped()){ skipped++; continue; } //skip paired end ! if(al.IsPaired() ){ continue; // cerr<<"Paired end not yet coded"<<endl; // return 1; } string reconstructedReference = reconstructRef(&al); char refeBase; char readBase; bool isDeaminated; if(al.Qualities.size() != reconstructedReference.size()){ cerr<<"Quality line is not the same size as the reconstructed reference"<<endl; return 1; } isDeaminated=false; if(al.IsReverseStrand()){ //first base next to 3' i = 0 ; refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); if( readBase == 'A' && int(al.Qualities[i]-offset) >= minBaseQuality){ //isDeaminated=true; } if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } if( hasGnoA[al.Position+1] && !thousandGenomesHasA[al.Position+1] ) isDeaminated=true; // transformRef(&refeBase,&readBase); // vcfr.repositionIterator(chrname,al.Position+1,al.Position+1); // while(vcfr.hasData()){ // SimpleVCF * toprint=vcfr.getData(); // // cout<<*toprint<<endl; // //skip deletions in the alt // if(toprint->getRef().length() != 1 ) // continue; // if(toprint->getRef()[0] != refeBase){ // cerr<<reconstructedReference<<endl; // cerr<<al.Position<<endl; // cerr<<numberOfDeletions(&al)<<endl; // cerr<<"Problem1 position "<<*toprint<<" does 
not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; // exit(1); // } // //if the VCF has a at least one G but no A // if( toprint->hasAtLeastOneG() && // !toprint->hasAtLeastOneA() ){ // isDeaminated=true; // } // } } //second base next to 3' i = 1; refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //refeBase == 'G' && if( readBase == 'A' && int(al.Qualities[i]-offset) >= minBaseQuality){ //isDeaminated=true; } if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } if( hasGnoA[al.Position+2] && !thousandGenomesHasA[al.Position+2] ) isDeaminated=true; // transformRef(&refeBase,&readBase); // vcfr.repositionIterator(chrname,al.Position+2,al.Position+2); // while(vcfr.hasData()){ // SimpleVCF * toprint=vcfr.getData(); // // cout<<*toprint<<endl; // //skip deletions in the alt // if(toprint->getRef().length() != 1 ) // continue; // if(toprint->getRef()[0] != refeBase){ // cerr<<reconstructedReference<<endl; // cerr<<al.Position<<endl; // cerr<<numberOfDeletions(&al)<<endl; // cerr<<"Problem2 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; // exit(1); // } // //if the VCF has at least one G but no A // // if(toprint->hasAtLeastOneG() && // // toprint->getAlt().find("A") == string::npos){ // if( toprint->hasAtLeastOneG() && // !toprint->hasAtLeastOneA() ){ // isDeaminated=true; // } // } } //last base next to 5' i = (al.QueryBases.length()-1) ; refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //refeBase == 'G' && if( readBase == 'A' && int(al.Qualities[i]-offset) >= minBaseQuality){ //isDeaminated=true; } if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } transformRef(&refeBase,&readBase); int lengthMatches=countMatchesRecons(reconstructedReference,0); int positionJump = al.Position+lengthMatches+numberOfDeletions(&al); if(hasGnoA[positionJump] && !thousandGenomesHasA[positionJump] ) isDeaminated=true; // 
vcfr.repositionIterator(chrname,positionJump,positionJump); // while(vcfr.hasData()){ // SimpleVCF * toprint=vcfr.getData(); // //skip deletions in the alt // if(toprint->getRef().length() != 1 ) // continue; // if(toprint->getRef()[0] != refeBase){ // cerr<<reconstructedReference<<endl; // cerr<<al.Position<<endl; // cerr<<lengthMatches<<endl; // cerr<<numberOfDeletions(&al)<<endl; // cerr<<positionJump<<endl; // cerr<<"Problem3 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; // exit(1); // } // //if the VCF has at least one G but no A // if( toprint->hasAtLeastOneG() && // !toprint->hasAtLeastOneA() ){ // isDeaminated=true; // } // } } }else{ //first base next to 5' i = 0; refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //refeBase == 'C' if( readBase == 'T' && int(al.Qualities[i]-offset) >= minBaseQuality){ if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } // transformRef(&refeBase,&readBase); if(hasCnoT[al.Position+1] && !thousandGenomesHasT[al.Position+1] ) isDeaminated=true; // vcfr.repositionIterator(chrname,al.Position+1,al.Position+1); // while(vcfr.hasData()){ // SimpleVCF * toprint=vcfr.getData(); // //cout<<*toprint<<endl; // //skip deletions in the alt // if(toprint->getRef().length() != 1 ) // continue; // if(toprint->getRef()[0] != refeBase){ // cerr<<reconstructedReference<<endl; // cerr<<al.Position<<endl; // cerr<<numberOfDeletions(&al)<<endl; // cerr<<"Problem4 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; // exit(1); // } // //if the VCF has at least one C but no T // if( toprint->hasAtLeastOneC() && // !toprint->hasAtLeastOneT() ){ // isDeaminated=true; // } // } //cout<<al.Position+ } //second last base next to 3' i = (al.QueryBases.length()-2); refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //refeBase == 'C' && if( readBase == 'T' && 
int(al.Qualities[i]-offset) >= minBaseQuality){ if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } //transformRef(&refeBase,&readBase); int lengthMatches=countMatchesRecons(reconstructedReference,1); int positionJump = al.Position+lengthMatches+numberOfDeletions(&al); if(hasCnoT[positionJump] && !thousandGenomesHasT[positionJump] ) isDeaminated=true; // vcfr.repositionIterator(chrname,positionJump,positionJump); // while(vcfr.hasData()){ // SimpleVCF * toprint=vcfr.getData(); // //skip deletions in the alt // if(toprint->getRef().length() != 1 ) // continue; // if(toprint->getRef()[0] != refeBase){ // cerr<<reconstructedReference<<endl; // cerr<<al.Position<<endl; // cerr<<lengthMatches<<endl; // cerr<<numberOfDeletions(&al)<<endl; // cerr<<positionJump<<endl; // cerr<<"Problem5 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; // exit(1); // } // if( toprint->hasAtLeastOneC() && // !toprint->hasAtLeastOneT() ){ // isDeaminated=true; // } // } } //last base next to 3' i = (al.QueryBases.length()-1); refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //&& refeBase == 'C' if( readBase == 'T' && int(al.Qualities[i]-offset) >= minBaseQuality){ if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } transformRef(&refeBase,&readBase); int lengthMatches=countMatchesRecons(reconstructedReference,0); int positionJump = al.Position+lengthMatches+numberOfDeletions(&al); if(hasCnoT[positionJump] && !thousandGenomesHasT[positionJump] ) isDeaminated=true; // vcfr.repositionIterator(chrname,positionJump,positionJump); // while(vcfr.hasData()){ // SimpleVCF * toprint=vcfr.getData(); // //skip deletions in the alt // if(toprint->getRef().length() != 1 ) // continue; // if(toprint->getRef()[0] != refeBase){ // cerr<<reconstructedReference<<endl; // cerr<<al.Position<<endl; // cerr<<lengthMatches<<endl; // cerr<<numberOfDeletions(&al)<<endl; // cerr<<positionJump<<endl; // 
cerr<<"Problem6 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; // exit(1); // } // if( toprint->hasAtLeastOneC() && // !toprint->hasAtLeastOneT() ){ // isDeaminated=true; // } // } } } totalReads++; if(isDeaminated){ deaminatedReads++; writerDeam.SaveAlignment(al); }else{ ndeaminatedReads++; writerNoDeam.SaveAlignment(al); } }//end for each read reader.Close(); writerDeam.Close(); writerNoDeam.Close(); delete(hasCnoT); delete(hasGnoA); cerr<<"Program finished sucessfully, out of "<<totalReads<<" mapped reads (skipped: "<<skipped<<" reads) we flagged "<<deaminatedReads<<" as deaminated and "<<ndeaminatedReads<<" as not deaminated"<<endl; return 0; }
int main (int argc, char *argv[]) { bool mapped =false; bool unmapped=false; const string usage=string(string(argv[0])+" [options] input.bam out.bam"+"\n\n"+ "This program takes a BAM file as input and produces\n"+ "another where the putative deaminated bases have\n"+ "have been cut\n"+ "\n"+ "Options:\n"); // "\t"+"-u , --unmapped" +"\n\t\t"+"For an unmapped bam file"+"\n"+ // "\t"+"-m , --mapped" +"\n\t\t"+"For an mapped bam file"+"\n"); if( (argc== 1) || (argc== 2 && string(argv[1]) == "-h") || (argc== 2 && string(argv[1]) == "-help") || (argc== 2 && string(argv[1]) == "--help") ){ cout<<"Usage:"<<endl; cout<<usage<<endl; cout<<""<<endl; return 1; } // for(int i=1;i<(argc-1);i++){ //all but the last arg // if(strcmp(argv[i],"-m") == 0 || strcmp(argv[i],"--mapped") == 0 ){ // mapped=true; // continue; // } // if(strcmp(argv[i],"-u") == 0 || strcmp(argv[i],"--unmapped") == 0 ){ // unmapped=true; // continue; // } // cerr<<"Unknown option "<<argv[i] <<" exiting"<<endl; // return 1; // } if(argc != 3){ cerr<<"Error: Must specify the input and output BAM files"; return 1; } string inbamFile =argv[argc-2]; string outbamFile=argv[argc-1]; // if(!mapped && !unmapped){ // cerr << "Please specify whether you reads are mapped or unmapped" << endl; // return 1; // } // if(mapped && unmapped){ // cerr << "Please specify either mapped or unmapped but not both" << endl; // return 1; // } BamReader reader; if ( !reader.Open(inbamFile) ) { cerr << "Could not open input BAM files." 
<< endl; return 1; } vector<RefData> testRefData=reader.GetReferenceData(); const SamHeader header = reader.GetHeader(); const RefVector references = reader.GetReferenceData(); BamWriter writer; if ( !writer.Open(outbamFile, header, references) ) { cerr << "Could not open output BAM file" << endl; return 1; } BamAlignment al; // BamAlignment al2; // bool al2Null=true; while ( reader.GetNextAlignment(al) ) { if(al.IsPaired() ){ if(al.IsFirstMate() ){ //5' end, need to check first base only if(al.IsReverseStrand()){ // if(!al.IsMapped()){ cerr << "Cannot have reverse complemented unmapped reads: " <<al.Name<< endl; //return 1; } int indexToCheck; //last indexToCheck=al.QueryBases.length()-1; if(toupper(al.QueryBases[indexToCheck]) == 'A'){ //al.Qualities[indexToCheck]=char(offset+baseQualForDeam); al.QueryBases = al.QueryBases.substr(0,indexToCheck); al.Qualities = al.Qualities.substr(0, indexToCheck); } }else{ int indexToCheck; //first base indexToCheck=0; if(toupper(al.QueryBases[indexToCheck]) == 'T'){ //al.Qualities[indexToCheck]=char(offset+baseQualForDeam); al.QueryBases = al.QueryBases.substr(indexToCheck+1); al.Qualities = al.Qualities.substr(indexToCheck+1); } } }else{ //3' end, need to check last two bases only if( al.IsSecondMate() ){ if(al.IsReverseStrand()){ // if(!al.IsMapped()){ cerr << "Cannot have reverse complemented unmapped reads: " <<al.Name<< endl; //return 1; } int indexToCheck; //second to last indexToCheck=al.QueryBases.length()-2; if(toupper(al.QueryBases[indexToCheck]) == 'T'){ //al.Qualities[indexToCheck]=char(offset+baseQualForDeam); al.QueryBases = al.QueryBases.substr(0,indexToCheck); al.Qualities = al.Qualities.substr(0, indexToCheck); }else{ //last indexToCheck=al.QueryBases.length()-1; if(toupper(al.QueryBases[indexToCheck]) == 'T'){ //al.Qualities[indexToCheck]=char(offset+baseQualForDeam); al.QueryBases = al.QueryBases.substr(0,indexToCheck); al.Qualities = al.Qualities.substr(0, indexToCheck); } } }else{ int indexToCheck; //second 
base indexToCheck=1; if(toupper(al.QueryBases[indexToCheck]) == 'A'){ //al.Qualities[indexToCheck]=char(offset+baseQualForDeam); al.QueryBases = al.QueryBases.substr(indexToCheck+1); al.Qualities = al.Qualities.substr(indexToCheck+1); }else{ //first base indexToCheck=0; if(toupper(al.QueryBases[indexToCheck]) == 'A'){ //al.Qualities[indexToCheck]=char(offset+baseQualForDeam); al.QueryBases = al.QueryBases.substr(indexToCheck+1); al.Qualities = al.Qualities.substr(indexToCheck+1); } } } }else{ cerr << "Wrong state" << endl; return 1; } } }//end of paired end else{//we consider single reads to have been sequenced from 5' to 3' if(al.IsReverseStrand()){ //need to consider if(!al.IsMapped()){ cerr << "Cannot have reverse complemented unmapped reads: " <<al.Name<< endl; //return 1; } int indexToCheck; //second base indexToCheck=1; if(toupper(al.QueryBases[indexToCheck]) == 'A'){ // al.Qualities[indexToCheck]=char(offset+baseQualForDeam); // cout<<"51 "<<al.QueryBases<<endl; // cout<<"51 "<<al.Qualities<<endl; al.QueryBases = al.QueryBases.substr(indexToCheck+1); al.Qualities = al.Qualities.substr(indexToCheck+1); // cout<<"52 "<<al.QueryBases<<endl; // cout<<"52 "<<al.Qualities<<endl; }else{ //first base indexToCheck=0; if(toupper(al.QueryBases[indexToCheck]) == 'A'){ // al.Qualities[indexToCheck]=char(offset+baseQualForDeam); // cout<<"61 "<<al.QueryBases<<endl; // cout<<"61 "<<al.Qualities<<endl; al.QueryBases = al.QueryBases.substr(indexToCheck+1); al.Qualities = al.Qualities.substr(indexToCheck+1); // cout<<"62 "<<al.QueryBases<<endl; // cout<<"62 "<<al.Qualities<<endl; } } //last indexToCheck=al.QueryBases.length()-1; if(toupper(al.QueryBases[indexToCheck]) == 'A'){ // al.Qualities[indexToCheck]=char(offset+baseQualForDeam); // cout<<"21 "<<al.QueryBases<<endl; // cout<<"21 "<<al.Qualities<<endl; al.QueryBases = al.QueryBases.substr(0,indexToCheck); al.Qualities = al.Qualities.substr(0, indexToCheck); // cout<<"22 "<<al.QueryBases<<endl; // cout<<"22 
"<<al.Qualities<<endl; } }else{ int indexToCheck; //first base indexToCheck=0; if(toupper(al.QueryBases[indexToCheck]) == 'T'){ // al.Qualities[indexToCheck]=char(offset+baseQualForDeam); // cout<<"11 "<<al.QueryBases<<endl; // cout<<"11 "<<al.Qualities<<endl; al.QueryBases = al.QueryBases.substr(indexToCheck+1); al.Qualities = al.Qualities.substr(indexToCheck+1); // cout<<"12 "<<al.QueryBases<<endl; // cout<<"12 "<<al.Qualities<<endl; } //second to last indexToCheck=al.QueryBases.length()-2; if(toupper(al.QueryBases[indexToCheck]) == 'T'){ // al.Qualities[indexToCheck]=char(offset+baseQualForDeam); // cout<<"31 "<<al.QueryBases<<endl; // cout<<"31 "<<al.Qualities<<endl; al.QueryBases = al.QueryBases.substr(0,indexToCheck); al.Qualities = al.Qualities.substr(0, indexToCheck); // cout<<"32 "<<al.QueryBases<<endl; // cout<<"32 "<<al.Qualities<<endl; }else{ //last indexToCheck=al.QueryBases.length()-1; if(toupper(al.QueryBases[indexToCheck]) == 'T'){ // al.Qualities[indexToCheck]=char(offset+baseQualForDeam); // cout<<"41 "<<al.QueryBases<<endl; // cout<<"41 "<<al.Qualities<<endl; al.QueryBases = al.QueryBases.substr(0,indexToCheck); al.Qualities = al.Qualities.substr(0, indexToCheck); // cout<<"42 "<<al.QueryBases<<endl; // cout<<"42 "<<al.Qualities<<endl; } } } }//end of single end writer.SaveAlignment(al); }// while ( reader.GetNextAlignment(al) ) { reader.Close(); writer.Close(); return 0; }
int main (int argc, char *argv[]) { bool produceUnCompressedBAM=false; bool verbose=false; bool ancientDNA=false; bool keepOrig=false; string adapter_F=options_adapter_F_BAM; string adapter_S=options_adapter_S_BAM; string adapter_chimera=options_adapter_chimera_BAM; string key=""; bool allowMissing=false; int trimCutoff=1; bool allowAligned=false; bool printLog=false; string logFileName; BamReader reader; BamWriter writer; string bamFile; string bamFileOUT=""; string key1; string key2; bool useDist=false; double location=-1.0; double scale =-1.0; bool fastqFormat=false; string fastqfile1 = ""; string fastqfile2 = ""; string fastqoutfile = ""; bool singleEndModeFQ=true; const string usage=string(string(argv[0])+ " [options] BAMfile"+"\n"+ "\nThis program takes an unaligned BAM where mates are consecutive\nor fastq files and trims and merges reads\n"+ "\n\tYou can specify a unaligned bam file or one or two fastq :\n"+ "\t\t"+"-fq1" +"\t\t"+"First fastq"+"\n"+ "\t\t"+"-fq2" +"\t\t"+"Second fastq file (for paired-end)"+"\n"+ "\t\t"+"-fqo" +"\t\t"+"Output fastq prefix"+"\n\n"+ //"\t"+"-p , --PIPE"+"\n\t\t"+"Read BAM from and write it to PIPE"+"\n"+ "\t"+"-o , --outfile" +"\t\t"+"Output (BAM format)."+"\n"+ "\t"+"-u " +"\t\t"+"Produce uncompressed bam (good for pipe)"+"\n"+ // "\t"+" , --outprefix" +"\n\t\t"+"Prefix for output files (default '"+outprefix+"')."+"\n"+ //"\t"+" , --SAM" +"\n\t\t"+"Output SAM not BAM."+"\n"+ "\t"+"--aligned" +"\t\t"+"Allow reads to be aligned (default "+boolStringify(allowAligned)+")"+"\n"+ "\t"+"-v , --verbose" +"\t\t"+"Turn all messages on (default "+boolStringify(verbose)+")"+"\n"+ "\t"+"--log [log file]" +"\t"+"Print a tally of merged reads to this log file (default only to stderr)"+"\n"+ "\n\t"+"Paired End merging/Single Read trimming options"+"\n"+ "\t\t"+"You can specify either:"+"\n"+ "\t\t\t"+"--ancientdna"+"\t\t\t"+"ancient DNA (default "+boolStringify(ancientDNA)+")"+"\n"+ "\t\t"+" "+"\t\t\t\t"+"this allows for partial 
overlap"+"\n"+ "\n\t\t"+"or if you know your size length distribution:"+"\n"+ "\t\t\t"+"--loc"+"\t\t\t\t"+"Location for lognormal dist. (default none)"+"\n"+ "\t\t\t"+"--scale"+"\t\t\t\t"+"Scale for lognormal dist. (default none)"+"\n"+ // "\t\t\t\t\t\t\tGood for merging ancient DNA reads into a single sequence\n\n" "\n\t\t"+"--keepOrig"+"\t\t\t\t"+"Write original reads if they are trimmed or merged (default "+boolStringify(keepOrig)+")"+"\n"+ "\t\t\t\t\t\t\tSuch reads will be marked as PCR duplicates\n\n" "\t\t"+"-f , --adapterFirstRead" +"\t\t\t"+"Adapter that is observed after the forward read (def. Multiplex: "+options_adapter_F_BAM.substr(0,30)+")"+"\n"+ "\t\t"+"-s , --adapterSecondRead" +"\t\t"+"Adapter that is observed after the reverse read (def. Multiplex: "+options_adapter_S_BAM.substr(0,30)+")"+"\n"+ "\t\t"+"-c , --FirstReadChimeraFilter" +"\t\t"+"If the forward read looks like this sequence, the cluster is filtered out.\n\t\t\t\t\t\t\tProvide several sequences separated by comma (def. Multiplex: "+options_adapter_chimera_BAM.substr(0,30)+")"+"\n"+ "\t\t"+"-k , --key"+"\t\t\t\t"+"Key sequence with which each sequence starts. Comma separate for forward and reverse reads. (default '"+key+"')"+"\n"+ "\t\t"+"-i , --allowMissing"+"\t\t\t"+"Allow one base in one key to be missing or wrong. 
(default "+boolStringify(allowMissing)+")"+"\n"+ "\t\t"+"-t , --trimCutoff"+"\t\t\t"+"Lowest number of adapter bases to be observed for single Read trimming (default "+stringify(trimCutoff)+")"); if( (argc== 1) || (argc== 2 && string(argv[1]) == "-h") || (argc== 2 && string(argv[1]) == "-help") || (argc== 2 && string(argv[1]) == "--help") ){ cout<<"Usage:"<<endl; cout<<""<<endl; cout<<usage<<endl; return 1; } for(int i=1;i<(argc-1);i++){ //all but the last arg if(strcmp(argv[i],"-fq1") == 0 ){ fastqfile1=string(argv[i+1]); fastqFormat=true; i++; continue; } if(strcmp(argv[i],"-fq2") == 0 ){ fastqfile2=string(argv[i+1]); fastqFormat=true; singleEndModeFQ=false; i++; continue; } if(strcmp(argv[i],"-fqo") == 0 ){ fastqoutfile=string(argv[i+1]); fastqFormat=true; i++; continue; } if(strcmp(argv[i],"--log") == 0 ){ logFileName =string(argv[i+1]); printLog=true; i++; continue; } if(strcmp(argv[i],"-p") == 0 || strcmp(argv[i],"--PIPE") == 0 ){ cerr<<"This version no longer works with pipe, exiting"<<endl; return 1; } if(strcmp(argv[i],"-u") == 0 ){ produceUnCompressedBAM=true; continue; } if(strcmp(argv[i],"--aligned") == 0 ){ allowAligned=true; continue; } if(strcmp(argv[i],"-o") == 0 || strcmp(argv[i],"--outfile") == 0 ){ bamFileOUT =string(argv[i+1]); i++; continue; } if(strcmp(argv[i],"-v") == 0 || strcmp(argv[i],"--verbose") == 0 ){ verbose=true; continue; } if(strcmp(argv[i],"--ancientdna") == 0 ){ ancientDNA=true; continue; } if(strcmp(argv[i],"--keepOrig") == 0 ){ keepOrig=true; continue; } if(strcmp(argv[i],"--loc") == 0 ){ location =destringify<double>(argv[i+1]); i++; continue; } if(strcmp(argv[i],"--scale") == 0 ){ scale =destringify<double>(argv[i+1]); i++; continue; } if(strcmp(argv[i],"-f") == 0 || strcmp(argv[i],"--adapterFirstRead") == 0 ){ adapter_F =string(argv[i+1]); i++; continue; } if(strcmp(argv[i],"-s") == 0 || strcmp(argv[i],"--adapterSecondRead") == 0 ){ adapter_S =string(argv[i+1]); i++; continue; } if(strcmp(argv[i],"-c") == 0 || 
strcmp(argv[i],"--FirstReadChimeraFilter") == 0 ){ adapter_chimera =string(argv[i+1]); i++; continue; } if(strcmp(argv[i],"-k") == 0 || strcmp(argv[i],"--keys") == 0 ){ key =string(argv[i+1]); i++; continue; } if(strcmp(argv[i],"-i") == 0 || strcmp(argv[i],"--allowMissing") == 0 ){ allowMissing=true; continue; } if(strcmp(argv[i],"-t") == 0 || strcmp(argv[i],"--trimCutoff") == 0 ){ trimCutoff=atoi(argv[i+1]); i++; continue; } cerr<<"Unknown option "<<argv[i] <<" exiting"<<endl; return 1; } bamFile=argv[argc-1]; if( (location != -1.0 && scale == -1.0) || (location == -1.0 && scale != -1.0) ){ cerr<<"Cannot specify --location without specifying --scale"<<endl; return 1; } if( (location != -1.0 && scale != -1.0) ){ useDist=true; if(ancientDNA){ cerr<<"Cannot specify --location/--scale and --ancientDNA"<<endl; return 1; } } MergeTrimReads mtr (adapter_F,adapter_S,adapter_chimera, key1,key2, trimCutoff,allowMissing,ancientDNA,location,scale,useDist); fqwriters onereadgroup; if(fastqFormat){ if( bamFileOUT != "" || produceUnCompressedBAM || allowAligned){ cerr<<"ERROR : Cannot specify options like -o, -u or --allowAligned for fastq"<<endl; return 1; } if(fastqfile1 == ""){ cerr<<"ERROR : Must specify as least the first file for fastq"<<endl; return 1; } FastQParser * fqp1; FastQParser * fqp2; if(singleEndModeFQ){ fqp1 = new FastQParser (fastqfile1); string outdirs = fastqoutfile+".fq.gz"; string outdirsf = fastqoutfile+".fail.fq.gz"; onereadgroup.single.open(outdirs.c_str(), ios::out); onereadgroup.singlef.open(outdirsf.c_str(), ios::out); if(!onereadgroup.single.good()){ cerr<<"Cannot write to file "<<outdirs<<endl; return 1; } if(!onereadgroup.singlef.good()){ cerr<<"Cannot write to file "<<outdirsf<<endl; return 1; } }else{ fqp1 = new FastQParser (fastqfile1); fqp2 = new FastQParser (fastqfile2); string outdirs = fastqoutfile+".fq.gz"; string outdir1 = fastqoutfile+"_r1.fq.gz"; string outdir2 = fastqoutfile+"_r2.fq.gz"; string outdirsf = fastqoutfile+".fail.fq.gz"; 
string outdir1f = fastqoutfile+"_r1.fail.fq.gz"; string outdir2f = fastqoutfile+"_r2.fail.fq.gz"; onereadgroup.single.open(outdirs.c_str(), ios::out); onereadgroup.pairr1.open(outdir1.c_str(), ios::out); onereadgroup.pairr2.open(outdir2.c_str(), ios::out); onereadgroup.singlef.open(outdirsf.c_str(), ios::out); onereadgroup.pairr1f.open(outdir1f.c_str(), ios::out); onereadgroup.pairr2f.open(outdir2f.c_str(), ios::out); if(!onereadgroup.single.good()){ cerr<<"Cannot write to file "<<outdirs<<endl; return 1; } if(!onereadgroup.pairr1.good()){ cerr<<"Cannot write to file "<<outdir1<<endl; return 1; } if(!onereadgroup.pairr2.good()){ cerr<<"Cannot write to file "<<outdir2<<endl; return 1; } if(!onereadgroup.singlef.good()){ cerr<<"Cannot write to file "<<outdirsf<<endl; return 1; } if(!onereadgroup.pairr1f.good()){ cerr<<"Cannot write to file "<<outdir1f<<endl; return 1; } if(!onereadgroup.pairr2f.good()){ cerr<<"Cannot write to file "<<outdir2f<<endl; return 1; } } unsigned int totalSeqs=0; while(fqp1->hasData()){ FastQObj * fo1=fqp1->getData(); vector<string> def1=allTokens( *(fo1->getID()), ' ' ); string def1s=def1[0]; FastQObj * fo2; string def2s; string ext2s; if(!singleEndModeFQ){ if(!fqp2->hasData()){ cerr << "ERROR: Discrepency between fastq files at record " << *(fo1->getID()) <<endl; return 1; } fo2=fqp2->getData(); vector<string> def2=allTokens( *(fo2->getID()), ' ' ); def2s=def2[0]; if(strEndsWith(def1s,"/1")){ def1s=def1s.substr(0,def1s.size()-2); } if(strEndsWith(def2s,"/2")){ def2s=def2s.substr(0,def2s.size()-2); } if(strBeginsWith(def1s,"@")){ def1s=def1s.substr(1,def1s.size()-1); } if(strBeginsWith(def2s,"@")){ def2s=def2s.substr(1,def2s.size()-1); } if(def1s != def2s){ cerr << "ERROR: Discrepency between fastq files, different names " << *(fo1->getID()) <<" and "<< *(fo2->getID()) <<endl; return 1; } merged result= mtr.process_PE(*(fo1->getSeq()),*(fo1->getQual()), *(fo2->getSeq()),*(fo2->getQual())); mtr.incrementCountall(); if(result.code != ' '){ 
//keys or chimeras if(result.code == 'K'){ mtr.incrementCountfkey(); }else{ if(result.code == 'D'){ mtr.incrementCountchimera(); }else{ cerr << "leehom: Wrong return code =\""<<result.code<<"\""<<endl; exit(1); } } onereadgroup.pairr2f<<"@"<<def2s<<"/2" <<endl <<*(fo2->getSeq())<<endl<<"+"<<endl <<*(fo2->getQual())<<endl; onereadgroup.pairr1f<<"@"<<def1s<<"/1" <<endl <<*(fo1->getSeq())<<endl<<"+"<<endl <<*(fo1->getQual())<<endl; continue; }else{ if(result.sequence != ""){ //new sequence onereadgroup.single<<"@"<<def1s<<"" <<endl << result.sequence<<endl<<"+"<<endl <<result.quality<<endl; if( result.sequence.length() > max(fo1->getSeq()->length(),fo2->getSeq()->length()) ){ mtr.incrementCountmergedoverlap(); }else{ mtr.incrementCountmerged(); } }else{ //keep as is mtr.incrementCountnothing(); onereadgroup.pairr2<<"@"<<def2s<<"/2" <<endl <<*(fo2->getSeq())<<endl<<"+"<<endl <<*(fo2->getQual())<<endl; onereadgroup.pairr1<<"@"<<def1s<<"/1" <<endl <<*(fo1->getSeq())<<endl<<"+"<<endl <<*(fo1->getQual())<<endl; } } }else{ merged result=mtr.process_SR(*(fo1->getSeq()),*(fo1->getQual())); mtr.incrementCountall(); if(result.code != ' '){ //either chimera or missing key if(result.code == 'K'){ mtr.incrementCountfkey(); }else{ if(result.code == 'D'){ mtr.incrementCountchimera(); }else{ cerr << "leehom: Wrong return code =\""<<result.code<<"\""<<endl; exit(1); } } onereadgroup.singlef<<""<<*(fo1->getID())<<"" <<endl << *(fo1->getSeq())<<endl<<"+"<<endl <<*(fo1->getQual())<<endl; continue; } if(result.sequence != ""){ //new sequence mtr.incrementCounttrimmed(); onereadgroup.single<<""<<*(fo1->getID())<<"" <<endl << result.sequence<<endl<<"+"<<endl <<result.quality<<endl; }else{ mtr.incrementCountnothing(); onereadgroup.single<<""<<*(fo1->getID())<<"" <<endl << *(fo1->getSeq())<<endl<<"+"<<endl <<*(fo1->getQual())<<endl; } } totalSeqs++; } delete fqp1; if(!singleEndModeFQ){ delete fqp2; } if(singleEndModeFQ){ onereadgroup.single.close(); onereadgroup.singlef.close(); }else{ 
onereadgroup.single.close(); onereadgroup.pairr1.close(); onereadgroup.pairr2.close(); onereadgroup.singlef.close(); onereadgroup.pairr1f.close(); onereadgroup.pairr2f.close(); } //fastq }else{ //else BAM // initMerge(); // set_adapter_sequences(adapter_F, // adapter_S, // adapter_chimera); // set_options(trimCutoff,allowMissing,mergeoverlap); if(key != ""){ size_t found=key.find(","); if (found == string::npos){ //single end reads key1=key; key2=""; } else{ //paired-end key1=key.substr(0,found); key2=key.substr(found+1,key.length()-found+1); } } if( bamFileOUT == "" ){ cerr<<"The output must be a be specified, exiting"<<endl; return 1; } if ( !reader.Open(bamFile) ) { cerr << "Could not open input BAM file "<<bamFile << endl; return 1; } SamHeader header = reader.GetHeader(); string pID = "mergeTrimReadsBAM"; string pName = "mergeTrimReadsBAM"; string pCommandLine = ""; for(int i=0;i<(argc);i++){ pCommandLine += (string(argv[i])+" "); } putProgramInHeader(&header,pID,pName,pCommandLine,returnGitHubVersion(string(argv[0]),"..")); const RefVector references = reader.GetReferenceData(); //we will not call bgzip with full compression, good for piping into another program to //lessen the load on the CPU if(produceUnCompressedBAM) writer.SetCompressionMode(BamWriter::Uncompressed); if ( !writer.Open(bamFileOUT,header,references) ) { cerr << "Could not open output BAM file "<<bamFileOUT << endl; return 1; } SamHeader sh=reader.GetHeader(); //Up to the user to be sure that a sequence is followed by his mate // if(!sh.HasSortOrder() || // sh.SortOrder != "queryname"){ // cerr << "Bamfile must be sorted by queryname" << endl; // return 1; // } BamAlignment al; BamAlignment al2; bool al2Null=true; while ( reader.GetNextAlignment(al) ) { if(al.IsMapped() || al.HasTag("NM") || al.HasTag("MD") ){ if(!allowAligned){ cerr << "Reads should not be aligned" << endl; return 1; }else{ //should we remove tags ? 
} } if(al.IsPaired() && al2Null ){ al2=al; al2Null=false; continue; }else{ if(al.IsPaired() && !al2Null){ bool result = mtr.processPair(al,al2); if( result ){//was merged BamAlignment orig; BamAlignment orig2; if(keepOrig){ orig2 = al2; orig = al; } writer.SaveAlignment(al); if(keepOrig){ orig.SetIsDuplicate(true); orig2.SetIsDuplicate(true); writer.SaveAlignment(orig2); writer.SaveAlignment(orig); } //the second record is empty }else{ //keep the sequences as pairs writer.SaveAlignment(al2); writer.SaveAlignment(al); } // // SINGLE END // }else{ BamAlignment orig; if(keepOrig){ orig =al; } mtr.processSingle(al); if(keepOrig){ //write duplicate if(orig.QueryBases.length() != al.QueryBases.length()){ orig.SetIsDuplicate(true); writer.SaveAlignment(orig); } } writer.SaveAlignment(al); } //end single end al2Null=true; }//second pair } //while al reader.Close(); writer.Close(); } //else BAM cerr <<mtr.reportSingleLine()<<endl; if(printLog){ ofstream fileLog; fileLog.open(logFileName.c_str()); if (fileLog.is_open()){ fileLog <<mtr.reportMultipleLines() <<endl; }else{ cerr << "Unable to print to file "<<logFileName<<endl; } fileLog.close(); } return 0; }
int main (int argc, char *argv[]) { string usage=string(""+string(argv[0])+" [in BAM file]"+ "\nThis program reads a BAM file and computes the error rate for each cycle\n"+ // "\nreads and the puts the rest into another bam file.\n"+ // "\nTip: if you do not need one of them, use /dev/null as your output\n"+ // "arguments:\n"+ // "\t"+"--bq [base qual] : Minimum base quality to flag a deaminated site (Default: "+stringify(minBaseQuality)+")\n"+ "\n"); if(argc == 1 || (argc == 2 && (string(argv[0]) == "-h" || string(argv[0]) == "--help") ) ){ cerr << "Usage "<<usage<<endl; return 1; } // for(int i=1;i<(argc-2);i++){ // if(string(argv[i]) == "--bq"){ // minBaseQuality=destringify<int>(argv[i+1]); // i++; // continue; // } // } string bamfiletopen = string( argv[ argc-1 ] ); // string deambam = string( argv[ argc-2 ] ); // string nondeambam = string( argv[ argc-1 ] ); BamReader reader; if ( !reader.Open(bamfiletopen) ) { cerr << "Could not open input BAM file"<< bamfiletopen << endl; return 1; } //iterating over the alignments for these regions BamAlignment al; bool pairedEnd=false; bool firstRead=true; while ( reader.GetNextAlignment(al) ) { if(firstRead){ //reads are either all paired end or single end, I don't allow a mix numberOfCycles=al.QueryBases.size(); // cout<<"numberOfCycles "<<numberOfCycles<<endl; if(al.IsPaired() ){ pairedEnd=true; matches = vector<unsigned int> (2*numberOfCycles,0); mismatches = vector<unsigned int> (2*numberOfCycles,0); typesOfMismatches = vector< vector<unsigned int> >(); for(int i=0;i<12;i++) typesOfMismatches.push_back( vector<unsigned int> (2*numberOfCycles,0) ); }else{ matches = vector<unsigned int> ( numberOfCycles,0); mismatches = vector<unsigned int> ( numberOfCycles,0); typesOfMismatches = vector< vector<unsigned int> >(); for(int i=0;i<12;i++) typesOfMismatches.push_back( vector<unsigned int> ( numberOfCycles,0) ); } firstRead=false; } if( ( pairedEnd && !al.IsPaired()) || ( !pairedEnd && al.IsPaired()) ){ cerr<<"Read 
"<<al.Name<<" is wrong, cannot have a mixture of paired and unpaired read for this program"<<endl; return 1; } //skip unmapped if(!al.IsMapped()) continue; if(numberOfCycles!=int(al.QueryBases.size())){ cerr<<"The length of read "<<al.Name<<" is wrong, should be "<<numberOfCycles<<"bp"<<endl; return 1; } string reconstructedReference = reconstructRef(&al); if(al.Qualities.size() != reconstructedReference.size()){ cerr<<"Quality line is not the same size as the reconstructed reference"<<endl; return 1; } if( pairedEnd ){ if( al.IsFirstMate() ){ //start cycle 0 if( al.IsReverseStrand() ){ increaseCounters(al,reconstructedReference,numberOfCycles-1,-1); //start cycle numberOfCycles-1 }else{ increaseCounters(al,reconstructedReference,0 , 1); //start cycle 0 } }else{ if( al.IsSecondMate() ){ if( al.IsReverseStrand() ){ increaseCounters(al,reconstructedReference,2*numberOfCycles-1,-1); //start cycle 2*numberOfCycles-1 }else{ increaseCounters(al,reconstructedReference,numberOfCycles , 1); //start cycle numberOfCycles } }else{ cerr<<"Reads "<<al.Name<<" must be either first or second mate"<<endl; return 1; } } }else{ //single end if( al.IsReverseStrand() ){ increaseCounters(al,reconstructedReference,numberOfCycles-1,-1); //start cycle numberOfCycles-1 }else{ increaseCounters(al,reconstructedReference,0 , 1); //start cycle 0 } } }//end while each read reader.Close(); cout<<"cycle\tmatches\tmismatches\tmismatches%\tA>C\tA>C%\tA>G\tA>G%\tA>T\tA>T%\tC>A\tC>A%\tC>G\tC>G%\tC>T\tC>T%\tG>A\tG>A%\tG>C\tG>C%\tG>T\tG>T%\tT>A\tT>A%\tT>C\tT>C%\tT>G\tT>G%"<<endl; for(unsigned int i=0;i<matches.size();i++){ cout<<(i+1); if( (matches[i]+mismatches[i]!=0) ) cout<<"\t"<<matches[i]<<"\t"<<mismatches[i]<<"\t"<< 100.0*(double(mismatches[i])/double(matches[i]+mismatches[i])) ; else cout<<"\t"<<matches[i]<<"\t"<<mismatches[i]<<"\tNA"; for(int j=0;j<12;j++){ cout<<"\t"<<typesOfMismatches[j][i]; if( (matches[i]+mismatches[i]!=0) ) 
cout<<"\t"<<100.0*double(typesOfMismatches[j][i])/double(matches[i]+mismatches[i]); else cout<<"\tNA"; } cout<<endl; } return 0; }
virtual void main() { //init QTextStream out(stdout); BamReader reader; NGSHelper::openBAM(reader, getInfile("in")); FastqOutfileStream out1(getOutfile("out1"), false); FastqOutfileStream out2(getOutfile("out2"), false); long long c_unpaired = 0; long long c_paired = 0; int max_cached = 0; //iterate through reads BamAlignment al; QHash<QByteArray, BamAlignment> al_cache; while (reader.GetNextAlignment(al)) { //skip secondary alinments if(!al.IsPrimaryAlignment()) continue; //skip unpaired if(!al.IsPaired()) { ++c_unpaired; continue; } QByteArray name(al.Name.data()); //TODO use QByteArray::fromStdString (when upgraded to Qt5.4) //store cached read when we encounter the mate if (al_cache.contains(name)) { BamAlignment mate = al_cache.take(name); //out << name << " [AL] First: " << al.IsFirstMate() << " Reverse: " << al.IsReverseStrand() << " Seq: " << al.QueryBases.data() << endl; //out << name << " [MA] First: " << mate.IsFirstMate() << " Reverse: " << mate.IsReverseStrand() << " Seq: " << mate.QueryBases.data() << endl; if (al.IsFirstMate()) { write(out1, al, al.IsReverseStrand()); write(out2, mate, mate.IsReverseStrand()); } else { write(out1, mate, mate.IsReverseStrand()); write(out2, al, al.IsReverseStrand()); } ++c_paired; } //cache read for later retrieval else { al_cache.insert(name, al); } max_cached = std::max(max_cached, al_cache.size()); } reader.Close(); out1.close(); out2.close(); //write debug output out << "Pair reads (written) : " << c_paired << endl; out << "Unpaired reads (skipped) : " << c_unpaired << endl; out << "Unmatched paired reads (skipped): " << al_cache.size() << endl; out << endl; out << "Maximum cached reads : " << max_cached << endl; }
/**
 * Accumulates genome coverage from every mapped alignment of a BAM file.
 *
 * The BAM reference list becomes the "genome file". Each mapped alignment
 * adds coverage according to the active mode, checked in this order:
 *   1. _pair_chip_  : first mates extend coverage to their mate,
 *   2. _haveSize    : a fixed _fragmentSize window anchored on the read,
 *   3. _only_5p_end / _only_3p_end : a single position,
 *   4. otherwise    : the alignment's blocks (split on D, and on N if _obeySplits).
 * A change of chromosome name triggers StartNewChrom(); the final chromosome
 * is reported after the loop, followed by empty chromosomes and the overall
 * coverage.
 *
 * Exits the process with status 1 if the BAM file cannot be opened.
 *
 * @param bamFile path of the BAM file to read.
 */
void BedGenomeCoverage::CoverageBam(string bamFile) {

    ResetChromCoverage();

    // open the BAM file
    BamReader bamReader;
    if (!bamReader.Open(bamFile)) {
        cerr << "Failed to open BAM file " << bamFile << endl;
        exit(1);
    }

    // header text and reference sequences
    string headerText = bamReader.GetHeaderText();
    RefVector references = bamReader.GetReferenceData();

    // the BAM references serve as the BEDTools "genome file"
    _genome = new GenomeFile(references);

    BamAlignment aln;
    while (bamReader.GetNextAlignment(aln)) {

        // unaligned reads contribute nothing
        if (aln.IsMapped() == false) {
            continue;
        }

        // effective strand: with _dUTP the second mate of a pair whose mate
        // is mapped is treated as lying on the opposite strand
        bool effectiveReverse = aln.IsReverseStrand();
        if (_dUTP && aln.IsPaired() && aln.IsMateMapped() && aln.IsSecondMate()) {
            effectiveReverse = !aln.IsReverseStrand();
        }

        // honour the strand filter, if any
        const bool wantReverse = (_requestedStrand == "-");
        if ((_filterByStrand == true) && (wantReverse != effectiveReverse)) {
            continue;
        }

        // chromosome and inclusive start/end of the alignment
        string chrom(references.at(aln.RefID).RefName);
        CHRPOS start = aln.Position;
        CHRPOS end   = aln.GetEndPosition(false, false) - 1;

        // moved on to another chromosome?
        if (chrom != _currChromName) {
            StartNewChrom(chrom);
        }

        if (_pair_chip_) {
            // paired reads must be a proper pair with a mapped mate
            if (aln.IsPaired() && !(aln.IsProperPair() && aln.IsMateMapped())) {
                continue;
            }

            // expected layout: left read on the forward strand, right read on
            // the reverse strand; anything else is skipped
            const bool leftReadIsReverse = (aln.Position < aln.MatePosition) && aln.IsReverseStrand();
            const bool leftMateIsReverse = (aln.MatePosition < aln.Position) && aln.IsMateReverseStrand();
            if (leftReadIsReverse || leftMateIsReverse) {
                continue;
            }

            // NOTE: a variant that centred a _fragmentSize window on the pair
            // midpoint when _haveSize was set used to live here, commented out.

            // only first mates add coverage in this mode
            if (aln.IsFirstMate()) {
                if (aln.IsReverseStrand()) {
                    // prolong to the mate on the left
                    AddCoverage(aln.MatePosition, end);
                }
                else if (aln.IsMateReverseStrand()) {
                    // prolong to the mate on the right
                    AddCoverage(start, start + abs(aln.InsertSize) - 1);
                }
            }
        }
        else if (_haveSize) {
            if (!aln.IsReverseStrand()) {
                AddCoverage(start, start + _fragmentSize - 1);
            }
            else if (end < _fragmentSize) {
                // the fragment would start before position 0: clip it
                AddCoverage(0, end);
            }
            else {
                AddCoverage(end + 1 - _fragmentSize, end);
            }
        }
        else if (_only_5p_end) {
            CHRPOS fivePrime = aln.IsReverseStrand() ? end : start;
            AddCoverage(fivePrime, fivePrime);
        }
        else if (_only_3p_end) {
            CHRPOS threePrime = aln.IsReverseStrand() ? start : end;
            AddCoverage(threePrime, threePrime);
        }
        else {
            // whole-alignment coverage: always split on "D" CIGAR ops and
            // additionally on "N" ops when the user asked for -split
            bedVector blocks;
            if (_obeySplits) {
                GetBamBlocks(aln, references.at(aln.RefID).RefName, blocks, true, true);
            }
            else {
                GetBamBlocks(aln, references.at(aln.RefID).RefName, blocks, true, false);
            }
            AddBlockedCoverage(blocks);
        }
    }

    // close the BAM
    bamReader.Close();

    // flush the results of the last chromosome
    ReportChromCoverage(_currChromCoverage, _currChromSize, _currChromName, _currChromDepthHist);

    // chromosomes without any alignment
    PrintEmptyChromosomes();

    // overall coverage, if requested
    PrintFinalCoverage();
}
int main (int argc, char *argv[]) { // bool mapped =false; // bool unmapped=false; int bpToDecrease5=1; int bpToDecrease3=2; const string usage=string(string(argv[0])+" [options] input.bam out.bam"+"\n\n"+ "\tThis program takes a BAM file as input and produces\n"+ "\tanother where the putative deaminated bases have\n"+ "\ta base quality score of "+intStringify(baseQualForDeam)+"\n"+ "\tgiven an "+intStringify(offset)+" offset \n"+ "\n"+ "\tOptions:\n"+ "\t\t"+"-n5" +"\t\t\t"+"Decrease the nth bases surrounding the 5' ends (Default:"+stringify(bpToDecrease5)+") "+"\n"+ "\t\t"+"-n3" +"\t\t\t"+"Decrease the nth bases surrounding the 3' ends (Default:"+stringify(bpToDecrease3)+") "+"\n" ); // "\t"+"-u , --unmapped" +"\n\t\t"+"For an unmapped bam file"+"\n"+ // "\t"+"-m , --mapped" +"\n\t\t"+"For an mapped bam file"+"\n"); if( (argc== 1) || (argc== 2 && string(argv[1]) == "-h") || (argc== 2 && string(argv[1]) == "-help") || (argc== 2 && string(argv[1]) == "--help") ){ cout<<"Usage:"<<endl; cout<<usage<<endl; cout<<""<<endl; return 1; } for(int i=1;i<(argc-2);i++){ //all but the last arg if( string(argv[i]) == "-n5" ){ bpToDecrease5 = destringify<int>(argv[i+1]); i++; continue; } if( string(argv[i]) == "-n3" ){ bpToDecrease3 = destringify<int>(argv[i+1]); i++; continue; } cerr<<"Unknown option "<<argv[i] <<" exiting"<<endl; return 1; } if(argc < 3){ cerr<<"Error: Must specify the input and output BAM files"; return 1; } string inbamFile =argv[argc-2]; string outbamFile=argv[argc-1]; if(inbamFile == outbamFile){ cerr<<"Input and output files are the same"<<endl; return 1; } // if(!mapped && !unmapped){ // cerr << "Please specify whether you reads are mapped or unmapped" << endl; // return 1; // } // if(mapped && unmapped){ // cerr << "Please specify either mapped or unmapped but not both" << endl; // return 1; // } BamReader reader; if ( !reader.Open(inbamFile) ) { cerr << "Could not open input BAM files." 
<< endl; return 1; } vector<RefData> testRefData=reader.GetReferenceData(); SamHeader header = reader.GetHeader(); string pID = "decrQualDeaminated"; string pName = "decrQualDeaminated"; string pCommandLine = ""; for(int i=0;i<(argc);i++){ pCommandLine += (string(argv[i])+" "); } putProgramInHeader(&header,pID,pName,pCommandLine); const RefVector references = reader.GetReferenceData(); BamWriter writer; if ( !writer.Open(outbamFile, header, references) ) { cerr << "Could not open output BAM file" << endl; return 1; } BamAlignment al; // BamAlignment al2; // bool al2Null=true; while ( reader.GetNextAlignment(al) ) { if(al.IsPaired() ){ if(al.IsFirstMate() ){ //5' end, need to check first base only if(al.IsReverseStrand()){ // if(!al.IsMapped()){ cerr << "Cannot have reverse complemented unmapped reads :" <<al.Name<< endl; //return 1; } int indexToCheck; //5' of first mate reversed indexToCheck=al.QueryBases.length()-1; for(int i=0;i<bpToDecrease5;i++){ if(toupper(al.QueryBases[indexToCheck]) == 'A'){ al.Qualities[indexToCheck]=char(offset+baseQualForDeam); } indexToCheck=max(indexToCheck-1,0); } }else{ int indexToCheck; //5' of first mate indexToCheck=0; for(int i=0;i<bpToDecrease5;i++){ //first base if(toupper(al.QueryBases[indexToCheck]) == 'T'){ al.Qualities[indexToCheck]=char(offset+baseQualForDeam); } indexToCheck=min(indexToCheck+1,int(al.Qualities.size())); } } }else{ //3' end, need to check last two bases only if( al.IsSecondMate() ){ if(al.IsReverseStrand()){ // if(!al.IsMapped()){ cerr << "Cannot have reverse complemented unmapped reads :" <<al.Name<< endl; //return 1; } int indexToCheck; //3' of second mate reversed indexToCheck=al.QueryBases.length()-1; for(int i=0;i<bpToDecrease3;i++){ if(toupper(al.QueryBases[indexToCheck]) == 'T'){ al.Qualities[indexToCheck]=char(offset+baseQualForDeam); } indexToCheck=max(indexToCheck-1,0); } }else{ int indexToCheck; //3' of second mate forward indexToCheck=0; for(int i=0;i<bpToDecrease3;i++){ //first base 
if(toupper(al.QueryBases[indexToCheck]) == 'A'){ al.Qualities[indexToCheck]=char(offset+baseQualForDeam); } indexToCheck=min(indexToCheck+1,int(al.Qualities.size())); } } }else{ cerr << "Wrong state" << endl; return 1; } } }//end of paired end else{//we consider single reads to have been sequenced from 5' to 3' if(al.IsReverseStrand()){ //need to consider if(!al.IsMapped()){ cerr << "Cannot have reverse complemented unmapped reads :" <<al.Name<< endl; //return 1; } int indexToCheck; //5' of single read reversed indexToCheck=al.QueryBases.length()-1; for(int i=0;i<bpToDecrease5;i++){ if(toupper(al.QueryBases[indexToCheck]) == 'A'){ al.Qualities[indexToCheck]=char(offset+baseQualForDeam); } indexToCheck=max(indexToCheck-1,0); } //3' of single read reversed indexToCheck=0; for(int i=0;i<bpToDecrease3;i++){ //first base if(toupper(al.QueryBases[indexToCheck]) == 'A'){ al.Qualities[indexToCheck]=char(offset+baseQualForDeam); } indexToCheck=min(indexToCheck+1,int(al.Qualities.size())); } }else{ int indexToCheck; //5' of single read indexToCheck=0; for(int i=0;i<bpToDecrease5;i++){ //first base if(toupper(al.QueryBases[indexToCheck]) == 'T'){ al.Qualities[indexToCheck]=char(offset+baseQualForDeam); } indexToCheck=min(indexToCheck+1,int(al.Qualities.size())); } //3' of single read indexToCheck=al.QueryBases.length()-1; for(int i=0;i<bpToDecrease3;i++){ if(toupper(al.QueryBases[indexToCheck]) == 'T'){ al.Qualities[indexToCheck]=char(offset+baseQualForDeam); } indexToCheck=max(indexToCheck-1,0); } } }//end of single end writer.SaveAlignment(al); }// while ( reader.GetNextAlignment(al) ) { reader.Close(); writer.Close(); cerr<<"Program terminated gracefully"<<endl; return 0; }
int main (int argc, char *argv[]) { if(argc != 4){ cerr<<"This program strips the mapping information and cuts sequences"<<endl; cerr<<"Usage "<<argv[0]<<" [bam file in] [bam file out] [distribution]"<<endl; cerr<<"The distribution is one per line"<<endl; return 1; } string bamfiletopen = string(argv[1]); string bamfiletwrite = string(argv[2]); string fileDist = string(argv[3]); igzstream myFile; string line; vector<int> distToUse; myFile.open(fileDist.c_str(), ios::in); if (myFile.good()){ while ( getline (myFile,line)){ distToUse.push_back( destringify<int>(line) ); } myFile.close(); }else{ cerr << "Unable to open file "<<fileDist<<endl; return 1; } cerr<<"Read "<<distToUse.size()<<" data points "<<endl; cerr<<"Reading "<<bamfiletopen<<" writing to "<<bamfiletwrite<<endl; BamReader reader; BamWriter writer; if ( !reader.Open(bamfiletopen) ) { cerr << "Could not open input BAM files." << endl; return 1; } SamHeader myHeader=reader.GetHeader(); SamProgram sp; string pID = "removeTagsMapping"; string pName = "removeTagsMapping"; string pCommandLine = ""; for(int i=0;i<(argc);i++){ pCommandLine += (string(argv[i])+" "); } putProgramInHeader(&myHeader,pID,pName,pCommandLine,returnGitHubVersion(string(argv[0]),".")); //no @SQ myHeader.Sequences.Clear(); vector< RefData > emptyRefVector; if( !writer.Open(bamfiletwrite,myHeader,emptyRefVector ) ) { cerr << "Could not open output BAM file "<<bamfiletwrite << endl; return 1; } unsigned int readsTotal=0; BamAlignment al; while ( reader.GetNextAlignment(al) ) { //deleting tag data al.TagData=""; //reset the flag // if(al.IsPaired()){ // if(al.IsFirstMate()){ // al.AlignmentFlag = flagFirstPair; // }else{ // al.AlignmentFlag = flagSecondPair; // } // }else{ // } if(al.IsPaired()){ if(al.IsFirstMate()){ al.Name = al.Name+"/1"; }else{ al.Name = al.Name+"/2"; } } al.AlignmentFlag = flagSingleReads; //no ref or positon al.RefID=-1; al.MateRefID=-1; al.Position=-1; al.MatePosition=-1; //no insert size al.InsertSize=0; //no cigar 
al.CigarData.clear(); //no mapping quality al.MapQuality=0; int length = distToUse[ randomInt(0,distToUse.size()-1) ]; length = MIN( length, al.Length); al.QueryBases = al.QueryBases.substr(0,length); al.Qualities = al.Qualities.substr( 0,length); writer.SaveAlignment(al); readsTotal++; } reader.Close(); writer.Close(); cerr<<"Program "<<argv[0]<<" terminated gracefully, looked at "<<readsTotal<<endl; return 0; }