/** * Main work method. Reads the BAM file once and collects sorted information about * the 5' ends of both ends of each read (or just one end in the case of pairs). * Then makes a pass through those determining duplicates before re-reading the * input file and writing it out with duplication flags set correctly. */ int MarkDuplicates::runInternal() { ogeNameThread("am_MarkDuplicates"); if(verbose) cerr << "Reading input file and constructing read end information." << endl; buildSortedReadEndLists(); generateDuplicateIndexes(); if(verbose) cerr << "Marking " << numDuplicateIndices << " records as duplicates." << endl; BamReader in; in.Open(getBufferFileName()); // Now copy over the file while marking all the necessary indexes as duplicates long recordInFileIndex = 0; long written = 0; while (true) { BamAlignment * prec = in.GetNextAlignment(); if(!prec) break; if (prec->IsPrimaryAlignment()) { if (duplicateIndexes.count(recordInFileIndex) == 1) prec->SetIsDuplicate(true); else prec->SetIsDuplicate(false); } recordInFileIndex++; if (removeDuplicates && prec->IsDuplicate()) { // do nothing } else { putOutputAlignment(prec); if (verbose && read_count && ++written % 100000 == 0) { cerr << "\rWritten " << written << " records (" << written * 100 / read_count <<"%)." << std::flush; } } } if (verbose && read_count) cerr << "\rWritten " << written << " records (" << written * 100 / read_count <<"%)." << endl; in.Close(); remove(getBufferFileName().c_str()); return 0; }
int DataStatisticsTool::Execute() { // iterate over reads in BAM file(s) BamAlignment alignObj; while(bamReader.GetNextAlignment(alignObj)) { if (alignObj.IsDuplicate()) continue; if (alignObj.IsFailedQC()) continue; if (!alignObj.IsMapped()) continue; if (!alignObj.IsPrimaryAlignment()) continue; if (alignObj.IsPaired() && !alignObj.IsProperPair()) continue; if (alignObj.IsPaired() && !alignObj.IsMateMapped()) continue; if (!alignObj.HasTag("MD")) continue; // // debug // GenericBamAlignmentTools::printBamAlignmentCigar(alignObj); // GenericBamAlignmentTools::printBamAlignmentMD(alignObj); // shift InDel GenericBamAlignmentTools::leftShiftInDel(alignObj); // // debug // GenericBamAlignmentTools::printBamAlignmentCigar(alignObj); // GenericBamAlignmentTools::printBamAlignmentMD(alignObj); // get the alignment sequences string alignRead; string alignGenome; GenericBamAlignmentTools::getAlignmentSequences(alignObj, alignRead, alignGenome); // update the statistics statistics.update(alignRead, alignGenome); } // print to screen cout << statistics << endl; // statistics.printMatchMismatch(); // close BAM reader bamReader.Close(); // close Fasta genomeFasta.Close(); return 1; }
bool check(const PropertyFilter& filter, const BamAlignment& al) { bool keepAlignment = true; const PropertyMap& properties = filter.Properties; PropertyMap::const_iterator propertyIter = properties.begin(); PropertyMap::const_iterator propertyEnd = properties.end(); for ( ; propertyIter != propertyEnd; ++propertyIter ) { // check alignment data field depending on propertyName const string& propertyName = (*propertyIter).first; const PropertyFilterValue& valueFilter = (*propertyIter).second; if ( propertyName == ALIGNMENTFLAG_PROPERTY ) keepAlignment &= valueFilter.check(al.AlignmentFlag); else if ( propertyName == CIGAR_PROPERTY ) { stringstream cigarSs; const vector<CigarOp>& cigarData = al.CigarData; if ( !cigarData.empty() ) { vector<CigarOp>::const_iterator cigarBegin = cigarData.begin(); vector<CigarOp>::const_iterator cigarIter = cigarBegin; vector<CigarOp>::const_iterator cigarEnd = cigarData.end(); for ( ; cigarIter != cigarEnd; ++cigarIter ) { const CigarOp& op = (*cigarIter); cigarSs << op.Length << op.Type; } keepAlignment &= valueFilter.check(cigarSs.str()); } } else if ( propertyName == INSERTSIZE_PROPERTY ) keepAlignment &= valueFilter.check(al.InsertSize); else if ( propertyName == ISDUPLICATE_PROPERTY ) keepAlignment &= valueFilter.check(al.IsDuplicate()); else if ( propertyName == ISFAILEDQC_PROPERTY ) keepAlignment &= valueFilter.check(al.IsFailedQC()); else if ( propertyName == ISFIRSTMATE_PROPERTY ) keepAlignment &= valueFilter.check(al.IsFirstMate()); else if ( propertyName == ISMAPPED_PROPERTY ) keepAlignment &= valueFilter.check(al.IsMapped()); else if ( propertyName == ISMATEMAPPED_PROPERTY ) keepAlignment &= valueFilter.check(al.IsMateMapped()); else if ( propertyName == ISMATEREVERSESTRAND_PROPERTY ) keepAlignment &= valueFilter.check(al.IsMateReverseStrand()); else if ( propertyName == ISPAIRED_PROPERTY ) keepAlignment &= valueFilter.check(al.IsPaired()); else if ( propertyName == ISPRIMARYALIGNMENT_PROPERTY ) keepAlignment &= valueFilter.check(al.IsPrimaryAlignment()); else if ( propertyName == ISPROPERPAIR_PROPERTY ) keepAlignment &= valueFilter.check(al.IsProperPair()); else if ( propertyName == ISREVERSESTRAND_PROPERTY ) keepAlignment &= valueFilter.check(al.IsReverseStrand()); else if ( propertyName == ISSECONDMATE_PROPERTY ) keepAlignment &= valueFilter.check(al.IsSecondMate()); else if ( propertyName == ISSINGLETON_PROPERTY ) { const bool isSingleton = al.IsPaired() && al.IsMapped() && !al.IsMateMapped(); keepAlignment &= valueFilter.check(isSingleton); } else if ( propertyName == MAPQUALITY_PROPERTY ) keepAlignment &= valueFilter.check(al.MapQuality); else if ( propertyName == MATEPOSITION_PROPERTY ) keepAlignment &= ( al.IsPaired() && al.IsMateMapped() && valueFilter.check(al.MateRefID) ); else if ( propertyName == MATEREFERENCE_PROPERTY ) { if ( !al.IsPaired() || !al.IsMateMapped() ) return false; BAMTOOLS_ASSERT_MESSAGE( (al.MateRefID>=0 && (al.MateRefID<(int)filterToolReferences.size())), "Invalid MateRefID"); const string& refName = filterToolReferences.at(al.MateRefID).RefName; keepAlignment &= valueFilter.check(refName); } else if ( propertyName == NAME_PROPERTY ) keepAlignment &= valueFilter.check(al.Name); else if ( propertyName == POSITION_PROPERTY ) keepAlignment &= valueFilter.check(al.Position); else if ( propertyName == QUERYBASES_PROPERTY ) keepAlignment &= valueFilter.check(al.QueryBases); else if ( propertyName == REFERENCE_PROPERTY ) { BAMTOOLS_ASSERT_MESSAGE( (al.RefID>=0 && (al.RefID<(int)filterToolReferences.size())), "Invalid RefID"); const string& refName = filterToolReferences.at(al.RefID).RefName; keepAlignment &= valueFilter.check(refName); } else if ( propertyName == TAG_PROPERTY ) keepAlignment &= checkAlignmentTag(valueFilter, al); else BAMTOOLS_ASSERT_UNREACHABLE; // if alignment fails at ANY point, just quit and return false if ( !keepAlignment ) return false; } BAMTOOLS_ASSERT_MESSAGE( keepAlignment, "Error in BamAlignmentChecker... keepAlignment should be true here"); return keepAlignment; }
virtual void main() { //init QTextStream out(stdout); BamReader reader; NGSHelper::openBAM(reader, getInfile("in")); FastqOutfileStream out1(getOutfile("out1"), false); FastqOutfileStream out2(getOutfile("out2"), false); long long c_unpaired = 0; long long c_paired = 0; int max_cached = 0; //iterate through reads BamAlignment al; QHash<QByteArray, BamAlignment> al_cache; while (reader.GetNextAlignment(al)) { //skip secondary alinments if(!al.IsPrimaryAlignment()) continue; //skip unpaired if(!al.IsPaired()) { ++c_unpaired; continue; } QByteArray name(al.Name.data()); //TODO use QByteArray::fromStdString (when upgraded to Qt5.4) //store cached read when we encounter the mate if (al_cache.contains(name)) { BamAlignment mate = al_cache.take(name); //out << name << " [AL] First: " << al.IsFirstMate() << " Reverse: " << al.IsReverseStrand() << " Seq: " << al.QueryBases.data() << endl; //out << name << " [MA] First: " << mate.IsFirstMate() << " Reverse: " << mate.IsReverseStrand() << " Seq: " << mate.QueryBases.data() << endl; if (al.IsFirstMate()) { write(out1, al, al.IsReverseStrand()); write(out2, mate, mate.IsReverseStrand()); } else { write(out1, mate, mate.IsReverseStrand()); write(out2, al, al.IsReverseStrand()); } ++c_paired; } //cache read for later retrieval else { al_cache.insert(name, al); } max_cached = std::max(max_cached, al_cache.size()); } reader.Close(); out1.close(); out2.close(); //write debug output out << "Pair reads (written) : " << c_paired << endl; out << "Unpaired reads (skipped) : " << c_unpaired << endl; out << "Unmatched paired reads (skipped): " << al_cache.size() << endl; out << endl; out << "Maximum cached reads : " << max_cached << endl; }