int main(int argc, char* argv[]) { // validate argument count if( argc != 2 ) { cerr << "USAGE: " << argv[0] << " <input BAM file> " << endl; return EXIT_FAILURE; } string filename = argv[1]; //cerr << "Printing alignments from file: " << filename << endl; BamReader reader; if (!reader.Open(filename)) { cerr << "could not open filename " << filename << endl; return EXIT_FAILURE; } cerr << filename << ": Done opening" << endl; // Header can't be used to accurately determine sort order because samtools never // changes it; instead, check after loading each read as is done with "samtools index" // We don't need to load an index (right?) // if (!reader.LocateIndex()) { // const string index_filename = filename + ".bai"; // if (!reader.OpenIndex(index_filename)) { // cerr << "could not open index" << endl; // } // } const SamHeader header = reader.GetHeader(); cerr << filename << ": Done getting header" << endl; const RefVector refs = reader.GetReferenceData(); cerr << filename << ": Done getting reference data" << endl; BamWriter writer; if (! output_bam_filename.empty()) { if (! writer.Open(output_bam_filename, header, refs)) { cerr << "Could not open BAM output file " << output_bam_filename << endl; return EXIT_FAILURE; } cerr << filename << ": Done opening BAM output file " << output_bam_filename << endl; } alignmentMap read1Map; // a single map, for all reads awaiting their mate typedef map<string,int32_t> stringMap; typedef stringMap::iterator stringMapI; stringMap ref_mates; // alignmentMap read1Map, read2Map; BamAlignment full_al; int32_t count = 0; uint32_t max_reads_in_map = 0; int32_t n_reads_skipped_unmapped = 0; int32_t n_reads_skipped_mate_unmapped = 0; int32_t n_reads_skipped_wont_see_mate = 0; int32_t n_reads_skipped_mate_tail_est = 0; int32_t n_reads_skipped_ref_mate = 0; int32_t n_reads = 0; int32_t n_singleton_reads = 0; int32_t last_RefID = -1; int32_t last_Position = -1; cerr << filename << ": Looking for up to " << pairs_to_process << " link pairs," << " total tail = " << link_pair_total_tail << " critical tail = " << link_pair_crit_tail << ", must be on diff chromosome = " << link_pair_diff_chrom << endl; while (reader.GetNextAlignment(full_al) && (! pairs_to_process || count < pairs_to_process)) { BamAlignment al = full_al; //printAlignmentInfo(al, refs); //++count; ++n_reads; if (last_RefID < 0) last_RefID = al.RefID; if (last_Position < 0) last_Position = al.Position; if (al.RefID > last_RefID) { // We've moved to the next reference sequence // Clean up reads with mates expected here that haven't been seen if (debug_ref_mate) { cerr << "MISSED " << ref_mates.size() << " ref_mates on this reference " << last_RefID << " " << refs[last_RefID].RefName << endl; } for (stringMapI rmI = ref_mates.begin(); rmI != ref_mates.end(); ++rmI) { ++n_reads_skipped_ref_mate; read1Map.erase(read1Map.find(rmI->first)); ref_mates.erase(ref_mates.find(rmI->first)); } last_RefID = al.RefID; last_Position = al.Position; } else if (! isCoordinateSorted(al.RefID, al.Position, last_RefID, last_Position)) { cerr << filename << " is not sorted, " << al.Name << " out of position" << endl; return EXIT_FAILURE; } if (! al.IsMapped()) { ++n_reads_skipped_unmapped; continue; } if (! al.IsMateMapped()) { ++n_reads_skipped_mate_unmapped; continue; } alignmentMapI mI = read1Map.find(al.Name); if (mI == read1Map.end()) { // the read name has not been seen before if (al.MateRefID < al.RefID || (al.MateRefID == al.RefID && al.MatePosition < al.Position)) { // we should have seen its mate earlier, so skip it ++n_reads_skipped_wont_see_mate; continue; } // If the mate likely to also be a link pair candidate, add the read int32_t mate_tail_est = readTailS(al.IsMateMapped(), al.IsMateReverseStrand(), al.MatePosition, refs[al.MateRefID].RefLength, max_read_length); if (mate_tail_est <= mate_tail_est_crit) { // the mate tail estimate suggests it might be a link pair candidate read1Map[al.Name] = al; // add the read to the map } else { // the mate tail estimate appears too long for the mate to be a candidate ++n_reads_skipped_mate_tail_est; continue; } if (read1Map.size() > max_reads_in_map) max_reads_in_map = read1Map.size(); if (al.MateRefID == al.RefID && al.MatePosition >= al.Position) { // the mate is expected later on this contig ref_mates[al.Name] = al.MateRefID; } } else { // get the mate's alignment, and process the pair const BamAlignment& al_mate = mI->second; if (processReadPair(al, al_mate, refs, link_pair_total_tail, link_pair_crit_tail, link_pair_diff_chrom)) { ++count; // write to the new BAM file, if the string is not empty if (! output_bam_filename.empty()) { writer.SaveAlignment(al_mate); // the first one seen writer.SaveAlignment(al); // the second one seen } } read1Map.erase(mI); if (al.MateRefID == al.RefID) { stringMapI rmI = ref_mates.find(al.Name); if (rmI == ref_mates.end()) { cerr << "expected a ref_mate, couldn't find its name: " << al.Name << endl; return EXIT_FAILURE; } ref_mates.erase(rmI); } } } cerr << "===============================" << endl; cerr << read1Map.size() << " alignments left in read1Map" << endl; cerr << max_reads_in_map << " maximum number of reads in read1Map" << endl; cerr << count << " pairs processed" << endl; cerr << "===============================" << endl; cerr << n_reads << " total reads" << endl; cerr << n_singleton_reads << " singleton reads" << endl; cerr << n_reads_skipped_unmapped << " reads skipped because unmapped" << endl; cerr << n_reads_skipped_mate_unmapped << " reads skipped because mate unmapped" << endl; cerr << n_reads_skipped_wont_see_mate << " reads skipped because mate won't be seen" << endl; cerr << n_reads_skipped_mate_tail_est << " reads skipped because mate tail appears too long" << endl; cerr << n_reads_skipped_ref_mate << " reads skipped because mate not on reference" << endl; reader.Close(); if (! output_bam_filename.empty()) { writer.Close(); } return EXIT_SUCCESS; }
bool check(const PropertyFilter& filter, const BamAlignment& al) { bool keepAlignment = true; const PropertyMap& properties = filter.Properties; PropertyMap::const_iterator propertyIter = properties.begin(); PropertyMap::const_iterator propertyEnd = properties.end(); for ( ; propertyIter != propertyEnd; ++propertyIter ) { // check alignment data field depending on propertyName const string& propertyName = (*propertyIter).first; const PropertyFilterValue& valueFilter = (*propertyIter).second; if ( propertyName == ALIGNMENTFLAG_PROPERTY ) keepAlignment &= valueFilter.check(al.AlignmentFlag); else if ( propertyName == CIGAR_PROPERTY ) { stringstream cigarSs; const vector<CigarOp>& cigarData = al.CigarData; if ( !cigarData.empty() ) { vector<CigarOp>::const_iterator cigarBegin = cigarData.begin(); vector<CigarOp>::const_iterator cigarIter = cigarBegin; vector<CigarOp>::const_iterator cigarEnd = cigarData.end(); for ( ; cigarIter != cigarEnd; ++cigarIter ) { const CigarOp& op = (*cigarIter); cigarSs << op.Length << op.Type; } keepAlignment &= valueFilter.check(cigarSs.str()); } } else if ( propertyName == INSERTSIZE_PROPERTY ) keepAlignment &= valueFilter.check(al.InsertSize); else if ( propertyName == ISDUPLICATE_PROPERTY ) keepAlignment &= valueFilter.check(al.IsDuplicate()); else if ( propertyName == ISFAILEDQC_PROPERTY ) keepAlignment &= valueFilter.check(al.IsFailedQC()); else if ( propertyName == ISFIRSTMATE_PROPERTY ) keepAlignment &= valueFilter.check(al.IsFirstMate()); else if ( propertyName == ISMAPPED_PROPERTY ) keepAlignment &= valueFilter.check(al.IsMapped()); else if ( propertyName == ISMATEMAPPED_PROPERTY ) keepAlignment &= valueFilter.check(al.IsMateMapped()); else if ( propertyName == ISMATEREVERSESTRAND_PROPERTY ) keepAlignment &= valueFilter.check(al.IsMateReverseStrand()); else if ( propertyName == ISPAIRED_PROPERTY ) keepAlignment &= valueFilter.check(al.IsPaired()); else if ( propertyName == ISPRIMARYALIGNMENT_PROPERTY ) keepAlignment &= valueFilter.check(al.IsPrimaryAlignment()); else if ( propertyName == ISPROPERPAIR_PROPERTY ) keepAlignment &= valueFilter.check(al.IsProperPair()); else if ( propertyName == ISREVERSESTRAND_PROPERTY ) keepAlignment &= valueFilter.check(al.IsReverseStrand()); else if ( propertyName == ISSECONDMATE_PROPERTY ) keepAlignment &= valueFilter.check(al.IsSecondMate()); else if ( propertyName == ISSINGLETON_PROPERTY ) { const bool isSingleton = al.IsPaired() && al.IsMapped() && !al.IsMateMapped(); keepAlignment &= valueFilter.check(isSingleton); } else if ( propertyName == MAPQUALITY_PROPERTY ) keepAlignment &= valueFilter.check(al.MapQuality); else if ( propertyName == MATEPOSITION_PROPERTY ) keepAlignment &= ( al.IsPaired() && al.IsMateMapped() && valueFilter.check(al.MateRefID) ); else if ( propertyName == MATEREFERENCE_PROPERTY ) { if ( !al.IsPaired() || !al.IsMateMapped() ) return false; BAMTOOLS_ASSERT_MESSAGE( (al.MateRefID>=0 && (al.MateRefID<(int)filterToolReferences.size())), "Invalid MateRefID"); const string& refName = filterToolReferences.at(al.MateRefID).RefName; keepAlignment &= valueFilter.check(refName); } else if ( propertyName == NAME_PROPERTY ) keepAlignment &= valueFilter.check(al.Name); else if ( propertyName == POSITION_PROPERTY ) keepAlignment &= valueFilter.check(al.Position); else if ( propertyName == QUERYBASES_PROPERTY ) keepAlignment &= valueFilter.check(al.QueryBases); else if ( propertyName == REFERENCE_PROPERTY ) { BAMTOOLS_ASSERT_MESSAGE( (al.RefID>=0 && (al.RefID<(int)filterToolReferences.size())), "Invalid RefID"); const string& refName = filterToolReferences.at(al.RefID).RefName; keepAlignment &= valueFilter.check(refName); } else if ( propertyName == TAG_PROPERTY ) keepAlignment &= checkAlignmentTag(valueFilter, al); else BAMTOOLS_ASSERT_UNREACHABLE; // if alignment fails at ANY point, just quit and return false if ( !keepAlignment ) return false; } BAMTOOLS_ASSERT_MESSAGE( keepAlignment, "Error in BamAlignmentChecker... keepAlignment should be true here"); return keepAlignment; }
void BedGenomeCoverage::CoverageBam(string bamFile) { ResetChromCoverage(); // open the BAM file BamReader reader; if (!reader.Open(bamFile)) { cerr << "Failed to open BAM file " << bamFile << endl; exit(1); } // get header & reference information string header = reader.GetHeaderText(); RefVector refs = reader.GetReferenceData(); // load the BAM header references into a BEDTools "genome file" _genome = new GenomeFile(refs); // convert each aligned BAM entry to BED // and compute coverage on B BamAlignment bam; while (reader.GetNextAlignment(bam)) { // skip if the read is unaligned if (bam.IsMapped() == false) continue; bool _isReverseStrand = bam.IsReverseStrand(); //changing second mate's strand to opposite if( _dUTP && bam.IsPaired() && bam.IsMateMapped() && bam.IsSecondMate()) _isReverseStrand = !bam.IsReverseStrand(); // skip if we care about strands and the strand isn't what // the user wanted if ( (_filterByStrand == true) && ((_requestedStrand == "-") != _isReverseStrand) ) continue; // extract the chrom, start and end from the BAM alignment string chrom(refs.at(bam.RefID).RefName); CHRPOS start = bam.Position; CHRPOS end = bam.GetEndPosition(false, false) - 1; // are we on a new chromosome? if ( chrom != _currChromName ) StartNewChrom(chrom); if(_pair_chip_) { // Skip if not a proper pair if (bam.IsPaired() && (!bam.IsProperPair() or !bam.IsMateMapped()) ) continue; // Skip if wrong coordinates if( ( (bam.Position<bam.MatePosition) && bam.IsReverseStrand() ) || ( (bam.MatePosition < bam.Position) && bam.IsMateReverseStrand() ) ) { //chemically designed: left on positive strand, right on reverse one continue; } /*if(_haveSize) { if (bam.IsFirstMate() && bam.IsReverseStrand()) { //put fragmentSize in to the middle of pair end_fragment int mid = bam.MatePosition+abs(bam.InsertSize)/2; if(mid<_fragmentSize/2) AddCoverage(0, mid+_fragmentSize/2); else AddCoverage(mid-_fragmentSize/2, mid+_fragmentSize/2); } else if (bam.IsFirstMate() && bam.IsMateReverseStrand()) { //put fragmentSize in to the middle of pair end_fragment int mid = start+abs(bam.InsertSize)/2; if(mid<_fragmentSize/2) AddCoverage(0, mid+_fragmentSize/2); else AddCoverage(mid-_fragmentSize/2, mid+_fragmentSize/2); } } else */ if (bam.IsFirstMate() && bam.IsReverseStrand()) { //prolong to the mate to the left AddCoverage(bam.MatePosition, end); } else if (bam.IsFirstMate() && bam.IsMateReverseStrand()) { //prolong to the mate to the right AddCoverage(start, start + abs(bam.InsertSize) - 1); } } else if (_haveSize) { if(bam.IsReverseStrand()) { if(end<_fragmentSize) { //sometimes fragmentSize is bigger :( AddCoverage(0, end); } else { AddCoverage(end + 1 - _fragmentSize, end ); } } else { AddCoverage(start,start+_fragmentSize - 1); } } else // add coverage accordingly. if (!_only_5p_end && !_only_3p_end) { bedVector bedBlocks; // we always want to split blocks when a D CIGAR op is found. // if the user invokes -split, we want to also split on N ops. if (_obeySplits) { // "D" true, "N" true GetBamBlocks(bam, refs.at(bam.RefID).RefName, bedBlocks, true, true); } else { // "D" true, "N" false GetBamBlocks(bam, refs.at(bam.RefID).RefName, bedBlocks, true, false); } AddBlockedCoverage(bedBlocks); } else if (_only_5p_end) { CHRPOS pos = ( !bam.IsReverseStrand() ) ? start : end; AddCoverage(pos,pos); } else if (_only_3p_end) { CHRPOS pos = ( bam.IsReverseStrand() ) ? start : end; AddCoverage(pos,pos); } } // close the BAM reader.Close(); // process the results of the last chromosome. ReportChromCoverage(_currChromCoverage, _currChromSize, _currChromName, _currChromDepthHist); // report all empty chromsomes PrintEmptyChromosomes(); // report the overall coverage if asked. PrintFinalCoverage(); }