//{{{ SV_Pair:: SV_Pair(const BamAlignment &bam_a, // if both reads are on the same chrome, then read_l must map before read_r // if the reads are on different strands then read_l must be on the lexo // lesser chrom (using the string.compare() method) SV_Pair:: SV_Pair(const BamAlignment &bam_a, const BamAlignment &bam_b, const RefVector &refs, int _weight, int _ev_id, SV_PairReader *_reader) { reader = _reader; if ( bam_a.MapQuality < bam_b.MapQuality ) min_mapping_quality = bam_a.MapQuality; else min_mapping_quality = bam_b.MapQuality; struct interval tmp_a, tmp_b; tmp_a.start = bam_a.Position; tmp_a.end = bam_a.GetEndPosition(false, false) - 1; tmp_a.chr = refs.at(bam_a.RefID).RefName; if ( bam_a.IsReverseStrand() == true ) tmp_a.strand = '-'; else tmp_a.strand = '+'; tmp_b.start = bam_b.Position; tmp_b.end = bam_b.GetEndPosition(false, false) - 1; tmp_b.chr = refs.at(bam_b.RefID).RefName; if ( bam_b.IsReverseStrand() == true ) tmp_b.strand = '-'; else tmp_b.strand = '+'; //if ( tmp_a.chr.compare(tmp_b.chr) > 0 ) { if ( bam_a.RefID < bam_b.RefID ) { read_l = tmp_a; read_r = tmp_b; //} else if ( tmp_a.chr.compare(tmp_b.chr) < 0 ) { } else if ( bam_a.RefID > bam_b.RefID) { read_l = tmp_b; read_r = tmp_a; } else { // == if (tmp_a.start > tmp_b.start) { read_l = tmp_b; read_r = tmp_a; } else { read_l = tmp_a; read_r = tmp_b; } } weight = _weight; ev_id = _ev_id; }
// Compute coverage of BAM alignments over the intervals in the "B" BED file.
void BedCoverage::CollectCoverageBam(string bamFile) {
    // load the "B" bed file into a map so
    // that we can easily compare "A" to it for overlaps
    _bedB->loadBedCovFileIntoMap();

    // open the BAM file
    BamReader reader;
    if (!reader.Open(bamFile)) {
        // Consistency fix: other BAM tools in this codebase report the
        // failure and exit instead of silently processing zero alignments.
        cerr << "Failed to open BAM file " << bamFile << endl;
        exit(1);
    }

    // get header & reference information
    string header = reader.GetHeaderText();
    RefVector refs = reader.GetReferenceData();

    // convert each aligned BAM entry to BED
    // and compute coverage on B
    BamAlignment bam;
    while (reader.GetNextAlignment(bam)) {
        if (bam.IsMapped()) {
            // treat the BAM alignment as a single "block"
            if (_obeySplits == false) {
                // construct a new BED entry from the current BAM alignment.
                BED a;
                a.chrom = refs.at(bam.RefID).RefName;
                a.start = bam.Position;
                a.end = bam.GetEndPosition(false, false);
                a.strand = "+";
                if (bam.IsReverseStrand()) a.strand = "-";

                _bedB->countHits(a, _sameStrand, _diffStrand, _countsOnly);
            }
            // split the BAM alignment into discrete blocks and
            // look for overlaps only within each block.
            else {
                // vec to store the discrete BED "blocks" from the alignment
                bedVector bedBlocks;
                // NOTE(review): the original comment claimed deletions (D) are
                // split "hence the true for the last parm", but judging by the
                // calls in CoverageBam the 4th argument is breakOnDeletionOps
                // and it is false here — confirm which behavior is intended.
                GetBamBlocks(bam, refs.at(bam.RefID).RefName, bedBlocks, false, true);

                // use countSplitHits to avoid over-counting each split chunk
                // as distinct read coverage.
                _bedB->countSplitHits(bedBlocks, _sameStrand, _diffStrand, _countsOnly);
            }
        }
    }
    // report the coverage (summary or histogram) for BED B.
    if (_countsOnly == true)
        ReportCounts();
    else
        ReportCoverage();
    // close the BAM file
    reader.Close();
}
//{{{ void process_pair(const BamAlignment &curr, void SV_Pair:: process_pair(const BamAlignment &curr, const RefVector refs, map<string, BamAlignment> &mapped_pairs, UCSCBins<SV_BreakPoint*> &r_bin, int weight, int ev_id, SV_PairReader *reader) { if (mapped_pairs.find(curr.Name) == mapped_pairs.end()) mapped_pairs[curr.Name] = curr; else { SV_Pair *new_pair = new SV_Pair(mapped_pairs[curr.Name], curr, refs, weight, ev_id, reader); //cerr << count_clipped(curr.CigarData) << "\t" << //count_clipped(mapped_pairs[curr.Name].CigarData) << endl; if ( new_pair->is_sane() && new_pair->is_aberrant() && (count_clipped(curr.CigarData) > 0) && (count_clipped(mapped_pairs[curr.Name].CigarData) > 0) ) { SV_BreakPoint *new_bp = new_pair->get_bp(); #ifdef TRACE cerr << "READ\t" << refs.at(mapped_pairs[curr.Name].RefID).RefName << "," << mapped_pairs[curr.Name].Position << "," << (mapped_pairs[curr.Name].GetEndPosition(false, false) - 1) << "\t" << refs.at(curr.RefID).RefName << "," << curr.Position << "," << (curr.GetEndPosition(false, false) - 1) << endl; cerr << "\tPE\t" << *new_bp << endl; #endif new_bp->cluster(r_bin); } else { delete(new_pair); } mapped_pairs.erase(curr.Name); } }
long get_ref_lengths(int id, RefVector ref) { long length = 0; for (size_t i = 0; i < (size_t) id && i < ref.size(); i++) { length += (long) ref[i].RefLength + (long) Parameter::Instance()->max_dist; } return length; }
// Build a "genome file" view from BAM header references: record each
// chromosome's length and remember the chromosome order.
GenomeFile::GenomeFile(const RefVector &genome) {
    RefVector::const_iterator it = genome.begin();
    RefVector::const_iterator itEnd = genome.end();
    for (; it != itEnd; ++it) {
        const string chrom = it->RefName;
        const int length = it->RefLength;
        _chromSizes[chrom] = length;
        _chromList.push_back(chrom);
    }
}
// Map a genome-wide linear coordinate back to a chromosome-relative position:
// subtract chromosome lengths until the offset falls inside chromosome i,
// then return the offset within that chromosome.
long Breakpoint::calc_pos(long pos, RefVector ref) {
	size_t i = 0;
	pos -= ref[i].RefLength;
	// Bug fix: the original loop checked "i < ref.size()" BEFORE incrementing
	// i and indexing ref[i], so i could reach ref.size() and read one element
	// past the end. Guarding with i + 1 < ref.size() keeps the index valid;
	// an over-large pos now clamps into the last reference instead of UB.
	while (pos >= 0 && i + 1 < ref.size()) {
		i++;
		pos -= ref[i].RefLength;
	}
	return pos + ref[i].RefLength;
}
void getBamBlocks(const BamAlignment &bam, const RefVector &refs, vector<BED> &blocks, bool breakOnDeletionOps) { CHRPOS currPosition = bam.Position; CHRPOS blockStart = bam.Position; string chrom = refs.at(bam.RefID).RefName; string name = bam.Name; string strand = "+"; string score = ToString(bam.MapQuality); char prevOp = '\0'; if (bam.IsReverseStrand()) strand = "-"; bool blocksFound = false; vector<CigarOp>::const_iterator cigItr = bam.CigarData.begin(); vector<CigarOp>::const_iterator cigEnd = bam.CigarData.end(); for ( ; cigItr != cigEnd; ++cigItr ) { if (cigItr->Type == 'M') { currPosition += cigItr->Length; // we only want to create a new block if the current M op // was preceded by an N op or a D op (and we are breaking on D ops) if ((prevOp == 'D' && breakOnDeletionOps == true) || (prevOp == 'N')) { blocks.push_back( BED(chrom, blockStart, currPosition, name, score, strand) ); blockStart = currPosition; } } else if (cigItr->Type == 'D') { if (breakOnDeletionOps == false) currPosition += cigItr->Length; else { blocksFound = true; currPosition += cigItr->Length; blockStart = currPosition; } } else if (cigItr->Type == 'N') { blocks.push_back( BED(chrom, blockStart, currPosition, name, score, strand) ); blocksFound = true; currPosition += cigItr->Length; blockStart = currPosition; } else if (cigItr->Type == 'S' || cigItr->Type == 'H' || cigItr->Type == 'P' || cigItr->Type == 'I') { // do nothing } else { cerr << "Input error: invalid CIGAR type (" << cigItr->Type << ") for: " << bam.Name << endl; exit(1); } prevOp = cigItr->Type; } // if there were no splits, we just create a block representing the contiguous alignment. if (blocksFound == false) { blocks.push_back( BED(chrom, bam.Position, currPosition, name, score, strand) ); } }
// Convert a genome-wide linear coordinate (built with per-chromosome padding
// of max_dist) back to a chromosome-relative position, reporting the
// chromosome name through `chr`.
// NOTE(review): the function name is unfortunate, but renaming it would touch
// its declaration and every call site, so it is preserved for compatibility.
long fuck_off(long pos, RefVector ref, std::string &chr) {
	size_t i = 0;
	pos -= (ref[i].RefLength + Parameter::Instance()->max_dist);
	// Bug fix: the original checked "i < ref.size()" and then incremented i
	// BEFORE indexing, allowing an out-of-bounds read at ref[ref.size()].
	// Guarding with i + 1 < ref.size() keeps the index valid; an over-large
	// pos now clamps into the last reference instead of reading past the end.
	while (pos >= 0 && i + 1 < ref.size()) {
		i++;
		pos -= ((long) ref[i].RefLength + (long) Parameter::Instance()->max_dist);
	}
	chr = ref[i].RefName;
	return pos + ref[i].RefLength + (long) Parameter::Instance()->max_dist;
}
// Return the name of the chromosome that contains the genome-wide linear
// coordinate `pos`, by walking the reference list and subtracting lengths.
std::string Breakpoint::get_chr(long pos, RefVector ref) {
	size_t id = 0;
	while (id < ref.size() && pos >= 0) {
		pos -= (long) ref[id].RefLength;
		id++;
	}
	// Bug fix: if pos was negative on entry (or ref empty) the loop never
	// ran and the original indexed ref[id - 1] with id == 0 — an
	// out-of-bounds read. Clamp to the first reference instead.
	if (id == 0)
		return ref.empty() ? std::string() : ref[0].RefName;
	return ref[id - 1].RefName;
}
//------------------------------------------------------------------------- void XList::sortByElementNumber(String order){ // Get the number of sessions per speaker LKVector spk(0,0); for(unsigned long i=0;i<_vector.size();i++){ LKVector::type sps; sps.idx = i; sps.lk = _vector.getObject(i).getElementCount(); spk.addValue(sps); } // Sort Xlines of the temporary XList by element number spk.descendingSort(); // Copy the current RefVector<XLine> into a temporary one RefVector<XLine> tmpX; for(unsigned long i=0;i<_vector.size();i++){ XLine *ll = new XLine(_vector.getObject(i)); tmpX.addObject(*ll); } // Remove all elements from the XList _vector.deleteAllObjects(); // Fill the XList according to the number of elements if(order == "descend"){ for(unsigned long i=0;i<tmpX.size();i++){ _vector.addObject(tmpX.getObject(spk[i].idx)); } } else if(order == "ascend"){ for(long i=tmpX.size()-1;i>=0;i--){ _vector.addObject(tmpX.getObject(spk[i].idx)); } } }
void getBamBlocks(const BamAlignment &bam, const RefVector &refs, BedVec &blocks, bool breakOnDeletionOps) { CHRPOS currPosition = bam.Position; CHRPOS blockStart = bam.Position; string chrom = refs.at(bam.RefID).RefName; string name = bam.Name; string strand = "+"; float score = bam.MapQuality; if (bam.IsReverseStrand()) strand = "-"; vector<CigarOp>::const_iterator cigItr = bam.CigarData.begin(); vector<CigarOp>::const_iterator cigEnd = bam.CigarData.end(); for ( ; cigItr != cigEnd; ++cigItr ) { if (cigItr->Type == 'M') { currPosition += cigItr->Length; blocks.push_back( Bed(chrom, blockStart, currPosition, name, score, strand) ); blockStart = currPosition; } else if (cigItr->Type == 'D') { if (breakOnDeletionOps == false) currPosition += cigItr->Length; else { currPosition += cigItr->Length; blockStart = currPosition; } } else if (cigItr->Type == 'N') { currPosition += cigItr->Length; blockStart = currPosition; } else if (cigItr->Type == 'S' || cigItr->Type == 'H' || cigItr->Type == 'P' || cigItr->Type == 'I') { // do nothing } else { cerr << "Input error: invalid CIGAR type (" << cigItr->Type << ") for: " << bam.Name << endl; exit(1); } } }
// Intersect BAM alignments with the "B" BED file, writing either matching
// BAM records (to stdout) or BED-formatted overlaps.
void BedIntersect::IntersectBam(string bamFile) {

    // load the "B" bed file into a map so
    // that we can easily compare "A" to it for overlaps
    _bedB = new BedFile(_bedBFile);
    _bedB->loadBedFileIntoMap();

    // create a dummy BED A file for printing purposes if not
    // using BAM output.
    if (_bamOutput == false) {
        _bedA = new BedFile(_bedAFile);
        _bedA->bedType = 12;
    }

    // open the BAM file
    BamReader reader;
    BamWriter writer;
    if (!reader.Open(bamFile)) {
        // Consistency fix: fail loudly like the other BAM tools instead of
        // silently reading zero alignments from an unopened reader.
        cerr << "Failed to open BAM file " << bamFile << endl;
        exit(1);
    }

    // get header & reference information
    string bamHeader = reader.GetHeaderText();
    RefVector refs = reader.GetReferenceData();

    // open a BAM output to stdout if we are writing BAM
    if (_bamOutput == true) {
        // set compression mode
        BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
        if ( _isUncompressedBam )
            compressionMode = BamWriter::Uncompressed;
        writer.SetCompressionMode(compressionMode);
        // open our BAM writer
        writer.Open("stdout", bamHeader, refs);
    }

    vector<BED> hits;
    // reserve some space
    hits.reserve(100);

    BamAlignment bam;
    // get each set of alignments for each pair.
    while (reader.GetNextAlignment(bam)) {

        // save an unaligned read if -v
        if (!bam.IsMapped()) {
            // Bug fix: only save unmapped reads when BAM output was
            // requested; `writer` is never opened when _bamOutput == false.
            if ((_noHit == true) && (_bamOutput == true))
                writer.SaveAlignment(bam);
            continue;
        }

        // break alignment into discrete blocks,
        bedVector bed_blocks;
        string chrom = refs.at(bam.RefID).RefName;
        GetBamBlocks(bam, chrom, bed_blocks, false, true);
        // create a basic BED entry from the BAM alignment
        BED bed;
        MakeBedFromBam(bam, chrom, bed_blocks, bed);

        bool overlapsFound = false;
        if ((_bamOutput == true) && (_obeySplits == false)) {
            // whole-alignment test is enough to decide whether to keep it
            overlapsFound = _bedB->anyHits(bed.chrom, bed.start, bed.end,
                                           bed.strand, _sameStrand, _diffStrand,
                                           _overlapFraction, _reciprocal);
        }
        else if ( ((_bamOutput == true)  && (_obeySplits == true)) ||
                  ((_bamOutput == false) && (_obeySplits == true)) ) {
            // find the hits that overlap with the full span of the blocked BED
            _bedB->allHits(bed.chrom, bed.start, bed.end, bed.strand,
                           hits, _sameStrand, _diffStrand,
                           _overlapFraction, _reciprocal);
            // find the overlaps between the block in A and B
            overlapsFound = FindBlockedOverlaps(bed, bed_blocks, hits, _bamOutput);
        }
        else if ((_bamOutput == false) && (_obeySplits == false)) {
            // BED output: report the overlaps directly
            FindOverlaps(bed, hits);
        }

        // save the BAM alignment if overlap reqs. were met
        if (_bamOutput == true) {
            if ((overlapsFound == true) && (_noHit == false))
                writer.SaveAlignment(bam);
            else if ((overlapsFound == false) && (_noHit == true))
                writer.SaveAlignment(bam);
        }
        hits.clear();
    }

    // close the relevant BAM files.
    reader.Close();
    if (_bamOutput == true) {
        writer.Close();
    }
}
void TagBam::Tag() { // open the annotations files for processing; OpenAnnoFiles(); // open the BAM file BamReader reader; BamWriter writer; if (!reader.Open(_bamFile)) { cerr << "Failed to open BAM file " << _bamFile << endl; exit(1); } // get header & reference information string bamHeader = reader.GetHeaderText(); RefVector refs = reader.GetReferenceData(); // set compression mode BamWriter::CompressionMode compressionMode = BamWriter::Compressed; // if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed; writer.SetCompressionMode(compressionMode); // open our BAM writer writer.Open("stdout", bamHeader, refs); // rip through the BAM file and test for overlaps with each annotation file. BamAlignment al; vector<BED> hits; while (reader.GetNextAlignment(al)) { if (al.IsMapped() == true) { BED a; a.chrom = refs.at(al.RefID).RefName; a.start = al.Position; a.end = al.GetEndPosition(false, false); a.strand = "+"; if (al.IsReverseStrand()) a.strand = "-"; ostringstream annotations; // annotate the BAM file based on overlaps with the annotation files. for (size_t i = 0; i < _annoFiles.size(); ++i) { // grab the current annotation file. 
BedFile *anno = _annoFiles[i]; if (!_useNames && !_useScores && !_useIntervals) { // add the label for this annotation file to tag if there is overlap if (anno->anyHits(a.chrom, a.start, a.end, a.strand, _sameStrand, _diffStrand, _overlapFraction, false)) { annotations << _annoLabels[i] << ";"; } } // use the score field else if (!_useNames && _useScores && !_useIntervals) { anno->allHits(a.chrom, a.start, a.end, a.strand, hits, _sameStrand, _diffStrand, 0.0, false); for (size_t i = 0; i < hits.size(); ++i) { annotations << hits[i].score; if (i < hits.size() - 1) annotations << ","; } if (hits.size() > 0) annotations << ";"; hits.clear(); } // use the name field from the annotation files to populate tag else if (_useNames && !_useScores && !_useIntervals) { anno->allHits(a.chrom, a.start, a.end, a.strand, hits, _sameStrand, _diffStrand, 0.0, false); for (size_t j = 0; j < hits.size(); ++j) { annotations << hits[j].name; if (j < hits.size() - 1) annotations << ","; } if (hits.size() > 0) annotations << ";"; hits.clear(); } // use the full interval information annotation files to populate tag else if (!_useNames && !_useScores && _useIntervals) { anno->allHits(a.chrom, a.start, a.end, a.strand, hits, _sameStrand, _diffStrand, 0.0, false); for (size_t j = 0; j < hits.size(); ++j) { annotations << _annoLabels[i] << ":" << hits[j].chrom << ":" << hits[j].start << "-" << hits[j].end << "," << hits[j].name << "," << hits[j].score << "," << hits[j].strand; if (j < hits.size() - 1) annotations << ","; } if (hits.size() > 0) annotations << ";"; hits.clear(); } } // were there any overlaps with which to make a tag? if (annotations.str().size() > 0) { al.AddTag(_tag, "Z", annotations.str().substr(0, annotations.str().size() - 1)); // get rid of the last ";" } } writer.SaveAlignment(al); } reader.Close(); writer.Close(); // close the annotations files; CloseAnnoFiles(); }
// this has been copied from bamtools utilities, since it isn't in the API. Original file is bamtools_utilities.cpp. // Like the rest of Bamtools, it is under the BSD license. bool Filter::ParseRegionString(const string& regionString, BamRegion& region) { // ------------------------------- // parse region string // check first for empty string if ( regionString.empty() ) return false; // non-empty string, look for a colom size_t foundFirstColon = regionString.find(':'); // store chrom strings, and numeric positions string chrom; int startPos; int stopPos; // no colon found // going to use entire contents of requested chromosome // just store entire region string as startChrom name // use BamReader methods to check if its valid for current BAM file if ( foundFirstColon == string::npos ) { chrom = regionString; startPos = 0; stopPos = -1; } // colon found, so we at least have some sort of startPos requested else { // store start chrom from beginning to first colon chrom = regionString.substr(0,foundFirstColon); // look for ".." after the colon size_t foundRangeDots = regionString.find("..", foundFirstColon+1); // no dots found // so we have a startPos but no range // store contents before colon as startChrom, after as startPos if ( foundRangeDots == string::npos ) { startPos = atoi( regionString.substr(foundFirstColon+1).c_str() ); stopPos = -1; } // ".." found, so we have some sort of range selected else { // store startPos between first colon and range dots ".." 
startPos = atoi( regionString.substr(foundFirstColon+1, foundRangeDots-foundFirstColon-1).c_str() ); // look for second colon size_t foundSecondColon = regionString.find(':', foundRangeDots+1); // no second colon found // so we have a "standard" chrom:start..stop input format (on single chrom) if ( foundSecondColon == string::npos ) { stopPos = atoi( regionString.substr(foundRangeDots+2).c_str() ); } else { return false; } } } // ------------------------------- // validate reference IDs & genomic positions const RefVector references = getReferences(); int RefID = -1; for(int i = 0; i < references.size(); i++) { if(references[i].RefName == chrom) RefID = i; } // if startRefID not found, return false if ( RefID == -1 ) { cerr << "Can't find chromosome'" << chrom << "'" << endl; return false; } // startPos cannot be greater than or equal to reference length const RefData& startReference = references.at(RefID); if ( startPos >= startReference.RefLength ) { cerr << "Start position (" << startPos << ") after end of the reference sequence (" << startReference.RefLength << ")" << endl; return false; } // stopPosition cannot be larger than reference length const RefData& stopReference = references.at(RefID); if ( stopPos > stopReference.RefLength ) { cerr << "Start position (" << stopPos << ") after end of the reference sequence (" << stopReference.RefLength << ")" << endl; return false; } // if no stopPosition specified, set to reference end if ( stopPos == -1 ) stopPos = stopReference.RefLength; // ------------------------------- // set up Region struct & return region.LeftRefID = RefID; region.LeftPosition = startPos; region.RightRefID = RefID;; region.RightPosition = stopPos; return true; }
// Evaluate one alignment against every property configured in a
// PropertyFilter. The overall result is the AND of all individual property
// checks; we short-circuit and return false as soon as any check fails.
bool check(const PropertyFilter& filter, const BamAlignment& al) {
    bool keepAlignment = true;
    const PropertyMap& properties = filter.Properties;

    // iterate over all properties configured on this filter
    PropertyMap::const_iterator propertyIter = properties.begin();
    PropertyMap::const_iterator propertyEnd = properties.end();
    for ( ; propertyIter != propertyEnd; ++propertyIter ) {

        // check alignment data field depending on propertyName
        const string& propertyName = (*propertyIter).first;
        const PropertyFilterValue& valueFilter = (*propertyIter).second;

        if ( propertyName == ALIGNMENTFLAG_PROPERTY )
            keepAlignment &= valueFilter.check(al.AlignmentFlag);
        else if ( propertyName == CIGAR_PROPERTY ) {
            // rebuild the CIGAR as text (e.g. "50M2I48M") and match on that
            stringstream cigarSs;
            const vector<CigarOp>& cigarData = al.CigarData;
            if ( !cigarData.empty() ) {
                vector<CigarOp>::const_iterator cigarBegin = cigarData.begin();
                vector<CigarOp>::const_iterator cigarIter = cigarBegin;
                vector<CigarOp>::const_iterator cigarEnd = cigarData.end();
                for ( ; cigarIter != cigarEnd; ++cigarIter ) {
                    const CigarOp& op = (*cigarIter);
                    cigarSs << op.Length << op.Type;
                }
                keepAlignment &= valueFilter.check(cigarSs.str());
            }
        }
        else if ( propertyName == INSERTSIZE_PROPERTY )
            keepAlignment &= valueFilter.check(al.InsertSize);
        else if ( propertyName == ISDUPLICATE_PROPERTY )
            keepAlignment &= valueFilter.check(al.IsDuplicate());
        else if ( propertyName == ISFAILEDQC_PROPERTY )
            keepAlignment &= valueFilter.check(al.IsFailedQC());
        else if ( propertyName == ISFIRSTMATE_PROPERTY )
            keepAlignment &= valueFilter.check(al.IsFirstMate());
        else if ( propertyName == ISMAPPED_PROPERTY )
            keepAlignment &= valueFilter.check(al.IsMapped());
        else if ( propertyName == ISMATEMAPPED_PROPERTY )
            keepAlignment &= valueFilter.check(al.IsMateMapped());
        else if ( propertyName == ISMATEREVERSESTRAND_PROPERTY )
            keepAlignment &= valueFilter.check(al.IsMateReverseStrand());
        else if ( propertyName == ISPAIRED_PROPERTY )
            keepAlignment &= valueFilter.check(al.IsPaired());
        else if ( propertyName == ISPRIMARYALIGNMENT_PROPERTY )
            keepAlignment &= valueFilter.check(al.IsPrimaryAlignment());
        else if ( propertyName == ISPROPERPAIR_PROPERTY )
            keepAlignment &= valueFilter.check(al.IsProperPair());
        else if ( propertyName == ISREVERSESTRAND_PROPERTY )
            keepAlignment &= valueFilter.check(al.IsReverseStrand());
        else if ( propertyName == ISSECONDMATE_PROPERTY )
            keepAlignment &= valueFilter.check(al.IsSecondMate());
        else if ( propertyName == ISSINGLETON_PROPERTY ) {
            // a singleton is a mapped, paired read whose mate did not map
            const bool isSingleton = al.IsPaired() && al.IsMapped() && !al.IsMateMapped();
            keepAlignment &= valueFilter.check(isSingleton);
        }
        else if ( propertyName == MAPQUALITY_PROPERTY )
            keepAlignment &= valueFilter.check(al.MapQuality);
        else if ( propertyName == MATEPOSITION_PROPERTY )
            // NOTE(review): this checks al.MateRefID even though the property
            // is named "mate position" — looks like a copy/paste slip
            // (al.MatePosition would be expected); confirm before changing.
            keepAlignment &= ( al.IsPaired() && al.IsMateMapped() && valueFilter.check(al.MateRefID) );
        else if ( propertyName == MATEREFERENCE_PROPERTY ) {
            // the mate reference name only exists for mapped mates
            if ( !al.IsPaired() || !al.IsMateMapped() ) return false;
            BAMTOOLS_ASSERT_MESSAGE( (al.MateRefID>=0 && (al.MateRefID<(int)filterToolReferences.size())), "Invalid MateRefID");
            const string& refName = filterToolReferences.at(al.MateRefID).RefName;
            keepAlignment &= valueFilter.check(refName);
        }
        else if ( propertyName == NAME_PROPERTY )
            keepAlignment &= valueFilter.check(al.Name);
        else if ( propertyName == POSITION_PROPERTY )
            keepAlignment &= valueFilter.check(al.Position);
        else if ( propertyName == QUERYBASES_PROPERTY )
            keepAlignment &= valueFilter.check(al.QueryBases);
        else if ( propertyName == REFERENCE_PROPERTY ) {
            BAMTOOLS_ASSERT_MESSAGE( (al.RefID>=0 && (al.RefID<(int)filterToolReferences.size())), "Invalid RefID");
            const string& refName = filterToolReferences.at(al.RefID).RefName;
            keepAlignment &= valueFilter.check(refName);
        }
        else if ( propertyName == TAG_PROPERTY )
            keepAlignment &= checkAlignmentTag(valueFilter, al);
        else
            BAMTOOLS_ASSERT_UNREACHABLE;

        // if alignment fails at ANY point, just quit and return false
        if ( !keepAlignment ) return false;
    }

    BAMTOOLS_ASSERT_MESSAGE( keepAlignment, "Error in BamAlignmentChecker... keepAlignment should be true here");
    return keepAlignment;
}
// Compute per-base genome coverage from a BAM file.
// Streams every alignment, converts it to one or more covered intervals —
// honoring strand filtering, dUTP strand flipping, paired "ChIP" mode,
// fixed fragment-size extension, 5'/3'-end-only modes, and split
// alignments — then accumulates and reports coverage chromosome by
// chromosome (the input is assumed to be coordinate-sorted so each
// chromosome is finished before the next begins).
void BedGenomeCoverage::CoverageBam(string bamFile) {

    ResetChromCoverage();

    // open the BAM file
    BamReader reader;
    if (!reader.Open(bamFile)) {
        cerr << "Failed to open BAM file " << bamFile << endl;
        exit(1);
    }

    // get header & reference information
    string header = reader.GetHeaderText();
    RefVector refs = reader.GetReferenceData();

    // load the BAM header references into a BEDTools "genome file"
    _genome = new GenomeFile(refs);

    // convert each aligned BAM entry to BED
    // and compute coverage on B
    BamAlignment bam;
    while (reader.GetNextAlignment(bam)) {
        // skip if the read is unaligned
        if (bam.IsMapped() == false)
            continue;

        bool _isReverseStrand = bam.IsReverseStrand();

        //changing second mate's strand to opposite (dUTP stranded protocols)
        if( _dUTP && bam.IsPaired() && bam.IsMateMapped() && bam.IsSecondMate())
            _isReverseStrand = !bam.IsReverseStrand();

        // skip if we care about strands and the strand isn't what
        // the user wanted
        if ( (_filterByStrand == true) &&
             ((_requestedStrand == "-") != _isReverseStrand) )
            continue;

        // extract the chrom, start and end from the BAM alignment
        // (GetEndPosition() is exclusive; -1 makes `end` inclusive)
        string chrom(refs.at(bam.RefID).RefName);
        CHRPOS start = bam.Position;
        CHRPOS end = bam.GetEndPosition(false, false) - 1;

        // are we on a new chromosome?
        if ( chrom != _currChromName )
            StartNewChrom(chrom);

        if(_pair_chip_) {
            // Skip if not a proper pair
            if (bam.IsPaired() && (!bam.IsProperPair() or !bam.IsMateMapped()) )
                continue;
            // Skip if wrong coordinates
            if( ( (bam.Position<bam.MatePosition) && bam.IsReverseStrand() ) ||
                ( (bam.MatePosition < bam.Position) && bam.IsMateReverseStrand() ) ) {
                //chemically designed: left on positive strand, right on reverse one
                continue;
            }
            /* NOTE: a disabled variant that centered a _fragmentSize window on
               the pair midpoint (using _haveSize) used to live here. */
            // cover the full fragment between the two mates, counted once via
            // the first mate only
            if (bam.IsFirstMate() && bam.IsReverseStrand()) { //prolong to the mate to the left
                AddCoverage(bam.MatePosition, end);
            }
            else if (bam.IsFirstMate() && bam.IsMateReverseStrand()) { //prolong to the mate to the right
                AddCoverage(start, start + abs(bam.InsertSize) - 1);
            }
        } else if (_haveSize) {
            // extend each read to a fixed fragment size in its 3' direction
            if(bam.IsReverseStrand()) {
                if(end<_fragmentSize) { //sometimes fragmentSize is bigger :(
                    AddCoverage(0, end);
                } else {
                    AddCoverage(end + 1 - _fragmentSize, end );
                }
            } else {
                AddCoverage(start,start+_fragmentSize - 1);
            }
        } else
        // add coverage accordingly.
        if (!_only_5p_end && !_only_3p_end) {
            bedVector bedBlocks;
            // we always want to split blocks when a D CIGAR op is found.
            // if the user invokes -split, we want to also split on N ops.
            if (_obeySplits) { // "D" true, "N" true
                GetBamBlocks(bam, refs.at(bam.RefID).RefName, bedBlocks, true, true);
            } else { // "D" true, "N" false
                GetBamBlocks(bam, refs.at(bam.RefID).RefName, bedBlocks, true, false);
            }
            AddBlockedCoverage(bedBlocks);
        }
        else if (_only_5p_end) {
            // count only the 5'-most base of the read
            CHRPOS pos = ( !bam.IsReverseStrand() ) ? start : end;
            AddCoverage(pos,pos);
        }
        else if (_only_3p_end) {
            // count only the 3'-most base of the read
            CHRPOS pos = ( bam.IsReverseStrand() ) ? start : end;
            AddCoverage(pos,pos);
        }
    }
    // close the BAM
    reader.Close();

    // process the results of the last chromosome.
    ReportChromCoverage(_currChromCoverage, _currChromSize,
                        _currChromName, _currChromDepthHist);

    // report all empty chromsomes
    PrintEmptyChromosomes();

    // report the overall coverage if asked.
    PrintFinalCoverage();
}
//{{{ SV_SplitRead:: SV_SplitRead(vector< BamAlignment > &block,
// Build split-read evidence from the two halves of a split alignment.
// Orders the halves left/right by (RefID, start), then classifies the event
// as INVERSION (strands differ), DELETION, or DUPLICATION from the strands
// and the relative query positions of the two halves. Throws if the
// configuration matches none of the three.
SV_SplitRead::
SV_SplitRead(const BamAlignment &bam_a,
             const BamAlignment &bam_b,
             const RefVector &refs,
             int _weight,
             int _id,
             int _sample_id,
             SV_SplitReadReader *_reader)
{
    reader = _reader;
    sample_id = _sample_id;

    // evidence quality is the weaker of the two halves' mapping qualities
    if ( bam_a.MapQuality < bam_b.MapQuality )
        min_mapping_quality = bam_a.MapQuality;
    else
        min_mapping_quality = bam_b.MapQuality;

    // where each half sits within the query (read) sequence
    struct cigar_query query_a =
            calc_query_pos_from_cigar(bam_a.CigarData, bam_a.IsReverseStrand() );
    struct cigar_query query_b =
            calc_query_pos_from_cigar(bam_b.CigarData, bam_b.IsReverseStrand() );

    struct interval tmp_a, tmp_b;

    tmp_a.strand = '+';
    if (bam_a.IsReverseStrand())
        tmp_a.strand = '-';
    tmp_a.chr = refs.at(bam_a.RefID).RefName;
    tmp_a.start = bam_a.Position;
    tmp_a.end = bam_a.GetEndPosition();

    tmp_b.strand = '+';
    if (bam_b.IsReverseStrand())
        tmp_b.strand = '-';
    tmp_b.chr = refs.at(bam_b.RefID).RefName;
    tmp_b.start = bam_b.Position;
    tmp_b.end = bam_b.GetEndPosition();

    // order the halves so side_l is the lesser by (RefID, start)
    //if ( ( tmp_a.chr.compare(tmp_b.chr) > 0 ) ||
    //( ( tmp_a.chr.compare(tmp_b.chr) == 0 ) &&
    //( tmp_a.start > tmp_b.start ) ) ) {
    if ( (bam_a.RefID > bam_b.RefID) ||
         ( (bam_a.RefID == bam_b.RefID) &&
           (tmp_a.start > tmp_b.start ) ) ) {
        side_r = tmp_a;
        side_l = tmp_b;
        query_r = query_a;
        query_l = query_b;
    } else {
        side_l = tmp_a;
        side_r = tmp_b;
        query_l = query_a;
        query_r = query_b;
    }

    // classify the event from strand agreement and query-order of the halves
    if (side_l.strand != side_r.strand)
        type = SV_BreakPoint::INVERSION;
    else if ( ( ( side_l.strand == '+' ) &&
                ( side_r.strand == '+' ) &&
                ( query_l.qs_pos < query_r.qs_pos ) ) ||
              ( ( side_l.strand == '-' ) &&
                ( side_r.strand == '-' ) &&
                ( query_l.qs_pos > query_r.qs_pos) ) )
        // reference order matches query order -> missing sequence in between
        type = SV_BreakPoint::DELETION;
    else if ( ( ( side_l.strand == '+' ) &&
                ( side_r.strand == '+' ) &&
                ( query_l.qs_pos > query_r.qs_pos ) ) ||
              ( ( side_l.strand == '-' ) &&
                ( side_r.strand == '-' ) &&
                ( query_l.qs_pos < query_r.qs_pos) ) )
        // reference order is inverted relative to query order -> duplication
        type = SV_BreakPoint::DUPLICATION;
    else {
        // unclassifiable split: report details and abandon this evidence
        cerr << "ERROR IN BAM FILE. " <<
                "TYPE not detected (DELETION,DUPLICATION,INVERSION)" << endl;
        cerr << "\t" << query_l.qs_pos << "," << side_l.strand <<
                "\t" << query_r.qs_pos << "," << side_r.strand <<
                "\t" << tmp_a.chr << "," << tmp_a.start << "," << tmp_a.end <<
                "\t" << tmp_b.chr << "," << tmp_b.start << "," << tmp_b.end <<
                "\t" << endl;
        throw(1);
    }

    weight = _weight;
    id = _id;
}
// ValidateReaders checks that all the readers point to BAM files representing // alignments against the same set of reference sequences, and that the // sequences are identically ordered. If these checks fail the operation of // the multireader is undefined, so we force program exit. bool BamMultiReaderPrivate::ValidateReaders() const { m_errorString.clear(); // skip if 0 or 1 readers opened if (m_readers.empty() || (m_readers.size() == 1)) return true; // retrieve first reader const MergeItem& firstItem = m_readers.front(); const BamReader* firstReader = firstItem.Reader; if (firstReader == 0) return false; // retrieve first reader's header data const SamHeader& firstReaderHeader = firstReader->GetHeader(); const std::string& firstReaderSortOrder = firstReaderHeader.SortOrder; // retrieve first reader's reference data const RefVector& firstReaderRefData = firstReader->GetReferenceData(); const int firstReaderRefCount = firstReader->GetReferenceCount(); const int firstReaderRefSize = firstReaderRefData.size(); // iterate over all readers std::vector<MergeItem>::const_iterator readerIter = m_readers.begin(); std::vector<MergeItem>::const_iterator readerEnd = m_readers.end(); for (; readerIter != readerEnd; ++readerIter) { const MergeItem& item = (*readerIter); BamReader* reader = item.Reader; if (reader == 0) continue; // get current reader's header data const SamHeader& currentReaderHeader = reader->GetHeader(); const std::string& currentReaderSortOrder = currentReaderHeader.SortOrder; // check compatible sort order if (currentReaderSortOrder != firstReaderSortOrder) { const std::string message = std::string("mismatched sort order in ") + reader->GetFilename() + ", expected " + firstReaderSortOrder + ", but found " + currentReaderSortOrder; SetErrorString("BamMultiReader::ValidateReaders", message); return false; } // get current reader's reference data const RefVector currentReaderRefData = reader->GetReferenceData(); const int currentReaderRefCount = 
reader->GetReferenceCount(); const int currentReaderRefSize = currentReaderRefData.size(); // init reference data iterators RefVector::const_iterator firstRefIter = firstReaderRefData.begin(); RefVector::const_iterator firstRefEnd = firstReaderRefData.end(); RefVector::const_iterator currentRefIter = currentReaderRefData.begin(); // compare reference counts from BamReader ( & container size, in case of BR error) if ((currentReaderRefCount != firstReaderRefCount) || (firstReaderRefSize != currentReaderRefSize)) { std::stringstream s; s << "mismatched reference count in " << reader->GetFilename() << ", expected " << firstReaderRefCount << ", but found " << currentReaderRefCount; SetErrorString("BamMultiReader::ValidateReaders", s.str()); return false; } // this will be ok; we just checked above that we have identically-sized sets of references // here we simply check if they are all, in fact, equal in content while (firstRefIter != firstRefEnd) { const RefData& firstRef = (*firstRefIter); const RefData& currentRef = (*currentRefIter); // compare reference name & length if ((firstRef.RefName != currentRef.RefName) || (firstRef.RefLength != currentRef.RefLength)) { std::stringstream s; s << "mismatched references found in" << reader->GetFilename() << "expected: " << std::endl; // print first reader's reference data RefVector::const_iterator refIter = firstReaderRefData.begin(); RefVector::const_iterator refEnd = firstReaderRefData.end(); for (; refIter != refEnd; ++refIter) { const RefData& entry = (*refIter); std::stringstream s; s << entry.RefName << ' ' << std::endl; } s << "but found: " << std::endl; // print current reader's reference data refIter = currentReaderRefData.begin(); refEnd = currentReaderRefData.end(); for (; refIter != refEnd; ++refIter) { const RefData& entry = (*refIter); s << entry.RefName << ' ' << entry.RefLength << std::endl; } SetErrorString("BamMultiReader::ValidateReaders", s.str()); return false; } // update iterators ++firstRefIter; 
++currentRefIter; } } // if we get here, everything checks out return true; }
/*
    Intersect a BAM file against the windows built from the "B" BED file.

    For each mapped alignment a BED entry is synthesized (chrom, start, end,
    name with a "/1" or "/2" mate suffix, mapping quality as score, strand)
    and tested for window overlaps against B.  In BAM-output mode hits (or
    misses, under -v / _noHit) are written back out as BAM; otherwise text
    overlaps are reported via FindWindowOverlaps().

    BUGFIX: unmapped alignments were previously written through 'writer'
    even when _bamOutput was false, i.e. before the writer was ever opened.
    The unmapped branch is now guarded by _bamOutput as well.
*/
void BedWindow::WindowIntersectBam(string bamFile) {

    // load the "B" bed file into a map so
    // that we can easily compare "A" to it for overlaps
    _bedB->loadBedFileIntoMap();

    // open the BAM file
    BamReader reader;
    BamWriter writer;
    reader.Open(bamFile);

    // get header & reference information
    string bamHeader = reader.GetHeaderText();
    RefVector refs = reader.GetReferenceData();

    // open a BAM output to stdout if we are writing BAM
    if (_bamOutput == true) {
        // set compression mode
        BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
        if ( _isUncompressedBam )
            compressionMode = BamWriter::Uncompressed;
        writer.SetCompressionMode(compressionMode);
        // open our BAM writer
        writer.Open("stdout", bamHeader, refs);
    }

    vector<BED> hits;   // vector of potential hits
    hits.reserve(100);  // reserve some space up front

    _bedA->bedType = 6;
    BamAlignment bam;
    bool overlapsFound;

    // get each set of alignments for each pair.
    while (reader.GetNextAlignment(bam)) {

        if (bam.IsMapped()) {
            // construct a new BED entry from the current BAM alignment.
            BED a;
            a.chrom = refs.at(bam.RefID).RefName;
            a.start = bam.Position;
            a.end   = bam.GetEndPosition(false, false);

            // build the name field from the BAM alignment.
            a.name = bam.Name;
            if (bam.IsFirstMate())  a.name += "/1";
            if (bam.IsSecondMate()) a.name += "/2";

            a.score  = ToString(bam.MapQuality);
            a.strand = "+";
            if (bam.IsReverseStrand()) a.strand = "-";

            if (_bamOutput == true) {
                overlapsFound = FindOneOrMoreWindowOverlaps(a);
                if (overlapsFound == true) {
                    if (_noHit == false)
                        writer.SaveAlignment(bam);
                }
                else {
                    if (_noHit == true)
                        writer.SaveAlignment(bam);
                }
            }
            else {
                FindWindowOverlaps(a, hits);
                hits.clear();
            }
        }
        // unmapped: only meaningful for BAM output with -v semantics.
        // (BUGFIX: guard with _bamOutput so an unopened writer is never used)
        else if (_bamOutput == true && _noHit == true) {
            writer.SaveAlignment(bam);
        }
    }

    // close the relevant BAM files.
    reader.Close();
    if (_bamOutput == true) {
        writer.Close();
    }
}
int main(int argc, char* argv[]) { //{{{ setup double trim_threshold = 1e-10; double merge_threshold = 1e-10; int min_weight = 0; int min_sample_weight = 0; bool show_evidence = false; bool has_pe_bams = false, has_sr_bams = false, has_bedpes = false; CHR_POS window_size = 1000000; string inter_chrom_file_prefix = "./"; int call_id = 0; bool has_next = false; vector<SV_EvidenceReader*>::iterator i_er; UCSCBins<SV_BreakPoint*> r_bin; srand (time(NULL)); string exclude_bed_file; bool has_exclude = false; string genome_file; bool has_genome_file = false; int print_prob = 0; int bedpe_output = 0; //vector<string> bam_files; //}}} //{{{ check to see if we should print out some help if (argc == 1) ShowHelp(); for(int i = 1; i < argc; i++) { int parameterLength = (int)strlen(argv[i]); if( (PARAMETER_CHECK("-h", 2, parameterLength)) || (PARAMETER_CHECK("--help", 6, parameterLength))) { ShowHelp(); } } //}}} //{{{ do some parsing and setup vector<SV_EvidenceReader*> evidence_readers; map<pair<string,string>, SV_EvidenceReader*> bam_evidence_readers; for(int i = 1; i < argc; i++) { int parameterLength = (int)strlen(argv[i]); if(PARAMETER_CHECK("-pe", 3, parameterLength)) { //{{{ has_pe_bams = true; SV_PairReader *pe_r = new SV_PairReader(); if ((i+1) < argc) { char *params = argv[i + 1]; char *param_val, *brka, *brkb; for ( param_val = strtok_r(params, ",", &brka); param_val; param_val = strtok_r(NULL, ",", &brka)) { char *param = strtok_r(param_val, ":", &brkb); char *val = strtok_r(NULL, ":", &brkb); if (val == NULL) { cerr << "Parameter required for " << param << endl; ShowHelp(); } if ( ! 
pe_r->add_param(param, val) ) { cerr << "Unknown pair end parameter:" << param << endl; ShowHelp(); } } } string msg = pe_r->check_params(); if ( msg.compare("") == 0 ) { // Add to list of readers // Set the distro map pe_r->initialize(); SV_Evidence::distros[pe_r->ev_id] = pair<log_space*,log_space*>( SV_Pair::get_bp_interval_probability('+', pe_r->distro_size, pe_r->distro), SV_Pair::get_bp_interval_probability('-', pe_r->distro_size, pe_r->distro)); SV_Evidence::distros_size[pe_r->ev_id] = pe_r->distro_size; } else { cerr << "missing pair end parameters:" << msg << endl; ShowHelp(); } // create SV_EvidenceReaders by (BAM, read_group) pairs if (pe_r->read_group.size() == 0) bam_evidence_readers[pair<string,string> (pe_r->get_source_file_name(),"")] = pe_r; else { for (vector<string>::iterator it = pe_r->read_group.begin(); it != pe_r->read_group.end(); ++it) { pair<string,string> ev_pair (pe_r->get_source_file_name(),*it); bam_evidence_readers[ev_pair] = pe_r; } } i++; //}}} } else if(PARAMETER_CHECK("-bedpe", 6, parameterLength)) { //{{{ has_bedpes = true; SV_BedpeReader *be_r = new SV_BedpeReader(); if ((i+1) < argc) { char *params = argv[i + 1]; char *param_val, *brka, *brkb; for ( param_val = strtok_r(params, ",", &brka); param_val; param_val = strtok_r(NULL, ",", &brka)) { char *param = strtok_r(param_val, ":", &brkb); char *val = strtok_r(NULL, ":", &brkb); if (val == NULL) { cerr << "Parameter requied for " << param << endl; ShowHelp(); } if ( ! 
be_r->add_param(param, val) ) { cerr << "Unknown bedpe parameter:" << param << endl; ShowHelp(); } } } string msg = be_r->check_params(); if ( msg.compare("") == 0 ) { be_r->initialize(); SV_Evidence::distros[be_r->ev_id] = pair<log_space*,log_space*>( SV_Bedpe:: get_bp_interval_probability('+', be_r->distro_size, be_r->distro), SV_Bedpe:: get_bp_interval_probability('-', be_r->distro_size, be_r->distro)); SV_Evidence::distros_size[be_r->ev_id] = be_r->distro_size; evidence_readers.push_back(be_r); } else { cerr << "missing bedpe parameters:" << msg << endl; ShowHelp(); } i++; //}}} } else if(PARAMETER_CHECK("-sr", 3, parameterLength)) { //{{{ has_sr_bams = true; SV_SplitReadReader *sr_r = new SV_SplitReadReader(); if ((i+1) < argc) { char *params = argv[i + 1]; char *param_val, *brka, *brkb; for ( param_val = strtok_r(params, ",", &brka); param_val; param_val = strtok_r(NULL, ",", &brka)) { char *param = strtok_r(param_val, ":", &brkb); char *val = strtok_r(NULL, ":", &brkb); if (val == NULL) { cerr << "Parameter required for " << param << endl; ShowHelp(); } if ( ! 
sr_r->add_param(param, val) ) { cerr << "Unknown split read parameter:" << param << endl; ShowHelp(); } } } string msg = sr_r->check_params(); if ( msg.compare("") == 0 ) { sr_r->initialize(); SV_Evidence::distros[sr_r->ev_id] = pair<log_space*,log_space*>( SV_SplitRead:: get_bp_interval_probability('+', sr_r->back_distance), SV_SplitRead:: get_bp_interval_probability('-', sr_r->back_distance)); SV_Evidence::distros_size[sr_r->ev_id] = sr_r->back_distance * 2 + 1; } else { cerr << "missing split read parameters:" << msg << endl; ShowHelp(); } // create SV_EvidenceReaders by (BAM, read_group) pairs if (sr_r->read_group.size() == 0) bam_evidence_readers[pair<string,string> (sr_r->get_source_file_name(),"")] = sr_r; else { for (vector<string>::iterator it = sr_r->read_group.begin(); it != sr_r->read_group.end(); ++it) { pair<string,string> ev_pair (sr_r->get_source_file_name(),*it); bam_evidence_readers[ev_pair] = sr_r; } } i++; //}}} } else if(PARAMETER_CHECK("-tt", 3, parameterLength)) { if ((i+1) < argc) { trim_threshold = 1 - atof(argv[i + 1]); i++; } } else if(PARAMETER_CHECK("-mt", 3, parameterLength)) { if ((i+1) < argc) { merge_threshold = atof(argv[i + 1]); i++; } } else if(PARAMETER_CHECK("-mw", 3, parameterLength)) { if ((i+1) < argc) { min_weight = atoi(argv[i + 1]); i++; } } else if(PARAMETER_CHECK("-msw", 4, parameterLength)) { if ((i+1) < argc) { min_sample_weight = atoi(argv[i + 1]); i++; } } else if(PARAMETER_CHECK("-w", 2, parameterLength)) { if ((i+1) < argc) { window_size = atoi(argv[i + 1]); i++; } } else if(PARAMETER_CHECK("-x", 2, parameterLength)) { if ((i+1) < argc) { exclude_bed_file = argv[i + 1]; has_exclude = true; i++; } } else if(PARAMETER_CHECK("-g", 2, parameterLength)) { if ((i+1) < argc) { genome_file = argv[i + 1]; has_genome_file = true; i++; } } else if(PARAMETER_CHECK("-t", 2, parameterLength)) { if ((i+1) < argc) { inter_chrom_file_prefix = argv[i + 1]; i++; } } else if(PARAMETER_CHECK("-e", 2, parameterLength)) { show_evidence 
= true; } else if(PARAMETER_CHECK("-P", 2, parameterLength)) { print_prob = 1; } else if(PARAMETER_CHECK("-b", 2, parameterLength)) { bedpe_output = 1; } else { cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; ShowHelp(); } } if (min_weight == 0 && min_sample_weight == 0) { cerr << endl << "*****ERROR: must set min weight or min sample weight *****" << endl << endl; ShowHelp(); } SV_BreakPoint::p_trim_threshold = trim_threshold; SV_BreakPoint::p_merge_threshold = merge_threshold; // append rand number to the temp inter-chrom file inter_chrom_file_prefix = inter_chrom_file_prefix + ToString(rand()); if (has_exclude) parse_exclude_file(exclude_bed_file, SV_Evidence::exclude_regions); SV_BamReader *bam_r; if (has_pe_bams || has_sr_bams) { bam_r = new SV_BamReader(&bam_evidence_readers); bam_r->set_inter_chrom_file_name(inter_chrom_file_prefix + ".bam"); bam_r->initialize(); evidence_readers.push_back(bam_r); } map<string, int> genome_order; if (has_genome_file) { GenomeFile *genome; genome = new GenomeFile(genome_file); vector<string> chroms = genome->getChromList(); vector<string>::iterator chr_itr; int chr_count = 0; for (chr_itr = chroms.begin(); chr_itr != chroms.end(); ++chr_itr) { genome_order[*chr_itr] = chr_count; chr_count += 1; } } else if (has_pe_bams || has_sr_bams) { //map<string, SV_EvidenceReader*> bam_evidence_readers; //map<string, SV_EvidenceReader*>::iterator bam_itr; //bam_itr = bam_evidence_readers.begin(); RefVector refs = bam_r->refs; vector<RefData>::iterator ref_itr; int chr_count = 0; for (ref_itr = refs.begin(); ref_itr != refs.end(); ++ref_itr) { RefData r = *ref_itr; genome_order[r.RefName] = chr_count; chr_count += 1; } } else { cerr << endl << "*****ERROR: Unknown chromosome order. 
" << "Chromosome order must be\nspecified by either bam header " << "or a genome file *****" << endl << endl; ShowHelp(); } //}}} end parsing //{{{ Test if there lines to process in each input file for ( i_er = evidence_readers.begin(); i_er != evidence_readers.end(); ++i_er) { SV_EvidenceReader *er = *i_er; has_next = has_next || er->has_next(); } //}}} // print VCF header if (! bedpe_output) { SV_VcfVariant *header_var = new SV_VcfVariant(); map<int,string>::iterator s_itr; for (s_itr = SV_EvidenceReader::sample_names.begin(); s_itr != SV_EvidenceReader::sample_names.end(); ++s_itr) header_var->add_sample(s_itr->second); // add appropriate fields to active_formats header_var->set_sample_field(SV_EvidenceReader:: sample_names.begin()->second, "GT", "./."); header_var->set_sample_field(SV_EvidenceReader:: sample_names.begin()->second, "SU", "0"); if (has_pe_bams) header_var->set_sample_field(SV_EvidenceReader:: sample_names.begin()->second, "PE", "0"); if (has_sr_bams) header_var->set_sample_field(SV_EvidenceReader:: sample_names.begin()->second, "SR", "0"); if (has_bedpes) header_var->set_sample_field(SV_EvidenceReader:: sample_names.begin()->second, "BD", "0"); header_var->print_header(); delete(header_var); } //{{{ process the intra-chrom events that were saved to a file CHR_POS max_pos = 0; string last_min_chr = ""; while ( has_next ) { string min_chr = ""; //{{{ find min_chr among all input files for ( i_er = evidence_readers.begin(); i_er != evidence_readers.end(); ++i_er) { SV_EvidenceReader *er = *i_er; if ( er->has_next() ) { string curr_chr = er->get_curr_chr(); if ( ( min_chr.compare("") == 0 ) || ( genome_order[curr_chr] < genome_order[min_chr] ) ) { min_chr = curr_chr; } } } //}}} //{{{ if the chrome switches, reset the max_pos if (last_min_chr.compare(min_chr) != 0) { max_pos = window_size; last_min_chr = min_chr; } //}}} cerr << min_chr << "\t" << max_pos << endl; bool input_processed = true; while (input_processed) { input_processed = false; //{{{ 
read the files for ( i_er = evidence_readers.begin(); i_er != evidence_readers.end(); ++i_er) { SV_EvidenceReader *er = *i_er; if ( er->has_next() ) { string curr_chr = er->get_curr_chr(); CHR_POS curr_pos = er->get_curr_pos(); if ( ( genome_order[curr_chr] <= genome_order[min_chr] ) && ( curr_pos < max_pos) ) { er->process_input_chr_pos(curr_chr, max_pos, r_bin); input_processed = true; } } } //}}} //{{{ call breakpoints vector< UCSCElement<SV_BreakPoint*> > values = r_bin.values(min_chr, max_pos); vector< UCSCElement<SV_BreakPoint*> >::iterator it; for (it = values.begin(); it < values.end(); ++it) { SV_BreakPoint *bp = it->value; // Make sure both ends of the bp are less than or equal to the // current chrom if (bp->weight >= min_weight && bp->get_max_sample_weight() >= min_sample_weight) { //bp->do_it(); // make sure there was not an error with trimming if (bp->trim_intervals() > 0) { if (bedpe_output) { bp->print_bedpe(++call_id, print_prob); if (show_evidence) bp->print_evidence("\t"); } else { SV_VcfVariant *vcf_var = new SV_VcfVariant(bp, ++call_id, print_prob); vcf_var->print_var(); delete(vcf_var); } } } if (r_bin.remove(*it, false, false, true) != 0) { cerr << "Error removing element:" << *bp << endl; abort(); } bp->free_evidence(); delete bp; } //}}} // move the window max_pos = max_pos *2; } //{{{ Test if there is still input lines has_next = false; for ( i_er = evidence_readers.begin(); i_er != evidence_readers.end(); ++i_er) { SV_EvidenceReader *er = *i_er; has_next = has_next || er->has_next(); } //}}} } //}}} //{{{ terminate input files for ( i_er = evidence_readers.begin(); i_er != evidence_readers.end(); ++i_er) { SV_EvidenceReader *er = *i_er; er->terminate(); } //}}} //{{{ Call remaining intra breakpoints vector< UCSCElement<SV_BreakPoint*> > values = r_bin.values(); vector< UCSCElement<SV_BreakPoint*> >::iterator it; for ( it = values.begin(); it != values.end(); ++it) { SV_BreakPoint *bp = it->value; if (bp->weight >= min_weight && 
bp->get_max_sample_weight() >= min_sample_weight) { //bp->do_it(); if (bp->trim_intervals() > 0) { if (bedpe_output) { bp->print_bedpe(++call_id, print_prob); if (show_evidence) bp->print_evidence("\t"); } else { SV_VcfVariant *vcf_var = new SV_VcfVariant(bp, ++call_id, print_prob); vcf_var->print_var(); } } } if (r_bin.remove(*it, false, false, true) != 0) { cerr << "Error removing element" << endl; abort(); } bp->free_evidence(); delete bp; } //}}} //{{{ process the inter-chrom events that were saved to a file string intra_bam_file_name = inter_chrom_file_prefix + ".bam"; ifstream intra_bam_file( intra_bam_file_name.c_str() ); if (intra_bam_file.good()) { intra_bam_file.close(); sort_inter_chrom_bam( inter_chrom_file_prefix + ".bam", inter_chrom_file_prefix + ".sort.bam"); SV_InterChromBamReader *ic_r = new SV_InterChromBamReader( inter_chrom_file_prefix + ".sort.bam", &bam_evidence_readers); ic_r->initialize(); vector<SV_EvidenceReader*> inter_chrom_evidence_readers; inter_chrom_evidence_readers.push_back(ic_r); // There are two files containg all of the inter-chrom events, one bam // and one bedpe, each line in the file corresponds to the properies // set in one of the readers. Each line has a "LS" (lumpy source) // property that gives its source file name. Using that entry, the // line will be sent to the reader for processing. 
// get new evidence readers for both bedpe and bam inter-chrom // readers has_next = true; int32_t last_min_primary_refid = -1; int32_t last_min_secondary_refid = -1; max_pos = 0; while ( has_next ) { string min_primary_chr = ""; string min_secondary_chr = ""; int32_t min_primary_refid = -1; int32_t min_secondary_refid = -1; //{{{ find min_chr pair among all input files for ( i_er = inter_chrom_evidence_readers.begin(); i_er != inter_chrom_evidence_readers.end(); ++i_er) { SV_EvidenceReader *er = *i_er; if ( er->has_next() ) { int32_t curr_primary_refid = er->get_curr_primary_refid(); int32_t curr_secondary_refid = er->get_curr_secondary_refid(); if ( (( min_primary_refid == -1 ) && ( min_secondary_refid == -1 )) || (( curr_primary_refid < min_primary_refid) && ( curr_secondary_refid < min_secondary_refid)) ) { min_primary_refid = curr_primary_refid; min_secondary_refid = curr_secondary_refid; min_secondary_chr = er->get_curr_secondary_chr(); min_primary_chr = er->get_curr_primary_chr(); } } } //}}} // if the chrome pair switches, reset the max_pos if ( (last_min_primary_refid != min_primary_refid) || (last_min_secondary_refid != min_secondary_refid) ) { max_pos = window_size; last_min_primary_refid = min_primary_refid; last_min_secondary_refid = min_secondary_refid; } bool input_processed = true; while (input_processed) { input_processed = false; //{{{ read the files, process anything in frame for ( i_er = inter_chrom_evidence_readers.begin(); i_er != inter_chrom_evidence_readers.end(); ++i_er) { SV_EvidenceReader *er = *i_er; if ( er->has_next() ) { int32_t curr_primary_refid = er->get_curr_primary_refid(); int32_t curr_secondary_refid = er->get_curr_secondary_refid(); CHR_POS curr_pos = er->get_curr_primary_pos(); if ( (curr_primary_refid <= min_primary_refid) && (curr_secondary_refid <= min_secondary_refid) && (curr_pos < max_pos) ) { er->process_input_chr_pos( er->get_curr_primary_chr(), er->get_curr_secondary_chr(), max_pos, r_bin); input_processed = true; } 
} } //}}} //{{{ get breakpoints // get anything that has ends in both chroms vector< UCSCElement<SV_BreakPoint*> > values = r_bin.values(min_secondary_chr); vector< UCSCElement<SV_BreakPoint*> >::iterator it; for (it = values.begin(); it < values.end(); ++it) { SV_BreakPoint *bp = it->value; if (bp->weight >= min_weight && bp->get_max_sample_weight() >= min_sample_weight) { //bp->do_it(); if (bp->trim_intervals() > 0) { if (bedpe_output) { bp->print_bedpe(++call_id, print_prob); if (show_evidence) bp->print_evidence("\t"); } else { SV_VcfVariant *vcf_var = new SV_VcfVariant(bp, ++call_id, print_prob); vcf_var->print_var(); } } } if (r_bin.remove(*it, false, false, true) != 0) { cerr << "Error removing element" << endl; abort(); } bp->free_evidence(); delete bp; } //}}} max_pos = max_pos * 2; } has_next = false; //{{{ Test if there is still input lines for ( i_er = inter_chrom_evidence_readers.begin(); i_er != inter_chrom_evidence_readers.end(); ++i_er) { SV_EvidenceReader *er = *i_er; has_next = has_next || er->has_next(); } //}}} } //{{{ Call remaining break points values = r_bin.values(); for (it = values.begin(); it != values.end(); ++it) { SV_BreakPoint *bp = it->value; if (bp->weight >= min_weight && bp->get_max_sample_weight() >= min_sample_weight) { //bp->do_it(); if (bp->trim_intervals() > 0) { if (bedpe_output) { bp->print_bedpe(++call_id, print_prob); if (show_evidence) bp->print_evidence("\t"); } else { SV_VcfVariant *vcf_var = new SV_VcfVariant(bp, ++call_id, print_prob); vcf_var->print_var(); } } } if (r_bin.remove(*it, false, false, true) != 0) { cerr << "Error removing element" << endl; abort(); } bp->free_evidence(); delete bp; } //}}} for ( i_er = inter_chrom_evidence_readers.begin(); i_er != inter_chrom_evidence_readers.end(); ++i_er) { SV_EvidenceReader *er = *i_er; delete(er); } } //}}} //{{{ free up stuff string s = inter_chrom_file_prefix + ".bam"; remove(s.c_str()); s = inter_chrom_file_prefix + ".sort.bam"; remove(s.c_str()); map<int, 
pair<log_space*,log_space*> >::iterator e_it; for( e_it = SV_Evidence::distros.begin(); e_it != SV_Evidence::distros.end(); ++e_it) { free(e_it->second.first); free(e_it->second.second); } #if 0 for ( i_er = evidence_readers.begin(); i_er != evidence_readers.end(); ++i_er) { SV_EvidenceReader *er = *i_er; delete(er); } #endif evidence_readers.clear(); bam_evidence_readers.clear(); //}}} return 0; }
// Same as ParseRegionString() above, but accepts a BamMultiReader bool ParseRegionString(const string& regionString, const BamReader& reader, BamRegion& region) { // ------------------------------- // parse region string // check first for empty string if ( regionString.empty() ) return false; //cerr << "ParseRegionString Input: " << regionString << endl; // non-empty string, look for a colom size_t foundFirstColon = regionString.find(':'); // store chrom strings, and numeric positions string startChrom; string stopChrom; int startPos; int stopPos; // no colon found // going to use entire contents of requested chromosome // just store entire region string as startChrom name // use BamReader methods to check if its valid for current BAM file if ( foundFirstColon == string::npos ) { startChrom = regionString; startPos = 0; stopChrom = regionString; stopPos = -1; } // colon found, so we at least have some sort of startPos requested else { // store start chrom from beginning to first colon startChrom = regionString.substr(0,foundFirstColon); // look for ".." after the colon size_t foundRangeDots = regionString.find("..", foundFirstColon+1); // no dots found // so we have a startPos but no range // store contents before colon as startChrom, after as startPos if ( foundRangeDots == string::npos ) { startPos = atoi( regionString.substr(foundFirstColon+1).c_str() ); stopChrom = startChrom; stopPos = -1; } // ".." found, so we have some sort of range selected else { // store startPos between first colon and range dots ".." 
startPos = atoi( regionString.substr(foundFirstColon+1, foundRangeDots-foundFirstColon-1).c_str() ); // look for second colon size_t foundSecondColon = regionString.find(':', foundRangeDots+1); // no second colon found // so we have a "standard" chrom:start..stop input format (on single chrom) if ( foundSecondColon == string::npos ) { stopChrom = startChrom; stopPos = atoi( regionString.substr(foundRangeDots+2).c_str() ); } // second colon found // so we have a range requested across 2 chrom's else { stopChrom = regionString.substr(foundRangeDots+2, foundSecondColon-(foundRangeDots+2)); stopPos = atoi( regionString.substr(foundSecondColon+1).c_str() ); } } } // ------------------------------- // validate reference IDs & genomic positions const RefVector references = reader.GetReferenceData(); // if startRefID not found, return false int startRefID = reader.GetReferenceID(startChrom); if ( startRefID == -1 ) return false; // startPos cannot be greater than or equal to reference length const RefData& startReference = references.at(startRefID); if ( startPos >= startReference.RefLength ) return false; // if stopRefID not found, return false int stopRefID = reader.GetReferenceID(stopChrom); if ( stopRefID == -1 ) return false; // stopPosition cannot be larger than reference length const RefData& stopReference = references.at(stopRefID); if ( stopPos > stopReference.RefLength ) return false; // if no stopPosition specified, set to reference end if ( stopPos == -1 ) stopPos = stopReference.RefLength; // ------------------------------- // set up Region struct & return region.LeftRefID = startRefID; region.LeftPosition = startPos; region.RightRefID = stopRefID;; region.RightPosition = stopPos; //cerr << "ParseRegionString " << region.LeftRefID << " " << region.LeftPosition << " " << region.RightPosition << endl; return true; }
int main ( int argc, char *argv[] ) { struct parameters *param = 0; param = interface(param, argc, argv); //region file input (the region file should be sorted as the same way as the bam file) ifstream region_f; region_f.open(param->region_f, ios_base::in); // the region file is opened //bam input and generate index if not yet //-------------------------------------------------------------------------------------------------------+ // BAM input (file or filenames?) | //-------------------------------------------------------------------------------------------------------+ char *fof = param->mapping_f; FILE *IN=NULL; char linefof[5000]; int filecount=0; vector <string> fnames; if (strchr(fof,' ')!=NULL) { char *ptr; ptr=strtok(fof," "); while (ptr!=NULL) { fnames.push_back(ptr); filecount++; ptr=strtok(NULL," "); } } else { IN=fopen(fof,"rt"); if (IN!=NULL) { long linecount=0; while (fgets(linefof,5000-1,IN)!=NULL) { linecount++; if (linefof[0]!='#' && linefof[0]!='\n') { char *ptr=strchr(linefof,'\n'); if (ptr!=NULL && ptr[0]=='\n') { ptr[0]='\0'; } FILE *dummy=NULL; dummy=fopen(linefof,"rt"); if (dummy!=NULL) { // seems to be a file of filenames... 
fclose(dummy); fnames.push_back(linefof); filecount++; } else if (filecount==0 || linecount>=1000-1) { // seems to be a single file fnames.push_back(fof); filecount++; break; } } } fclose(IN); } } //file or file name decided and stored in vector "fnames" cerr << "the input mapping files are:" << endl; vector <string>::iterator fit = fnames.begin(); for(; fit != fnames.end(); fit++) { cerr << *fit << endl; } //-------------------------------------------------------------------------------------------------------+ // end of file or filenames | //-------------------------------------------------------------------------------------------------------+ // open the BAM file(s) BamMultiReader reader; reader.Open(fnames); // get header & reference information string header = reader.GetHeaderText(); RefVector refs = reader.GetReferenceData(); if ( ! reader.LocateIndexes() ) // opens any existing index files that match our BAM files reader.CreateIndexes(); // creates index files for BAM files that still lack one // locus bias struct lb empty_profile = {0,0,0,0}; vector <struct lb> locus_b(1000, empty_profile); // output locus bias file string locus_bias_set = param->lbias; ofstream locus_bias; if ( locus_bias_set != "" ) { locus_bias.open(param->lbias); if ( !locus_bias ) { cerr << "can not open locus_bias file.\n"; exit(0); } } //should decide which chromosome string line; string old_chr = "SRP"; string type = param->type; //whether do some position-level pile-up stuff bool posc = false; ofstream posc_f; ofstream chrmap_f; string poscset = param->posc; if ( poscset != "" ) { posc = true; posc_f.open(param->posc); chrmap_f.open(param->chrmap); } bool noChr; if ( param->nochr == 1 ){ noChr = true; } else { noChr = false; } //regions for the input of region file deque <struct region> regions; getline(region_f, line); //get the first line eatline(line,regions,noChr); deque <struct region>::iterator it = regions.begin(); while ( it->chr != old_chr ) { old_chr = it->chr; // set 
the current chr as old chr int chr_id = reader.GetReferenceID(it->chr); if ( chr_id == -1 ) { //reference not found for (; it != regions.end() && it->chr == old_chr; ) { gene_processing(*it,locus_b); // print the old region info it = regions.erase(it); // erase the current region } while ( regions.empty() ) { getline(region_f, line); if ( region_f.eof() ){ cerr << "finished: end of region file, zone 0" << endl; break; } eatline(line, regions,noChr); it = regions.begin(); if (it->chr == old_chr){ gene_processing(*it,locus_b); regions.clear(); continue; } } continue; } int chr_len = refs.at(chr_id).RefLength; if ( !reader.SetRegion(chr_id, 1, chr_id, chr_len) ) // here set region { cerr << "bamtools count ERROR: Jump region failed " << it->chr << endl; reader.Close(); exit(1); } //pile-up pos stats set <string> fragment; map <string, unsigned int> pileup; bool isposPileup = false; unsigned int old_start = 0; unsigned int total_tags = 0; unsigned int total_pos = 0; unsigned int pileup_pos = 0; BamAlignment bam; while (reader.GetNextAlignment(bam)) { if ( bam.IsMapped() == false ) continue; // skip unaligned reads unsigned int unique; bam.GetTag("NH", unique); if (param->unique == 1) { if (unique != 1) { // skipe uniquelly mapped reads continue; } } if (read_length == 0){ read_length = bam.Length; } //cout << bam.Name << endl; string chrom = refs.at(bam.RefID).RefName; string strand = "+"; if (bam.IsReverseStrand()) strand = "-"; unsigned int alignmentStart = bam.Position+1; unsigned int mateStart; if (type == "p") mateStart = bam.MatePosition+1; unsigned int alignmentEnd = bam.GetEndPosition(); unsigned int cigarEnd; vector <int> blockLengths; vector <int> blockStarts; blockStarts.push_back(0); ParseCigar(bam.CigarData, blockStarts, blockLengths, cigarEnd); // position check for unique mapped reads (because is paired-end reads, shoule base on fragment level for paired end reads) if (posc == true && unique == 1) { if (type == "p" && fragment.count(bam.Name) > 0) 
fragment.erase(bam.Name); else { total_tags++; if (type == "p"){ fragment.insert(bam.Name); } string alignSum; if (type == "p") { alignSum = int2str(alignmentStart) + "\t" + int2str(mateStart) + "\t.\t" + strand; } else { alignSum = int2str(alignmentStart) + "\t" + int2str(alignmentEnd) + "\t.\t" + strand; } if ( alignmentStart != old_start ) { isposPileup = false; map <string, unsigned int>::iterator pit = pileup.begin(); for (; pit != pileup.end(); pit++) { posc_f << chrom << "\truping\tpileup\t" << pit->first << "\t.\t" << "Pileup=" << pit->second << endl; //print pileup } pileup.clear(); //clear pileup set pileup.insert( pair <string, unsigned int> (alignSum, 1) ); //insert the new read total_pos++; } else if ( alignmentStart == old_start ) { // same starts if ( pileup.count(alignSum) > 0 ) { // pileup if ( pileup[alignSum] == 1 && isposPileup == false ) { pileup_pos++; isposPileup = true; } pileup[alignSum]++; } else { pileup.insert( pair <string, unsigned int> (alignSum, 1) ); } } //same starts } //new fragment old_start = alignmentStart; } // do pos check float incre = 1.; if (blockStarts.size() > 1) incre = 0.5; // incre half for junction reads incre /= static_cast < float >(unique); // for multi aligned reads deque <struct region>::iterator iter = regions.begin(); if ( iter->start > alignmentEnd ) continue; // skip reads not overlapping with the first region while ( iter->chr == old_chr && iter->start <= alignmentEnd && iter != regions.end() ) { if (iter->end < alignmentStart) { // the region end is beyond the alignmentStart gene_processing(*iter,locus_b); // processing iter = regions.erase(iter); // this region should be removed if ( regions.empty() ) { getline(region_f, line); // get a line of region file if ( ! 
region_f.eof() ) { eatline(line, regions, noChr); // eat a line and put it into the duque iter = regions.begin(); } else { // it's reaching the end of the region file cerr << "finished: end of region file, zone 3" << endl; break; } } continue; } if (iter->end >= alignmentStart && iter->start <= alignmentEnd) { //overlapping, should take action vector <int>::iterator cigit = blockStarts.begin(); for (; cigit != blockStarts.end(); cigit++) { unsigned int current_start = *cigit + alignmentStart; int current_pos = current_start - (iter->start); //cout << iter->chr << "\t" << iter->start << "\t" << iter->end << "\t" << current_start << endl; if ( (iter->tags).count(current_pos) > 0 ) { (iter->tags)[current_pos] += incre; } else (iter->tags).insert( pair<int, float>(current_pos, incre) ); } } // overlapping take action! if ( (iter+1) != regions.end() ) iter++; // if this region is not the last element in the deque else { // the last element getline(region_f, line); // get a line of region file if ( ! 
region_f.eof() ){ eatline(line, regions, noChr); // eat a line and put it into the duque iter = regions.end(); iter--; } else { //it's reaching the end of the region file cerr << "finished: end of region file, zone 4" << endl; break; } } } //while } // read a bam // print chr map if (posc == true) { chrmap_f << old_chr << "\t" << total_tags << "\t" << total_pos << "\t" << pileup_pos << endl; } //somehow to loop back it = regions.begin(); //reset to begin for (; it != regions.end() && it->chr == old_chr; ) { gene_processing(*it,locus_b); // print the old region info it = regions.erase(it); // erase the current region } while ( regions.empty() ) { getline(region_f, line); if ( region_f.eof() ){ cerr << "finished: end of region file, zone 5" << endl; //print locus bias for (unsigned int l = 0; l < 1000; l++){ locus_bias << l << "\t" << locus_b[l].ps << "\t" << locus_b[l].hs << "\t" << locus_b[l].pe << "\t" << locus_b[l].he << endl; } exit(0); } eatline(line, regions, noChr); it = regions.begin(); if (it->chr == old_chr){ gene_processing(*it, locus_b); regions.clear(); continue; } } } // region chr != old chr regions.clear(); reader.Close(); region_f.close(); return 0; } //main
/*
    IntersectBam

    Streams every alignment in bamFile, converts it to a BED interval
    ("A"), and intersects it against the in-memory "B" map.

    Output modes:
      - _bamOutput == true : write the original alignments (BAM to stdout),
        keeping those with >= 1 overlap (or those with none, if _noHit).
      - _bamOutput == false: report the BED-formatted overlaps themselves
        via FindOverlaps().

    When _obeySplits is set, the alignment's CIGAR is decomposed into
    discrete blocks and each block is intersected independently.
*/
void BedIntersect::IntersectBam(string bamFile) {

    // load the "B" bed file into a map so
    // that we can easily compare "A" to it for overlaps
    _bedB->loadBedFileIntoMap();

    // open the BAM file
    BamReader reader;
    BamWriter writer;
    reader.Open(bamFile);

    // get header & reference information
    string header = reader.GetHeaderText();
    RefVector refs = reader.GetReferenceData();

    // open a BAM output to stdout if we are writing BAM
    if (_bamOutput == true) {
        // open our BAM writer
        writer.Open("stdout", header, refs, _isUncompressedBam);
    }

    vector<BED> hits;
    // reserve some space
    hits.reserve(100);

    _bedA->bedType = 6;
    BamAlignment bam;
    // get each set of alignments for each pair.
    while (reader.GetNextAlignment(bam)) {

        // unmapped reads have no coordinates to intersect
        if (bam.IsMapped()) {
            BED a;
            a.chrom = refs.at(bam.RefID).RefName;
            a.start = bam.Position;
            a.end   = bam.GetEndPosition(false);

            // build the name field from the BAM alignment.
            a.name = bam.Name;
            if (bam.IsFirstMate()) a.name += "/1";
            if (bam.IsSecondMate()) a.name += "/2";

            a.score = ToString(bam.MapQuality);

            a.strand = "+";
            if (bam.IsReverseStrand()) a.strand = "-";

            if (_bamOutput == true) {
                bool overlapsFound = false;
                // treat the BAM alignment as a single "block"
                if (_obeySplits == false) {
                    overlapsFound = FindOneOrMoreOverlap(a);
                }
                // split the BAM alignment into discrete blocks and
                // look for overlaps only within each block.
                else {
                    bool overlapFoundForBlock;
                    bedVector bedBlocks;  // vec to store the discrete BED "blocks" from a
                    // we don't want to split on "D" ops, hence the "false"
                    getBamBlocks(bam, refs, bedBlocks, false);

                    vector<BED>::const_iterator bedItr = bedBlocks.begin();
                    vector<BED>::const_iterator bedEnd = bedBlocks.end();
                    for (; bedItr != bedEnd; ++bedItr) {
                        // BUGFIX: test the current block (*bedItr), not the
                        // full-length alignment "a".  Testing "a" made every
                        // block report the full-span result, which defeats
                        // the point of splitting (the non-BAM-output path
                        // below already used *bedItr correctly).
                        overlapFoundForBlock = FindOneOrMoreOverlap(*bedItr);
                        if (overlapFoundForBlock == true)
                            overlapsFound = true;
                    }
                }
                if (overlapsFound == true) {
                    // report hits unless the user asked for misses only
                    if (_noHit == false)
                        writer.SaveAlignment(bam);
                }
                else {
                    // report misses only when requested (-v style behavior)
                    if (_noHit == true) {
                        writer.SaveAlignment(bam);
                    }
                }
            }
            else {
                // treat the BAM alignment as a single BED "block"
                if (_obeySplits == false) {
                    FindOverlaps(a, hits);
                    hits.clear();
                }
                // split the BAM alignment into discrete BED blocks and
                // look for overlaps only within each block.
                else {
                    bedVector bedBlocks;  // vec to store the discrete BED "blocks" from a
                    getBamBlocks(bam, refs, bedBlocks, false);

                    vector<BED>::const_iterator bedItr = bedBlocks.begin();
                    vector<BED>::const_iterator bedEnd = bedBlocks.end();
                    for (; bedItr != bedEnd; ++bedItr) {
                        FindOverlaps(*bedItr, hits);
                        hits.clear();
                    }
                }
            }
        }
    }

    // close the relevant BAM files.
    reader.Close();
    if (_bamOutput == true) {
        writer.Close();
    }
}
namespace BamTools { // ------------------------------- // string literal constants // property names const string ALIGNMENTFLAG_PROPERTY = "alignmentFlag"; const string CIGAR_PROPERTY = "cigar"; const string INSERTSIZE_PROPERTY = "insertSize"; const string ISDUPLICATE_PROPERTY = "isDuplicate"; const string ISFAILEDQC_PROPERTY = "isFailedQC"; const string ISFIRSTMATE_PROPERTY = "isFirstMate"; const string ISMAPPED_PROPERTY = "isMapped"; const string ISMATEMAPPED_PROPERTY = "isMateMapped"; const string ISMATEREVERSESTRAND_PROPERTY = "isMateReverseStrand"; const string ISPAIRED_PROPERTY = "isPaired"; const string ISPRIMARYALIGNMENT_PROPERTY = "isPrimaryAlignment"; const string ISPROPERPAIR_PROPERTY = "isProperPair"; const string ISREVERSESTRAND_PROPERTY = "isReverseStrand"; const string ISSECONDMATE_PROPERTY = "isSecondMate"; const string ISSINGLETON_PROPERTY = "isSingleton"; const string MAPQUALITY_PROPERTY = "mapQuality"; const string MATEPOSITION_PROPERTY = "matePosition"; const string MATEREFERENCE_PROPERTY = "mateReference"; const string NAME_PROPERTY = "name"; const string POSITION_PROPERTY = "position"; const string QUERYBASES_PROPERTY = "queryBases"; const string REFERENCE_PROPERTY = "reference"; const string TAG_PROPERTY = "tag"; // boolalpha const string TRUE_STR = "true"; const string FALSE_STR = "false"; RefVector filterToolReferences; struct BamAlignmentChecker { bool check(const PropertyFilter& filter, const BamAlignment& al) { bool keepAlignment = true; const PropertyMap& properties = filter.Properties; PropertyMap::const_iterator propertyIter = properties.begin(); PropertyMap::const_iterator propertyEnd = properties.end(); for ( ; propertyIter != propertyEnd; ++propertyIter ) { // check alignment data field depending on propertyName const string& propertyName = (*propertyIter).first; const PropertyFilterValue& valueFilter = (*propertyIter).second; if ( propertyName == ALIGNMENTFLAG_PROPERTY ) keepAlignment &= valueFilter.check(al.AlignmentFlag); else 
if ( propertyName == CIGAR_PROPERTY ) { stringstream cigarSs; const vector<CigarOp>& cigarData = al.CigarData; if ( !cigarData.empty() ) { vector<CigarOp>::const_iterator cigarBegin = cigarData.begin(); vector<CigarOp>::const_iterator cigarIter = cigarBegin; vector<CigarOp>::const_iterator cigarEnd = cigarData.end(); for ( ; cigarIter != cigarEnd; ++cigarIter ) { const CigarOp& op = (*cigarIter); cigarSs << op.Length << op.Type; } keepAlignment &= valueFilter.check(cigarSs.str()); } } else if ( propertyName == INSERTSIZE_PROPERTY ) keepAlignment &= valueFilter.check(al.InsertSize); else if ( propertyName == ISDUPLICATE_PROPERTY ) keepAlignment &= valueFilter.check(al.IsDuplicate()); else if ( propertyName == ISFAILEDQC_PROPERTY ) keepAlignment &= valueFilter.check(al.IsFailedQC()); else if ( propertyName == ISFIRSTMATE_PROPERTY ) keepAlignment &= valueFilter.check(al.IsFirstMate()); else if ( propertyName == ISMAPPED_PROPERTY ) keepAlignment &= valueFilter.check(al.IsMapped()); else if ( propertyName == ISMATEMAPPED_PROPERTY ) keepAlignment &= valueFilter.check(al.IsMateMapped()); else if ( propertyName == ISMATEREVERSESTRAND_PROPERTY ) keepAlignment &= valueFilter.check(al.IsMateReverseStrand()); else if ( propertyName == ISPAIRED_PROPERTY ) keepAlignment &= valueFilter.check(al.IsPaired()); else if ( propertyName == ISPRIMARYALIGNMENT_PROPERTY ) keepAlignment &= valueFilter.check(al.IsPrimaryAlignment()); else if ( propertyName == ISPROPERPAIR_PROPERTY ) keepAlignment &= valueFilter.check(al.IsProperPair()); else if ( propertyName == ISREVERSESTRAND_PROPERTY ) keepAlignment &= valueFilter.check(al.IsReverseStrand()); else if ( propertyName == ISSECONDMATE_PROPERTY ) keepAlignment &= valueFilter.check(al.IsSecondMate()); else if ( propertyName == ISSINGLETON_PROPERTY ) { const bool isSingleton = al.IsPaired() && al.IsMapped() && !al.IsMateMapped(); keepAlignment &= valueFilter.check(isSingleton); } else if ( propertyName == MAPQUALITY_PROPERTY ) keepAlignment &= 
valueFilter.check(al.MapQuality); else if ( propertyName == MATEPOSITION_PROPERTY ) keepAlignment &= ( al.IsPaired() && al.IsMateMapped() && valueFilter.check(al.MateRefID) ); else if ( propertyName == MATEREFERENCE_PROPERTY ) { if ( !al.IsPaired() || !al.IsMateMapped() ) return false; BAMTOOLS_ASSERT_MESSAGE( (al.MateRefID>=0 && (al.MateRefID<(int)filterToolReferences.size())), "Invalid MateRefID"); const string& refName = filterToolReferences.at(al.MateRefID).RefName; keepAlignment &= valueFilter.check(refName); } else if ( propertyName == NAME_PROPERTY ) keepAlignment &= valueFilter.check(al.Name); else if ( propertyName == POSITION_PROPERTY ) keepAlignment &= valueFilter.check(al.Position); else if ( propertyName == QUERYBASES_PROPERTY ) keepAlignment &= valueFilter.check(al.QueryBases); else if ( propertyName == REFERENCE_PROPERTY ) { BAMTOOLS_ASSERT_MESSAGE( (al.RefID>=0 && (al.RefID<(int)filterToolReferences.size())), "Invalid RefID"); const string& refName = filterToolReferences.at(al.RefID).RefName; keepAlignment &= valueFilter.check(refName); } else if ( propertyName == TAG_PROPERTY ) keepAlignment &= checkAlignmentTag(valueFilter, al); else BAMTOOLS_ASSERT_UNREACHABLE; // if alignment fails at ANY point, just quit and return false if ( !keepAlignment ) return false; } BAMTOOLS_ASSERT_MESSAGE( keepAlignment, "Error in BamAlignmentChecker... 
keepAlignment should be true here"); return keepAlignment; } bool checkAlignmentTag(const PropertyFilterValue& valueFilter, const BamAlignment& al) { // ensure filter contains string data Variant entireTagFilter = valueFilter.Value; if ( !entireTagFilter.is_type<string>() ) return false; // localize string from variant const string& entireTagFilterString = entireTagFilter.get<string>(); // ensure we have at least "XX:x" if ( entireTagFilterString.length() < 4 ) return false; // get tagName & lookup in alignment // if found, set tagType to tag type character // if not found, return false const string& tagName = entireTagFilterString.substr(0,2); char tagType = '\0'; if ( !al.GetTagType(tagName, tagType) ) return false; // remove tagName & ":" from beginning tagFilter string tagFilterString = entireTagFilterString.substr(3); // switch on tag type to set tag query value & parse filter token int32_t intFilterValue, intQueryValue; uint32_t uintFilterValue, uintQueryValue; float realFilterValue, realQueryValue; string stringFilterValue, stringQueryValue; PropertyFilterValue tagFilter; PropertyFilterValue::ValueCompareType compareType; bool keepAlignment = false; switch (tagType) { // signed int tag type case 'c' : case 's' : case 'i' : if ( al.GetTag(tagName, intQueryValue) ) { if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, intFilterValue, compareType) ) { tagFilter.Value = intFilterValue; tagFilter.Type = compareType; keepAlignment = tagFilter.check(intQueryValue); } } break; // unsigned int tag type case 'C' : case 'S' : case 'I' : if ( al.GetTag(tagName, uintQueryValue) ) { if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, uintFilterValue, compareType) ) { tagFilter.Value = uintFilterValue; tagFilter.Type = compareType; keepAlignment = tagFilter.check(uintQueryValue); } } break; // 'real' tag type case 'f' : if ( al.GetTag(tagName, realQueryValue) ) { if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, 
realFilterValue, compareType) ) { tagFilter.Value = realFilterValue; tagFilter.Type = compareType; keepAlignment = tagFilter.check(realQueryValue); } } break; // string tag type case 'A': case 'Z': case 'H': if ( al.GetTag(tagName, stringQueryValue) ) { if ( FilterEngine<BamAlignmentChecker>::parseToken(tagFilterString, stringFilterValue, compareType) ) { tagFilter.Value = stringFilterValue; tagFilter.Type = compareType; keepAlignment = tagFilter.check(stringQueryValue); } } break; // unknown tag type default : keepAlignment = false; } return keepAlignment; } }; } // namespace BamTools
// Run
//
// Samples m_settings->AlignmentCount alignments (approximately uniformly by
// genomic coordinate) from the input BAM file(s) and writes them to the
// output BAM.  Requires index data for all inputs so that random jumps are
// possible.  Returns false (after printing an error to stderr) on any setup
// failure; true once the requested number of alignments has been written.
bool RandomTool::RandomToolPrivate::Run(void) {

    // set to default stdin if no input files provided
    if ( !m_settings->HasInput && !m_settings->HasInputFilelist )
        m_settings->InputFiles.push_back(Options::StandardIn());

    // add files in the filelist to the input file list
    if ( m_settings->HasInputFilelist ) {

        ifstream filelist(m_settings->InputFilelist.c_str(), ios::in);
        if ( !filelist.is_open() ) {
            cerr << "bamtools random ERROR: could not open input BAM file list... Aborting." << endl;
            return false;
        }

        // one filename per line
        string line;
        while ( getline(filelist, line) )
            m_settings->InputFiles.push_back(line);
    }

    // open our reader
    BamMultiReader reader;
    if ( !reader.Open(m_settings->InputFiles) ) {
        cerr << "bamtools random ERROR: could not open input BAM file(s)... Aborting." << endl;
        return false;
    }

    // look up index files for all BAM files
    reader.LocateIndexes();

    // make sure index data is available; random access (Jump) is impossible
    // without it
    if ( !reader.HasIndexes() ) {
        cerr << "bamtools random ERROR: could not load index data for all input BAM file(s)... Aborting." << endl;
        reader.Close();
        return false;
    }

    // get BamReader metadata
    const string headerText = reader.GetHeaderText();
    const RefVector references = reader.GetReferenceData();
    if ( references.empty() ) {
        cerr << "bamtools random ERROR: no reference data available... Aborting." << endl;
        reader.Close();
        return false;
    }

    // determine compression mode for BamWriter: uncompressed only when
    // writing to stdout and the user did not force compression
    bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() &&
                               !m_settings->IsForceCompression );
    BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
    if ( writeUncompressed ) compressionMode = BamWriter::Uncompressed;

    // open BamWriter
    BamWriter writer;
    writer.SetCompressionMode(compressionMode);
    if ( !writer.Open(m_settings->OutputFilename, headerText, references) ) {
        cerr << "bamtools random ERROR: could not open " << m_settings->OutputFilename
             << " for writing... Aborting." << endl;
        reader.Close();
        return false;
    }

    // if user specified a REGION constraint, attempt to parse REGION string
    BamRegion region;
    if ( m_settings->HasRegion && !Utilities::ParseRegionString(m_settings->Region, reader, region) ) {
        cerr << "bamtools random ERROR: could not parse REGION: " << m_settings->Region << endl;
        cerr << "Check that REGION is in valid format (see documentation) and that the coordinates are valid"
             << endl;
        reader.Close();
        writer.Close();
        return false;
    }

    // seed our random number generator
    // NOTE(review): rand()/srand() gives non-reproducible sampling; a seed
    // option would aid reproducibility — TODO confirm whether that matters here
    srand( time(NULL) );

    // grab random alignments
    // NOTE(review): if no alignment ever satisfies the pick (e.g. empty BAM,
    // or a REGION with no reads), this loop never increments i and will not
    // terminate — consider bounding the number of attempts
    BamAlignment al;
    unsigned int i = 0;
    while ( i < m_settings->AlignmentCount ) {

        int randomRefId    = 0;
        int randomPosition = 0;

        // use REGION constraints to select random refId & position
        if ( m_settings->HasRegion ) {

            // select a random refId
            randomRefId = getRandomInt(region.LeftRefID, region.RightRefID);

            // select a random position based on randomRefId; positions are
            // only bounded by the REGION edges on the edge references
            const int lowerBoundPosition = ( (randomRefId == region.LeftRefID)
                    ? region.LeftPosition
                    : 0 );
            const int upperBoundPosition = ( (randomRefId == region.RightRefID)
                    ? region.RightPosition
                    : (references.at(randomRefId).RefLength - 1) );
            randomPosition = getRandomInt(lowerBoundPosition, upperBoundPosition);
        }

        // otherwise select from all possible random refId & position
        else {

            // select random refId
            randomRefId = getRandomInt(0, (int)references.size() - 1);

            // select random position based on randomRefId
            const int lowerBoundPosition = 0;
            const int upperBoundPosition = references.at(randomRefId).RefLength - 1;
            randomPosition = getRandomInt(lowerBoundPosition, upperBoundPosition);
        }

        // if jump & read successful, save first alignment that overlaps random refId & position
        if ( reader.Jump(randomRefId, randomPosition) ) {
            while ( reader.GetNextAlignmentCore(al) ) {
                // take the first alignment at/after the random coordinate on
                // the chosen reference (Core variant skips char-data parsing)
                if ( al.RefID == randomRefId && al.Position >= randomPosition ) {
                    writer.SaveAlignment(al);
                    ++i;
                    break;
                }
            }
        }
    }

    // cleanup & exit
    reader.Close();
    writer.Close();
    return true;
}
/** * Construct a Vector using a reference Vector. This will be used to do * implicit casts from RefVector to Vector for most of the Vector operations. * * @param vec the RefVector to copy */ Vector::Vector(const RefVector& vec) : data({vec.x(), vec.y(), vec.z(), 0.0}) { }