void BedCoverage::CollectCoverageBed() { // load the "B" bed file into a map so // that we can easily compare "A" to it for overlaps _bedB->loadBedCovFileIntoMap(); BED a; _bedA->Open(); // process each entry in A while (_bedA->GetNextBed(a)) { if (_bedA->_status == BED_VALID) { // process the BED entry as a single block if (_obeySplits == false) _bedB->countHits(a, _sameStrand, _diffStrand, _countsOnly); // split the BED into discrete blocksand process each independently. else { bedVector bedBlocks; GetBedBlocks(a, bedBlocks); // use countSplitHits to avoid over-counting each split chunk // as distinct read coverage. _bedB->countSplitHits(bedBlocks, _sameStrand, _diffStrand, _countsOnly); } } } _bedA->Close(); // report the coverage (summary or histogram) for BED B. if (_countsOnly == true) ReportCounts(); else ReportCoverage(); }
bool BedIntersect::FindBlockedOverlaps(const BED &a, const vector<BED> &a_blocks, const vector<BED> &hits, bool a_is_bam) { int a_footprint = GetTotalBlockLength(a_blocks); // container to store the set of raw hits // that actually overlap the A blocks bedVector valid_hits; valid_hits.reserve(hits.size()); // 1. Loop through each raw hit (outer loop) // 2. Break the raw hit into it;s blocks // and see of one of the hit blocks (inner loop) // overlaps one of a's blocks (inner, inner loop) // 3. If so, mark the hit as valid and add it to the valid_set. // Otherwise, the hit only overlapped the span of a and not // the individual blocks. Thus, it doesn't count. bedVector::const_iterator hItr = hits.begin(); bedVector::const_iterator hEnd = hits.end(); for (; hItr != hEnd; ++hItr) { // break the hit into blocks bedVector hitBlocks; GetBedBlocks(*hItr, hitBlocks); int b_footprint = GetTotalBlockLength(hitBlocks); // test to see if there is a valid hit with one of the blocks bool valid_hit = false; int total_overlap = 0; bedVector::const_iterator hbItr = hitBlocks.begin(); bedVector::const_iterator hbEnd = hitBlocks.end(); for (; hbItr != hbEnd; ++hbItr) { // look for overlaps between this hit/block and each block in a bedVector::const_iterator a_blockItr = a_blocks.begin(); bedVector::const_iterator a_blockEnd = a_blocks.end(); for (; a_blockItr != a_blockEnd; ++a_blockItr) { int hs = max(a_blockItr->start, hbItr->start); int he = min(a_blockItr->end, hbItr->end); int overlap = he - hs; if (overlap > 0) { valid_hit = true; total_overlap += overlap; } } } if (valid_hit) { // require sufficint overlap fraction (reciprocal or otherwise) // w.r.t to the "footprint" (i.e., the total length of each block) if ( ((float) total_overlap / (float) a_footprint) > _overlapFraction) { if (_reciprocal && ((float) total_overlap / (float) b_footprint) > _overlapFraction) { valid_hits.push_back(*hItr); } else if (!_reciprocal) { valid_hits.push_back(*hItr); } } } } if (!a_is_bam) { return processHits(a, valid_hits); } else return !valid_hits.empty(); }
void BedGenomeCoverage::CoverageBed() { BED a; ResetChromCoverage(); _bed->Open(); while (_bed->GetNextBed(a)) { if (_bed->_status == BED_VALID) { if (_filterByStrand == true) { if (a.strand.empty()) { cerr << "Input error: Interval is missing a strand value on line " << _bed->_lineNum << "." <<endl; exit(1); } if ( ! (a.strand == "-" || a.strand == "+") ) { cerr << "Input error: Invalid strand value (" << a.strand << ") on line " << _bed->_lineNum << "." << endl; exit(1); } // skip if the strand is not what the user requested. if (a.strand != _requestedStrand) continue; } // are we on a new chromosome? if (a.chrom != _currChromName) StartNewChrom(a.chrom); if (_obeySplits == true) { bedVector bedBlocks; // vec to store the discrete BED "blocks" GetBedBlocks(a, bedBlocks); AddBlockedCoverage(bedBlocks); } else if (_only_5p_end) { CHRPOS pos = ( a.strand=="+" ) ? a.start : a.end-1; AddCoverage(pos,pos); } else if (_only_3p_end) { CHRPOS pos = ( a.strand=="-" ) ? a.start : a.end-1; AddCoverage(pos,pos); } else AddCoverage(a.start, a.end-1); } } _bed->Close(); // process the results of the last chromosome. ReportChromCoverage(_currChromCoverage, _currChromSize, _currChromName, _currChromDepthHist); // report all empty chromsomes PrintEmptyChromosomes(); // report the overall coverage if asked. PrintFinalCoverage(); }
//****************************************************************************** // ExtractDNA //****************************************************************************** void Bed2Fa::ExtractDNA() { /* Make sure that we can open all of the files successfully*/ // open the fasta database for reading ifstream faDb(_dbFile.c_str(), ios::in); if ( !faDb ) { cerr << "Error: The requested fasta database file (" << _dbFile << ") could not be opened. Exiting!" << endl; exit (1); } // open and memory-map genome file FastaReference *fr = new FastaReference; bool memmap = true; fr->open(_dbFile, memmap, _useFullHeader); BED bed, nullBed; string sequence; _bed->Open(); while (_bed->GetNextBed(bed)) { if (_bed->_status == BED_VALID) { // make sure we are extracting >= 1 bp if (bed.zeroLength == false) { size_t seqLength = fr->sequenceLength(bed.chrom); // seqLength > 0 means chrom was found in index. // seqLength == 0 otherwise. if (seqLength) { // make sure this feature will not exceed // the end of the chromosome. if ( (bed.start <= seqLength) && (bed.end <= seqLength) ) { int length = bed.end - bed.start; if(_useBlocks){ // vec to store the discrete BED "blocks" bedVector bedBlocks; GetBedBlocks(bed, bedBlocks); sequence.clear(); for (int i = 0; i < (int) bedBlocks.size(); ++i) { sequence += fr->getSubSequence(bed.chrom, bedBlocks[i].start, bedBlocks[i].end - bedBlocks[i].start); } } else { sequence = \ fr->getSubSequence(bed.chrom, bed.start, length); } ReportDNA(bed, sequence); } else { cerr << "Feature (" << bed.chrom << ":" << bed.start << "-" << bed.end << ") beyond the length of " << bed.chrom << " size (" << seqLength << " bp). Skipping." << endl; } } else { cerr << "WARNING. chromosome (" << bed.chrom << ") was not found in the FASTA file. Skipping." << endl; } } // handle zeroLength else { cerr << "Feature (" << bed.chrom << ":" << bed.start+1 << "-" << bed.end-1 << ") has length = 0, Skipping." << endl; } bed = nullBed; } } _bed->Close(); }
void BedIntersect::IntersectBed() { // create new BED file objects for A and B _bedA = new BedFile(_bedAFile); _bedB = new BedFile(_bedBFile); if (_sortedInput == false) { // load the "B" file into a map in order to // compare each entry in A to it in search of overlaps. _bedB->loadBedFileIntoMap(); vector<BED> hits; hits.reserve(100); BED a; // open the "A" file, process each BED entry and searh for overlaps. _bedA->Open(); // report A's header first if asked. if (_printHeader == true) { _bedA->PrintHeader(); } while (_bedA->GetNextBed(a)) { if (_bedA->_status == BED_VALID) { // treat the BED as a single "block" if (_obeySplits == false) FindOverlaps(a, hits); // split the BED12 into blocks and look for overlaps in each discrete block else { // find the hits that overlap with the full span of the blocked BED _bedB->allHits(a.chrom, a.start, a.end, a.strand, hits, _sameStrand, _diffStrand, _overlapFraction, _reciprocal); // break a into discrete blocks, as we need to // measure overlap with the individual blocks, not the full span. bedVector a_blocks; GetBedBlocks(a, a_blocks); // find the overlaps between the block in A and B // last parm is false as a is not a BAM entry FindBlockedOverlaps(a, a_blocks, hits, false); } hits.clear(); } } _bedA->Close(); } else { // use the chromsweep algorithm to detect overlaps on the fly. ChromSweep sweep = ChromSweep(_bedA, _bedB, _sameStrand, _diffStrand, _overlapFraction, _reciprocal, _printHeader); pair<BED, vector<BED> > hit_set; hit_set.second.reserve(10000); while (sweep.Next(hit_set)) { if (_obeySplits == false) processHits(hit_set.first, hit_set.second); else { bedVector a_blocks; GetBedBlocks(hit_set.first, a_blocks); FindBlockedOverlaps(hit_set.first, a_blocks, hit_set.second, false); } } } }