Esempio n. 1
0
void BedCoverage::CollectCoverageBed() {

    // load the "B" bed file into a map so
    // that we can easily compare "A" to it for overlaps
    _bedB->loadBedCovFileIntoMap();

    BED a;
    _bedA->Open();
    // process each entry in A
    while (_bedA->GetNextBed(a)) {
        if (_bedA->_status == BED_VALID) {
            // process the BED entry as a single block
            if (_obeySplits == false)
                _bedB->countHits(a, _sameStrand, _diffStrand, _countsOnly);
            // split the BED into discrete blocksand process each independently.
            else {
                bedVector bedBlocks;
                GetBedBlocks(a, bedBlocks);
                // use countSplitHits to avoid over-counting each split chunk
                // as distinct read coverage.
                _bedB->countSplitHits(bedBlocks, _sameStrand, _diffStrand, _countsOnly);
            }
        }
    }
    _bedA->Close();

    // report the coverage (summary or histogram) for BED B.
    if (_countsOnly == true)
        ReportCounts();
    else 
        ReportCoverage();
}
Esempio n. 2
0
bool BedIntersect::FindBlockedOverlaps(const BED &a, const vector<BED> &a_blocks, 
                                       const vector<BED> &hits, bool a_is_bam) {
    int a_footprint = GetTotalBlockLength(a_blocks);
    // container to store the set of raw hits 
    // that actually overlap the A blocks
    bedVector valid_hits;
    valid_hits.reserve(hits.size());
    
    // 1. Loop through each raw hit (outer loop)
    // 2. Break the raw hit into it;s blocks
    //    and see of one of the hit blocks (inner loop)
    //    overlaps one of a's blocks (inner, inner loop)
    // 3. If so, mark the hit as valid and add it to the valid_set.
    //    Otherwise, the hit only overlapped the span of a and not
    //    the individual blocks.  Thus, it doesn't count.
    bedVector::const_iterator hItr = hits.begin();
    bedVector::const_iterator hEnd = hits.end();
    for (; hItr != hEnd; ++hItr) {
        // break the hit into blocks
        bedVector hitBlocks;
        GetBedBlocks(*hItr, hitBlocks);
        int b_footprint = GetTotalBlockLength(hitBlocks);
        // test to see if there is a valid hit with one of the blocks
        bool valid_hit    = false;
        int total_overlap = 0;
        bedVector::const_iterator hbItr = hitBlocks.begin();
        bedVector::const_iterator hbEnd = hitBlocks.end();
        for (; hbItr != hbEnd; ++hbItr) {
            // look for overlaps between this hit/block and each block in a
            bedVector::const_iterator a_blockItr = a_blocks.begin();
            bedVector::const_iterator a_blockEnd = a_blocks.end();
            for (; a_blockItr != a_blockEnd; ++a_blockItr) {
                int hs = max(a_blockItr->start, hbItr->start);
                int he = min(a_blockItr->end, hbItr->end);
                int overlap = he - hs;
                if (overlap > 0) {
                    valid_hit = true;
                    total_overlap += overlap;
                }
            }
        }
        if (valid_hit) {
            // require sufficint overlap fraction (reciprocal or otherwise)
            // w.r.t to the "footprint" (i.e., the total length of each block)
            if ( ((float) total_overlap / (float) a_footprint) > _overlapFraction) {
                if (_reciprocal && ((float) total_overlap / (float) b_footprint) > _overlapFraction) {
                    valid_hits.push_back(*hItr);
                }
                else if (!_reciprocal) {
                    valid_hits.push_back(*hItr);
                }
            }
        }
    }
    if (!a_is_bam) {
        return processHits(a, valid_hits);
    }
    else
        return !valid_hits.empty();
}
Esempio n. 3
0
void BedGenomeCoverage::CoverageBed() {

    BED a;

    ResetChromCoverage();

    _bed->Open();
    while (_bed->GetNextBed(a)) {
        if (_bed->_status == BED_VALID) {
            if (_filterByStrand == true) {
                if (a.strand.empty()) {
                    cerr << "Input error: Interval is missing a strand value on line " << _bed->_lineNum << "." <<endl;
                    exit(1);
                }
                if ( ! (a.strand == "-" || a.strand == "+") ) {
                    cerr << "Input error: Invalid strand value (" << a.strand << ") on line " << _bed->_lineNum << "." << endl;
                    exit(1);
                }
                // skip if the strand is not what the user requested.
                if (a.strand != _requestedStrand)
                    continue;
            }

            // are we on a new chromosome?
            if (a.chrom != _currChromName)
                StartNewChrom(a.chrom);

            if (_obeySplits == true) {
                bedVector bedBlocks; // vec to store the discrete BED "blocks"
                GetBedBlocks(a, bedBlocks);
                AddBlockedCoverage(bedBlocks);
            }
            else if (_only_5p_end) {
                CHRPOS pos = ( a.strand=="+" ) ? a.start : a.end-1;
                AddCoverage(pos,pos);
            }
            else if (_only_3p_end) {
                CHRPOS pos = ( a.strand=="-" ) ? a.start : a.end-1;
                AddCoverage(pos,pos);
            }
            else
                AddCoverage(a.start, a.end-1);
        }
    }
    _bed->Close();

    // process the results of the last chromosome.
    ReportChromCoverage(_currChromCoverage, _currChromSize,
            _currChromName, _currChromDepthHist);

    // report all empty chromsomes
    PrintEmptyChromosomes();

    // report the overall coverage if asked.
    PrintFinalCoverage();
}
Esempio n. 4
0
//******************************************************************************
// ExtractDNA
//
// For every valid record in the BED file, fetch the corresponding sequence
// from the FASTA database (_dbFile) and pass it to ReportDNA().
//
//  - Exits with status 1 if the FASTA database cannot be opened.
//  - Records that are zero-length, lie on a chromosome for which the FASTA
//    reference reports a length of 0, or extend past the end of their
//    chromosome are skipped with a message on stderr.
//  - When _useBlocks is set, the sequence is the concatenation of the
//    record's individual blocks rather than its full span.
//******************************************************************************
void Bed2Fa::ExtractDNA() {

    /* Make sure that we can open all of the files successfully*/

    // Probe the fasta database for readability. The stream is only needed
    // for this check, so it is closed again right away instead of being
    // held open for the whole run.
    ifstream faDb(_dbFile.c_str(), ios::in);
    if ( !faDb ) {
        cerr << "Error: The requested fasta database file (" 
             << _dbFile << ") could not be opened. Exiting!" 
             << endl;
        exit (1);
    }
    faDb.close();

    // Open the genome file with memory-mapping requested. The reference
    // lives on the stack so that it is destroyed when this function
    // returns (it was previously heap-allocated and never deleted).
    FastaReference fr;
    const bool memmap = true;
    fr.open(_dbFile, memmap, _useFullHeader);

    BED bed, nullBed;
    string sequence;

    _bed->Open();
    while (_bed->GetNextBed(bed)) {
        if (_bed->_status != BED_VALID)
            continue;

        if (bed.zeroLength) {
            // nothing to extract: we require >= 1 bp.
            // NOTE(review): start+1 / end-1 presumably undoes padding applied
            // to zero-length records when they are parsed -- confirm.
            cerr << "Feature (" << bed.chrom << ":" 
                 << bed.start+1 << "-" << bed.end-1 
                 << ") has length = 0, Skipping." 
                 << endl;
        }
        else {
            // seqLength > 0 means chrom was found in index.
            // seqLength == 0 otherwise.
            const size_t seqLength = fr.sequenceLength(bed.chrom);

            if (seqLength == 0) {
                cerr << "WARNING. chromosome (" 
                     << bed.chrom 
                     << ") was not found in the FASTA file. Skipping."
                     << endl;
            }
            else if ( !((bed.start <= seqLength) && (bed.end <= seqLength)) ) {
                // the feature would run off the end of the chromosome.
                cerr << "Feature (" << bed.chrom << ":" 
                     << bed.start << "-" << bed.end 
                    << ") beyond the length of "
                    << bed.chrom 
                    << " size (" << seqLength << " bp).  Skipping." 
                    << endl;
            }
            else {
                if (_useBlocks) {
                    // stitch together the sequence of each discrete block
                    bedVector bedBlocks;
                    GetBedBlocks(bed, bedBlocks);
                    sequence.clear();
                    for (size_t i = 0; i < bedBlocks.size(); ++i) {
                        sequence += fr.getSubSequence(bed.chrom,
                                bedBlocks[i].start,
                                bedBlocks[i].end - bedBlocks[i].start);
                    }
                }
                else {
                    // the full span of the feature
                    int length = bed.end - bed.start;
                    sequence = fr.getSubSequence(bed.chrom, bed.start, length);
                }
                ReportDNA(bed, sequence);
            }
        }
        // reset the record so no fields carry over into the next one
        bed = nullBed;
    }
    _bed->Close();
}
Esempio n. 5
0
void BedIntersect::IntersectBed() {

    // create new BED file objects for A and B
    _bedA = new BedFile(_bedAFile);
    _bedB = new BedFile(_bedBFile);

    if (_sortedInput == false) {
        // load the "B" file into a map in order to
        // compare each entry in A to it in search of overlaps.
        _bedB->loadBedFileIntoMap();

        vector<BED> hits;
        hits.reserve(100);
        BED a;

        // open the "A" file, process each BED entry and searh for overlaps.
        _bedA->Open();
        // report A's header first if asked.
        if (_printHeader == true) {
            _bedA->PrintHeader();
        }
        while (_bedA->GetNextBed(a)) {
            if (_bedA->_status == BED_VALID) {
                // treat the BED as a single "block"
                if (_obeySplits == false)
                    FindOverlaps(a, hits);
                // split the BED12 into blocks and look for overlaps in each discrete block
                else {
                    // find the hits that overlap with the full span of the blocked BED
                    _bedB->allHits(a.chrom, a.start, a.end, a.strand,
                                   hits, _sameStrand, _diffStrand,
                                   _overlapFraction, _reciprocal);
                    // break a into discrete blocks, as we need to 
                    // measure overlap with the individual blocks, not the full span.
                    bedVector a_blocks; 
                    GetBedBlocks(a, a_blocks);
                    // find the overlaps between the block in A and B 
                    // last parm is false as a is not a BAM entry
                    FindBlockedOverlaps(a, a_blocks, hits, false);
                }
                hits.clear();
            }
        }
        _bedA->Close();
    }
    else {
        // use the chromsweep algorithm to detect overlaps on the fly.
        ChromSweep sweep = ChromSweep(_bedA, _bedB, 
                                      _sameStrand, _diffStrand, 
                                      _overlapFraction, _reciprocal,
                                      _printHeader);

        pair<BED, vector<BED> > hit_set;
        hit_set.second.reserve(10000);
        while (sweep.Next(hit_set)) {
            if (_obeySplits == false)
                processHits(hit_set.first, hit_set.second);
            else {
                bedVector a_blocks; 
                GetBedBlocks(hit_set.first, a_blocks);
                FindBlockedOverlaps(hit_set.first, a_blocks, hit_set.second, false);
            }
        }
    }
}