// =============================================================================== // = Merge overlapping BED entries into a single entry, accounting for strandedness = // ================================================================================ void BedMerge::MergeBedStranded() { // load the "B" bed file into a map so // that we can easily compare "A" to it for overlaps _bed->loadBedFileIntoMapNoBin(); // loop through each chromosome and merge their BED entries masterBedMapNoBin::const_iterator m = _bed->bedMapNoBin.begin(); masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end(); for (; m != mEnd; ++m) { // bedList is already sorted by start position. string chrom = m->first; // make a list of the two strands to merge separately. vector<string> strands(2); strands[0] = "+"; strands[1] = "-"; // do two passes, one for each strand. for (unsigned int s = 0; s < strands.size(); s++) { int mergeCount = 1; int numOnStrand = 0; vector<string> names; vector<string> scores; // merge overlapping features for this chromosome. int start = -1; int end = -1; vector<BED>::const_iterator bedItr = m->second.begin(); vector<BED>::const_iterator bedEnd = m->second.end(); for (; bedItr != bedEnd; ++bedItr) { // if forcing strandedness, move on if the hit // is not on the current strand. if (bedItr->strand != strands[s]) { continue; } else { numOnStrand++; } if ( (((int) bedItr->start - end) > _maxDistance) || (end < 0)) { if (start >= 0) { ReportStranded(chrom, start, end, names, scores, mergeCount, strands[s]); // reset mergeCount = 1; names.clear(); scores.clear(); } start = bedItr->start; end = bedItr->end; if (!bedItr->name.empty()) names.push_back(bedItr->name); if (!bedItr->score.empty()) scores.push_back(bedItr->score); } else { if ((int) bedItr-> end > end) end = bedItr->end; mergeCount++; if (!bedItr->name.empty()) names.push_back(bedItr->name); if (!bedItr->score.empty()) scores.push_back(bedItr->score); } } if (start >= 0) { ReportStranded(chrom, start, end, names, scores, mergeCount, strands[s]); } } } }
// ================================================================================== // = Merge overlapping BED entries into a single entry, accounting for strandedness = // ================================================================================== void BedMerge::MergeBedStranded() { // load the "B" bed file into a map so // that we can easily compare "A" to it for overlaps _bed->loadBedFileIntoMapNoBin(); // loop through each chromosome and merge their BED entries masterBedMapNoBin::const_iterator m = _bed->bedMapNoBin.begin(); masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end(); for (; m != mEnd; ++m) { // bedList is already sorted by start position. vector<BED> bedList = m->second; // make a list of the two strands to merge separately. vector<string> strands(2); strands[0] = "+"; strands[1] = "-"; // do two passes, one for each strand. for (unsigned int s = 0; s < strands.size(); s++) { CHRPOS minStart = INT_MAX; CHRPOS maxEnd = 0; bool OIP = false; // OIP = Overlap In Progress. Lame, I realize. int prev = -1; unsigned int curr = 0; int mergeCount = 1; int numOnStrand = 0; map<string, bool> names; // loop through the BED entries for this chromosome // and look for overlaps for (curr = 0; curr < bedList.size(); ++curr) { // if forcing strandedness, move on if the hit // is not on the current strand. if (bedList[curr].strand != strands[s]) { continue; // continue force the next iteration of the for loop. } else { numOnStrand++; } // make sure prev points to an actual element on the // current strand if (prev < 0) { if (bedList[curr].strand == strands[s]) { prev = curr; } continue; } if ( overlaps(bedList[prev].start, bedList[prev].end, bedList[curr].start, bedList[curr].end) >= _maxDistance) { OIP = true; mergeCount++; minStart = min(bedList[prev].start, min(minStart, bedList[curr].start)); maxEnd = max(bedList[prev].end, max(maxEnd, bedList[curr].end)); names[bedList[prev].name] = true; names[bedList[curr].name] = true; } else if ( overlaps(minStart, maxEnd, bedList[curr].start, bedList[curr].end) >= _maxDistance) { mergeCount++; minStart = min(minStart, bedList[curr].start); maxEnd = max(maxEnd, bedList[curr].end); names[bedList[curr].name] = true; } else { // was there an overlap before the current entry broke it? if (OIP) { if (_numEntries) { cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << mergeCount << "\t" << strands[s] << endl; } else if (_reportNames) { cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t"; ReportMergedNames(names); cout << "\t" << strands[s] << endl; } else { cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << strands[s] << endl; } } else { if ((_numEntries) && (numOnStrand > 0)) { cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << mergeCount << "\t" << strands[s] << endl; } else if (_reportNames) { cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << bedList[prev].name << "\t" << strands[s] << endl; } else if (numOnStrand > 0) { cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << strands[s] << endl; } } // reset things for the next overlapping "block" OIP = false; mergeCount = 1; minStart = INT_MAX; maxEnd = 0; names.clear(); // add the name of the current element in prep for the next block names[bedList[curr].name] = true; } prev = curr; } // clean up based on the last entry for the current chromosome if (OIP) { if (_numEntries) { cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << mergeCount << "\t" << strands[s] << endl; } else if (_reportNames) { cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t"; ReportMergedNames(names); cout << "\t" << strands[s] << endl; } else { cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << strands[s] << endl; } } else { if ((_numEntries) && (numOnStrand > 0)) { cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << mergeCount << "\t" << strands[s] << endl; } else if ((_reportNames) && (numOnStrand > 0)) { cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << bedList[prev].name << "\t" << strands[s] << endl; } else if (numOnStrand > 0) { cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << strands[s] << endl; } } } } }