示例#1
0
// ==================================================================================
// = Merge overlapping BED entries into a single entry, accounting for strandedness =
// ==================================================================================
//
// Loads the BED file through _bed->loadBedFileIntoMapNoBin() and then walks
// _bed->bedMapNoBin one chromosome at a time.  Every chromosome is scanned
// twice: once for features whose strand is "+" and once for "-".  Features
// with any other strand value are skipped in both passes.
//
// Within a pass, a feature is folded into the block being built when the gap
// between its start and the block's current end is at most _maxDistance;
// otherwise the finished block is handed to ReportStranded() and the feature
// opens a new block.  The non-empty names and scores of all features in a
// block, plus the number of features merged, are passed along to
// ReportStranded().
void BedMerge::MergeBedStranded() {

    // load the BED file into a per-chromosome map so that the features of
    // each chromosome can be scanned in one pass.
    _bed->loadBedFileIntoMapNoBin();

    // the two strands are merged separately.  The list never changes, so it
    // is built once here instead of once per chromosome.
    vector<string> strands(2);
    strands[0] = "+";
    strands[1] = "-";

    // loop through each chromosome and merge their BED entries
    masterBedMapNoBin::const_iterator m    = _bed->bedMapNoBin.begin();
    masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end();
    for (; m != mEnd; ++m) {

        // NOTE(review): the single forward scan below only merges correctly
        // if the features of a chromosome are sorted by start position --
        // presumably guaranteed by the loader; confirm.
        string chrom = m->first;

        // do two passes, one for each strand.
        for (unsigned int s = 0; s < strands.size(); s++) {
            int mergeCount = 1;
            vector<string> names;
            vector<string> scores;

            // extent of the block currently being built.
            // -1 is the sentinel for "no block has been opened yet".
            int start = -1;
            int end   = -1;

            vector<BED>::const_iterator bedItr = m->second.begin();
            vector<BED>::const_iterator bedEnd = m->second.end();
            for (; bedItr != bedEnd; ++bedItr) {

                // only features on the strand of the current pass take part.
                if (bedItr->strand != strands[s]) { continue; }

                // a new block starts with the very first feature of the pass,
                // or when the gap to the current block exceeds _maxDistance.
                const bool startsNewBlock =
                    (end < 0) ||
                    ((static_cast<int>(bedItr->start) - end) > _maxDistance);

                if (startsNewBlock) {
                    // flush the previous block, if there was one.
                    if (start >= 0) {
                        ReportStranded(chrom, start, end, names,
                                       scores, mergeCount, strands[s]);
                        // reset
                        mergeCount = 1;
                        names.clear();
                        scores.clear();
                    }
                    start = bedItr->start;
                    end   = bedItr->end;
                }
                else {
                    // extend the block; a feature nested inside the block
                    // must not shrink its end.
                    if (static_cast<int>(bedItr->end) > end) end = bedItr->end;
                    mergeCount++;
                }

                // either way the feature now belongs to the current block.
                if (!bedItr->name.empty())  names.push_back(bedItr->name);
                if (!bedItr->score.empty()) scores.push_back(bedItr->score);
            }

            // flush the last block of this chromosome/strand, if any.
            if (start >= 0) {
                ReportStranded(chrom, start, end, names,
                               scores, mergeCount, strands[s]);
            }
        }
    }
}
示例#2
0
// ==================================================================================
// = Merge overlapping BED entries into a single entry, accounting for strandedness =
// ==================================================================================
void BedMerge::MergeBedStranded() {

	// load the "B" bed file into a map so
	// that we can easily compare "A" to it for overlaps
	_bed->loadBedFileIntoMapNoBin();

	// loop through each chromosome and merge their BED entries
	masterBedMapNoBin::const_iterator m    = _bed->bedMapNoBin.begin(); 
	masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end(); 
    for (; m != mEnd; ++m) {
		// bedList is already sorted by start position.
		vector<BED> bedList = m->second; 

		// make a list of the two strands to merge separately.
		vector<string> strands(2);
		strands[0] = "+";
		strands[1] = "-";

		// do two passes, one for each strand.
		for (unsigned int s = 0; s < strands.size(); s++) {

			CHRPOS minStart = INT_MAX;
			CHRPOS maxEnd = 0;
			bool OIP = false;       // OIP = Overlap In Progress.  Lame, I realize.
			int prev = -1;
			unsigned int curr = 0;
			int mergeCount = 1;
			int numOnStrand = 0;
			map<string, bool> names;	
			
			// loop through the BED entries for this chromosome
			// and look for overlaps
			for (curr = 0; curr < bedList.size(); ++curr) {

				// if forcing strandedness, move on if the hit
				// is not on the current strand.
				
				if (bedList[curr].strand != strands[s]) {
					continue;		// continue force the next iteration of the for loop.
				}
				else {
					numOnStrand++;
				}

				// make sure prev points to an actual element on the
				// current strand
				if (prev < 0) {
					if (bedList[curr].strand == strands[s]) {
						prev = curr;
					}
					continue;
				}
	
				if ( overlaps(bedList[prev].start, bedList[prev].end, 
				 			bedList[curr].start, bedList[curr].end) >= _maxDistance) {					
					OIP = true;
					mergeCount++;
					minStart = min(bedList[prev].start, min(minStart, bedList[curr].start));
					maxEnd = max(bedList[prev].end, max(maxEnd, bedList[curr].end));

					names[bedList[prev].name] = true;
					names[bedList[curr].name] = true;
				}
				else if ( overlaps(minStart, maxEnd, 
								bedList[curr].start, bedList[curr].end) >= _maxDistance) {
					mergeCount++;
					minStart = min(minStart, bedList[curr].start);
					maxEnd = max(maxEnd, bedList[curr].end);
					names[bedList[curr].name] = true;
				}
				else {

					// was there an overlap before the current entry broke it?
					if (OIP) {
						if (_numEntries) {
							cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << mergeCount << "\t" << strands[s] << endl;
						}
						else if (_reportNames) {
							cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t";
							ReportMergedNames(names);
							cout << "\t" << strands[s] << endl;
						}
						else {
							cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << strands[s] << endl;
						}
					}
					else {
						if ((_numEntries) && (numOnStrand > 0)) {
							cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << mergeCount << "\t" << strands[s] << endl;
						}
						else if (_reportNames) {
							cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << bedList[prev].name << "\t" << strands[s] << endl;
						}
						else if (numOnStrand > 0) {
							cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << strands[s] << endl;
						}
					}

					// reset things for the next overlapping "block"
					OIP = false;
					mergeCount = 1;			
					minStart = INT_MAX;
					maxEnd = 0;
					names.clear();
					
					// add the name of the current element in prep for the next block
					names[bedList[curr].name] = true;
				}
				prev = curr;
			}

			// clean up based on the last entry for the current chromosome
			if (OIP) {
				if (_numEntries) {
					cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << mergeCount << "\t" << strands[s] << endl;
				}
				else if (_reportNames) {
					cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t";
					ReportMergedNames(names);
					cout << "\t" << strands[s] << endl;
				}
				else {
					cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << strands[s] << endl;
				}
			}
			else {
				if ((_numEntries) && (numOnStrand > 0)) {
					cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << mergeCount << "\t" << strands[s] << endl;
				}
				else if ((_reportNames) && (numOnStrand > 0)) {
					cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << bedList[prev].name << "\t" << strands[s] << endl;
				}
				else if (numOnStrand > 0) {
					cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << strands[s] << endl;
				}
			}
		}
	}
}