//  For any singleton unitig, eject the read and delete the unitig.  Eventually,
//  we will stop making singleton unitigs.
//
void
breakSingletonTigs(UnitigVector &unitigs) {
  uint32  removed = 0;

  for (uint32 ti=1; ti<unitigs.size(); ti++) {
    Unitig  *utg = unitigs[ti];

    if (utg == NULL)               //  No unitig in this slot.
      continue;

    if (utg->ufpath.size() > 1)    //  Not a singleton.
      continue;

    unitigs[ti] = NULL;                       //  Remove the unitig from the list
    utg->removeFrag(utg->ufpath[0].ident);    //  Eject the read
    delete utg;                               //  Reclaim space
    removed++;                                //  Count
  }

  //  BUGFIX: the pluralization test was inverted ("removed != 1" emitted the
  //  empty string for every count except one).  Singular iff exactly one.
  writeLog("Removed %u read%s from %u singleton unitig%s.\n",
           removed, (removed == 1) ? "" : "s",
           removed, (removed == 1) ? "" : "s");
}
//  Place contained reads into the unitig of their best container.  Iterates
//  until no further placements are possible: a containee can only be placed
//  after its container has itself been placed.
//
void
placeContainsUsingBestOverlaps(UnitigVector &unitigs) {
  uint32   fragsPlaced  = 1;
  uint32   fragsPending = 0;

  logFileFlags &= ~LOG_PLACE_FRAG;

  while (fragsPlaced > 0) {
    fragsPlaced  = 0;
    fragsPending = 0;

    writeLog("==> PLACING CONTAINED FRAGMENTS\n");

    for (uint32 fid=1; fid<FI->numFragments()+1; fid++) {
      BestContainment *bestcont = OG->getBestContainer(fid);
      Unitig          *utg;

      if (bestcont->isContained == false)
        //  Not a contained fragment.
        continue;

      if (Unitig::fragIn(fid) != 0)
        //  Containee already placed.
        continue;

      if (Unitig::fragIn(bestcont->container) == 0) {
        //  Container not placed (yet); retry on the next pass.
        fragsPending++;
        continue;
      }

      utg = unitigs[Unitig::fragIn(bestcont->container)];
      utg->addContainedFrag(fid, bestcont, logFileFlagSet(LOG_INITIAL_CONTAINED_PLACEMENT));

      //  BUGFIX: the failure message claims to report a unitig id, but used to
      //  pass bestcont->container, which is the container READ id.  Report the
      //  unitig we actually tried to add to.
      if (utg->id() != Unitig::fragIn(fid))
        writeLog("placeContainsUsingBestOverlaps()-- FAILED to add frag %d to unitig %d.\n", fid, utg->id());
      assert(utg->id() == Unitig::fragIn(fid));

      fragsPlaced++;
    }

    writeLog("==> PLACING CONTAINED FRAGMENTS - placed %d fragments; still need to place %d\n",
             fragsPlaced, fragsPending);

    //  If an entire pass placed nothing but reads remain pending, their
    //  containers form an unplaceable cycle ("zombies"); give up.
    if ((fragsPlaced == 0) && (fragsPending > 0)) {
      writeLog("Stopping contained fragment placement due to zombies.\n");
      fragsPlaced  = 0;
      fragsPending = 0;
    }
  }

  //  Adding contained reads breaks the position-sorted invariant; restore it.
  for (uint32 ti=1; ti<unitigs.size(); ti++) {
    Unitig *utg = unitigs[ti];

    if (utg)
      utg->sort();
  }
}
//  Look for split unitigs whose ends could be rejoined.  Candidate joins are
//  collected and sorted, but NOTE: the actual joining is currently disabled by
//  an early 'return' below -- this function only logs how many candidate
//  pairs were found.
//
void
joinUnitigs(UnitigVector &unitigs, bool enableJoining) {

  if (enableJoining == false)
    return;

  writeLog("==> JOINING SPLIT UNITIGS\n");

  //  Sort unitigs by joined size.  Sort.  Join the largest first.

  vector<joinEntry>  joins;

  //  Over all unitigs, evaluate if a unitig is a candidate for merging onto something.

  for (uint32 frID=0; frID<unitigs.size(); frID++) {
    Unitig        *fr   = unitigs[frID];

    if (fr == NULL)
      //  Ain't no unitig here, mister!
      continue;

    if (fr->ufpath.size() < 2)
      //  Ain't no real unitig here, mister!
      continue;

    //  Do we look like a bubble?  Bubbles are handled elsewhere; skip them.

    if (joinUnitigs_looksLikeBubble(fr))
      continue;

    //  The for loop tries reads close to the end - but we don't support joining these,
    //  so only the terminal read (ii == 0) is ever examined on each end.

    for (uint32 ii=0; (ii < 1) && (ii < fr->ufpath.size()); ii++)
      if (joinUnitigs_examineEnd(unitigs, fr, ii, true, joins))
        break;

    for (uint32 ii=0; (ii < 1) && (ii < fr->ufpath.size()); ii++)
      if (joinUnitigs_examineEnd(unitigs, fr, ii, false, joins))
        break;
  }  //  Over all unitigs.

  writeLog("Found %d pairs of unitigs to join.\n", (int)joins.size());

  //  Biggest joined size first.
  std::sort(joins.begin(), joins.end(), greater<joinEntry>());

  //  NOTE(review): this return makes the loop below dead code -- the append
  //  step is deliberately disabled (see also the commented-out call).
  return;

  for (uint32 j=0; j<joins.size(); j++) {
    joinEntry  *join = &joins[j];

    //joinUnitigs_append(unitigs, join);
  }
}
//  Sanity check: every live read must be placed in exactly one unitig, and
//  never in unitig zero.  Aborts (assert) on any violation.
//
void
checkUnitigMembership(UnitigVector &unitigs) {
  uint32  numReads  = FI->numFragments();
  uint32  noUnitig  = 0xffffffff;
  uint32 *readToTig = new uint32 [numReads + 1];

  //  No read is placed anywhere yet.
  for (uint32 rr=0; rr<=numReads; rr++)
    readToTig[rr] = noUnitig;

  //  Walk every unitig and record where each read lives, complaining about
  //  out-of-range idents and reads that show up twice.
  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig *tig = unitigs[ti];

    if (tig == NULL)
      continue;

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode *frg = &tig->ufpath[fi];

      if (frg->ident > numReads)
        fprintf(stderr, "tig %u ufpath[%d] ident %u more than number of reads %u\n",
                tig->id(), fi, frg->ident, numReads);

      if (readToTig[frg->ident] != noUnitig)
        fprintf(stderr, "tig %u ufpath[%d] ident %u placed multiple times\n",
                tig->id(), fi, frg->ident);

      assert(frg->ident <= numReads);             //  Can't be out of range.
      assert(readToTig[frg->ident] == noUnitig);  //  Read must be not placed yet.

      readToTig[frg->ident] = ti;
    }
  }

  //  Every non-deleted read must have landed somewhere other than tig 0.
  for (uint32 rr=0; rr<=numReads; rr++) {
    if (FI->fragmentLength(rr) == 0)   //  Deleted read, exempt.
      continue;

    assert(readToTig[rr] != 0);        //  There shouldn't be a unitig 0.
    assert(readToTig[rr] != noUnitig); //  The read should be in a unitig.
  }

  delete [] readToTig;
}
//  Dump the current unitigs to an intermediate tigStore, if intermediate
//  logging was requested.  Partition size is chosen from the total read count
//  and the total length of non-trivial unitigs.
//
void
reportUnitigs(UnitigVector &unitigs, const char *prefix, const char *name) {

  if (logFileFlagSet(LOG_INTERMEDIATE_UNITIGS) == 0)
    return;

  uint32  totalFrags = 0;
  uint32  fragsPer   = 0;
  uint64  totalLen   = 0;

  //  Count reads, and sum the length of unitigs with more than two reads,
  //  to estimate how many reads belong in each partition.
  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig  *utg = unitigs[ti];

    if (utg == NULL)
      continue;

    totalFrags += utg->ufpath.size();

    if (utg->ufpath.size() > 2)
      totalLen += utg->getLength();
  }

  if      (totalLen < 16 * 1024 * 1024)
    fragsPer = totalFrags / 7;
  else if (totalLen < 64 * 1024 * 1024)
    fragsPer = totalFrags / 63;
  else
    fragsPer = totalFrags / 127;

  char  tigStorePath[FILENAME_MAX];
  sprintf(tigStorePath, "%s.%03u.%s.tigStore", prefix, logFileOrder, name);

  //  Failing to do this results in consensus running about 40 times slower.
  //  Three hours instead of five minutes.
  setParentAndHang(unitigs);

  writeUnitigsToStore(unitigs, tigStorePath, tigStorePath, fragsPer, false);
}
//  For every unitig, report the best overlaps contained in the
//  unitig, and all overlaps contained in the unitig.
//
//  Wow, this is ancient.
//
//  NOTE(review): the entire reporting body is compiled out with '#if 0', so
//  at present this function only creates (and truncates) an empty
//  '<fileprefix>.unused.ovl' file.
//
void
writeOverlapsUsed(UnitigVector &unitigs, char *fileprefix) {
  char  filename[FILENAME_MAX] = {0};

#if 0
  GenericMesg  pmesg;
  OverlapMesg  omesg;
#endif

  sprintf(filename, "%s.unused.ovl", fileprefix);
  FILE *file = fopen(filename, "w");
  assert(file != NULL);

#if 0
  //  Disabled legacy code: emit, in proto-message format, every best edge
  //  that leaves a unitig (i.e., whose target read lies in a different tig).
  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig  *utg = unitigs[ti];

    if (utg == NULL)
      continue;

    for (uint32 fi=0; fi<utg->ufpath.size(); fi++) {
      ufNode  *frg = &utg->ufpath[fi];

      //  Where is our best overlap?  Contained or dovetail?

      BestEdgeOverlap  *bestedge5 = OG->getBestEdgeOverlap(frg->ident, false);
      BestEdgeOverlap  *bestedge3 = OG->getBestEdgeOverlap(frg->ident, true);

      int               bestident5 = 0;
      int               bestident3 = 0;

      if (bestedge5) {
        bestident5 = bestedge5->fragId();

        //  Only report the edge if it leaves this unitig.
        if ((bestident5 > 0) && (utg->fragIn(bestident5) != utg->id())) {
          omesg.aifrag          = frg->ident;
          omesg.bifrag          = bestident5;
          omesg.ahg             = bestedge5->ahang();
          omesg.bhg             = bestedge5->bhang();
          omesg.orientation.setIsUnknown();
          omesg.overlap_type    = AS_DOVETAIL;
          omesg.quality         = 0.0;
          omesg.min_offset      = 0;
          omesg.max_offset      = 0;
          omesg.polymorph_ct    = 0;
          omesg.alignment_trace = NULL;
#ifdef AS_MSG_USE_OVL_DELTA
          omesg.alignment_delta = NULL;
#endif

          //  This overlap is off of the 5' end of this fragment.
          if (bestedge5->frag3p() == false)
            omesg.orientation.setIsOuttie();
          if (bestedge5->frag3p() == true)
            omesg.orientation.setIsAnti();

          pmesg.t = MESG_OVL;
          pmesg.m = &omesg;

          WriteProtoMesg_AS(file, &pmesg);
        }
      }

      if (bestedge3) {
        bestident3 = bestedge3->fragId();

        if ((bestident3 > 0) && (utg->fragIn(bestident3) != utg->id())) {
          omesg.aifrag          = frg->ident;
          omesg.bifrag          = bestident3;
          omesg.ahg             = bestedge3->ahang();
          omesg.bhg             = bestedge3->bhang();
          omesg.orientation.setIsUnknown();
          omesg.overlap_type    = AS_DOVETAIL;
          omesg.quality         = 0.0;
          omesg.min_offset      = 0;
          omesg.max_offset      = 0;
          omesg.polymorph_ct    = 0;
          omesg.alignment_trace = NULL;
#ifdef AS_MSG_USE_OVL_DELTA
          omesg.alignment_delta = NULL;
#endif

          //  This overlap is off of the 3' end of this fragment.
          if (bestedge3->frag3p() == false)
            omesg.orientation.setIsNormal();
          if (bestedge3->frag3p() == true)
            omesg.orientation.setIsInnie();

          pmesg.t = MESG_OVL;
          pmesg.m = &omesg;

          WriteProtoMesg_AS(file, &pmesg);
        }
      }
    }
  }
#endif

  fclose(file);
}
//  After splitting and ejecting some contains, check for discontinuous unitigs.
//
//  A unitig is "discontinuous" if consecutive reads fail to overlap by at
//  least minOverlap bases in the layout.  Each maximal contiguous run of
//  reads is split out into its own new unitig; a lone contained unmated read
//  is instead moved back into its container's unitig.
//
void
splitDiscontinuousUnitigs(UnitigVector &unitigs, uint32 minOverlap) {

  writeLog("==> SPLIT DISCONTINUOUS\n");

  uint32                numTested = 0;     //  Unitigs examined.
  uint32                numSplit = 0;      //  Unitigs found discontinuous.
  uint32                numCreated = 0;    //  New unitigs created from pieces.

  uint32                splitFragsLen = 0; //  Reads accumulated for the current piece.
  uint32                splitFragsMax = 0; //  Capacity of splitFrags (largest unitig).
  ufNode               *splitFrags = NULL;

  //  Pass 1: normalize offsets so every unitig starts at position zero, and
  //  find the largest unitig so the scratch array can be sized.
  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig  *tig = unitigs[ti];

    if ((tig == NULL) || (tig->ufpath.size() < 2))
      continue;

    //  Unitig must be sorted.  Someone upstream is screwing this up.
    tig->sort();

    //  We'll want to build an array of new fragments to split out.  This can be up
    //  to the size of the largest unitig.
    splitFragsMax = MAX(splitFragsMax, tig->ufpath.size());

    //  Check that the unitig starts at position zero.  Not critical for the next loop, but
    //  needs to be done sometime.
    int32   minPos = MIN(tig->ufpath[0].position.bgn, tig->ufpath[0].position.end);

    if (minPos == 0)
      continue;

    writeLog("splitDiscontinuous()-- tig "F_U32" offset messed up; reset by "F_S32".\n", tig->id(), minPos);

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode  *frg = &tig->ufpath[fi];

      frg->position.bgn -= minPos;
      frg->position.end -= minPos;
    }
  }

  splitFrags = new ufNode [splitFragsMax];

  //  Now, finally, we can check for gaps in unitigs.

  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig  *tig = unitigs[ti];

    if ((tig == NULL) || (tig->ufpath.size() < 2))
      continue;

    //  We don't expect many unitigs to be broken, so we'll do a first quick pass to just
    //  test if it is.

    int32  maxEnd   = MAX(tig->ufpath[0].position.bgn, tig->ufpath[0].position.end);
    bool   isBroken = false;

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode  *frg = &tig->ufpath[fi];

      int32    bgn = MIN(frg->position.bgn, frg->position.end);
      int32    end = MAX(frg->position.bgn, frg->position.end);

      //  Read starts past the thick-overlap window: there's a gap here.
      if (bgn > maxEnd - minOverlap) {
        isBroken = true;
        break;
      }

      maxEnd = MAX(maxEnd, end);
    }

    numTested++;

    if (isBroken == false)
      continue;

    numSplit++;

    //  Dang, busted unitig.  Fix it up.

    splitFragsLen = 0;
    maxEnd        = 0;

    if (logFileFlagSet(LOG_MATE_SPLIT_DISCONTINUOUS))
      writeLog("splitDiscontinuous()-- discontinuous tig "F_U32" with "F_SIZE_T" fragments broken into:\n",
               tig->id(), tig->ufpath.size());

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode  *frg = &tig->ufpath[fi];

      int32    bgn = MIN(frg->position.bgn, frg->position.end);
      int32    end = MAX(frg->position.bgn, frg->position.end);

      //  Good thick overlap exists to this fragment, save it.

      if (bgn <= maxEnd - minOverlap) {
        assert(splitFragsLen < splitFragsMax);
        splitFrags[splitFragsLen++] = *frg;
        maxEnd = MAX(maxEnd, end);
        continue;
      }

      //  No thick overlap found.  We need to break right here before the current fragment.

      //  If there is exactly one fragment, and it's contained, and it's not mated, move it to the
      //  container.  (This has a small positive benefit over just making every read a singleton).
      //
      if ((splitFragsLen == 1) &&
          (FI->mateIID(splitFrags[0].ident) == 0) &&
          (splitFrags[0].contained != 0)) {
        Unitig  *dangler = unitigs[tig->fragIn(splitFrags[0].contained)];

        //  If the parent isn't in a unitig, we must have shattered the repeat unitig it was in.
        //  Do the same here.

        if (dangler == NULL) {
          if (logFileFlagSet(LOG_MATE_SPLIT_DISCONTINUOUS))
            writeLog("splitDiscontinuous()-- singleton frag "F_U32" shattered.\n",
                     splitFrags[0].ident);
          Unitig::removeFrag(splitFrags[0].ident);

        } else {
          assert(dangler->id() == tig->fragIn(splitFrags[0].contained));

          if (logFileFlagSet(LOG_MATE_SPLIT_DISCONTINUOUS))
            writeLog("splitDiscontinuous()-- old tig "F_U32" with "F_SIZE_T" fragments (contained frag "F_U32" moved here).\n",
                     dangler->id(), dangler->ufpath.size() + 1, splitFrags[0].ident);

          BestContainment  *bestcont = OG->getBestContainer(splitFrags[0].ident);

          assert(bestcont->isContained == true);

          dangler->addContainedFrag(splitFrags[0].ident, bestcont, false);
          dangler->bubbleSortLastFrag();

          assert(dangler->id() == Unitig::fragIn(splitFrags[0].ident));
        }
      }

      //  Otherwise, make an entirely new unitig for these fragments.
      else {
        numCreated++;
        makeNewUnitig(unitigs, splitFragsLen, splitFrags);
        //  makeNewUnitig() can grow the vector and invalidate our pointer; refresh it.
        tig = unitigs[ti];
      }

      //  Done with the split, save the current fragment.  This resets everything.

      splitFragsLen = 0;
      splitFrags[splitFragsLen++] = *frg;

      maxEnd = end;
    }

    //  If we did any splitting, then the length of the frags in splitFrags will be less than the length
    //  of the path in the current unitig.  Make a final new unitig for the remaining fragments.
    //
    if (splitFragsLen != tig->ufpath.size()) {
      numCreated++;
      makeNewUnitig(unitigs, splitFragsLen, splitFrags);

      delete unitigs[ti];
      unitigs[ti] = NULL;
    }
  }

  writeLog("splitDiscontinuous()-- Tested "F_U32" unitigs, split "F_U32" into "F_U32" new unitigs.\n",
           numTested, numSplit, numCreated);

  delete [] splitFrags;
}
//  Build per-library insert-size statistics from mate pairs placed in the
//  same unitig.  Raw distances are gathered by accumulateLibraryStats(),
//  then mean/stddev are recomputed after discarding outliers beyond five
//  (estimated) standard deviations of the median.
//
InsertSizes::InsertSizes(UnitigVector &unitigs) {

  _numLibs = FI->numLibraries();

  //  Per-library distance samples, plus the derived statistics.  Index 0 is a
  //  placeholder; libraries are 1-based.
  _dist    = new int32 * [_numLibs + 1];
  _distLen = new int32   [_numLibs + 1];
  _distMax = new int32   [_numLibs + 1];
  _mean    = new int32   [_numLibs + 1];
  _stddev  = new int32   [_numLibs + 1];
  _samples = new int32   [_numLibs + 1];

  _distLen[0] = 0;
  _distMax[0] = 0;
  _dist[0]    = NULL;

  //  Seed each library with the (externally supplied) mean/stddev estimates.
  for (uint32 i=1; i<_numLibs + 1; i++) {
    _distLen[i] = 0;
    _distMax[i] = 1048576;
    _dist[i]    = new int32 [_distMax[i]];
    _mean[i]    = (int32)FI->mean(i);
    _stddev[i]  = (int32)FI->stddev(i);
    _samples[i] = FI->numMatesInLib(i);
  }

  //  Collect distance samples from every non-trivial unitig.
  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig        *utg = unitigs[ti];

    if ((utg == NULL) || (utg->ufpath.size() < 2))
      continue;

    accumulateLibraryStats(utg);
  }

  for (uint32 i=1; i<_numLibs + 1; i++)
    sort(_dist[i], _dist[i] + _distLen[i]);

  //  Disregard outliers (those outside 5 (estimated) stddevs) and recompute global stddev
  //
  //  NOTE(review): if a library collected no samples (_distLen[i] == 0) the
  //  median/third indices all read _dist[i][0], which is uninitialized --
  //  presumably every library has mates; verify against callers.
  for (uint32 i=1; i<_numLibs + 1; i++) {
    int32   median   = _dist[i][_distLen[i] * 1 / 2];
    int32   oneThird = _dist[i][_distLen[i] * 1 / 3];
    int32   twoThird = _dist[i][_distLen[i] * 2 / 3];

    //  Robust spread estimate from the inter-third distances.
    int32   aproxStd = MAX(median - oneThird, twoThird - median);

    int32   biggest  = median + aproxStd * 5;
    int32   smallest = median - aproxStd * 5;

    uint32  numPairs   = 0;
    double  sum_Dists  = 0.0;
    double  sumSquares = 0.0;

    //  First pass inside the window: count and sum for the mean.
    for (int32 d=0; d<_distLen[i]; d++)
      if ((smallest <= _dist[i][d]) &&
          (_dist[i][d] <= biggest)) {
        numPairs++;
        sum_Dists += _dist[i][d];
      }

    _samples[i] = numPairs;
    _mean[i]    = (numPairs > 0) ? sum_Dists / numPairs : 0;

    //  Second pass: accumulate squared deviations about the (integer) mean.
    for (int32 d=0; d<_distLen[i]; d++)
      if ((smallest <= _dist[i][d]) &&
          (_dist[i][d] <= biggest))
        sumSquares += ((double)(_dist[i][d] - _mean[i]) *
                       (double)(_dist[i][d] - _mean[i]));

    //  Sample (n-1) standard deviation; zero when fewer than two samples.
    _stddev[i] = (numPairs > 1) ? sqrt(sumSquares / (numPairs - 1)) : 0.0;

    writeLog("InsertSizes()-- lib %d mean %d stddev %d samples %d\n",
             i, _mean[i], _stddev[i], _samples[i]);
  }

  //  The raw sample arrays are only needed during construction; release them.
  //  (delete [] NULL for _dist[0] is well-defined.)
  for (uint32 i=0; i<_numLibs + 1; i++)
    delete [] _dist[i];

  delete [] _dist;     _dist    = NULL;
  delete [] _distLen;  _distLen = NULL;
  delete [] _distMax;  _distMax = NULL;
}
//  Try to place every still-unplaced read using ALL of its overlaps (not just
//  best edges).  Placement candidates are scored in parallel; the placement
//  with the lowest error rate that covers (nearly) the whole read and is
//  consistent with the target tig wins.  Reads that cannot be placed are
//  dropped.
//
void
placeUnplacedUsingAllOverlaps(UnitigVector &unitigs, const char *prefix) {
  uint32  fiLimit    = FI->numFragments();
  uint32  numThreads = omp_get_max_threads();
  //  Chunk size for the dynamic OpenMP schedule below.
  uint32  blockSize  = (fiLimit < 100 * numThreads) ? numThreads : fiLimit / 99;

  //  Per-read results, filled in by the parallel loop and committed serially
  //  afterwards.  placedTig[fid] == 0 means "no placement found".
  uint32       *placedTig = new uint32      [FI->numFragments() + 1];
  SeqInterval  *placedPos = new SeqInterval [FI->numFragments() + 1];

  memset(placedTig, 0, sizeof(uint32)      * (FI->numFragments() + 1));
  memset(placedPos, 0, sizeof(SeqInterval) * (FI->numFragments() + 1));

  //  Just some logging.  Count the number of reads we try to place.

  uint32   nToPlaceContained = 0;
  uint32   nToPlace          = 0;
  uint32   nPlacedContained  = 0;
  uint32   nPlaced           = 0;
  uint32   nFailedContained  = 0;
  uint32   nFailed           = 0;

  for (uint32 fid=1; fid<FI->numFragments()+1; fid++)
    if (Unitig::fragIn(fid) == 0)
      if (OG->isContained(fid))
        nToPlaceContained++;
      else
        nToPlace++;

  writeLog("placeContains()-- placing %u contained and %u unplaced reads, with %d threads.\n",
           nToPlaceContained, nToPlace, numThreads);

  //  Do the placing!

#pragma omp parallel for schedule(dynamic, blockSize)
  for (uint32 fid=1; fid<FI->numFragments()+1; fid++) {
    bool  enableLog = true;

    if (Unitig::fragIn(fid) > 0)
      continue;

    //  Place the read.

    vector<overlapPlacement>   placements;

    placeFragUsingOverlaps(unitigs, AS_MAX_ERATE, NULL, fid, placements);

    //  Search the placements for the highest expected identity placement using all overlaps in the unitig.

    uint32   b = UINT32_MAX;

    for (uint32 i=0; i<placements.size(); i++) {
      Unitig *tig = unitigs[placements[i].tigID];

      if (placements[i].fCoverage < 0.99)    //  Ignore partially placed reads.
        continue;

      if (tig->ufpath.size() == 1)           //  Ignore placements in singletons.
        continue;

      uint32  bgn   = (placements[i].position.bgn < placements[i].position.end) ? placements[i].position.bgn : placements[i].position.end;
      uint32  end   = (placements[i].position.bgn < placements[i].position.end) ? placements[i].position.end : placements[i].position.bgn;

      double  erate = placements[i].errors / placements[i].aligned;

      //  Reject placements that disagree with too many of the target tig's reads.
      if (tig->overlapConsistentWithTig(5.0, bgn, end, erate) < 0.5) {
        if ((enableLog == true) && (logFileFlagSet(LOG_PLACE_UNPLACED)))
          writeLog("frag %8u tested tig %6u (%6u reads) at %8u-%8u (cov %7.5f erate %6.4f) - HIGH ERROR\n",
                   fid, placements[i].tigID, tig->ufpath.size(), placements[i].position.bgn, placements[i].position.end, placements[i].fCoverage, erate);
        continue;
      }

      if ((enableLog == true) && (logFileFlagSet(LOG_PLACE_UNPLACED)))
        writeLog("frag %8u tested tig %6u (%6u reads) at %8u-%8u (cov %7.5f erate %6.4f)\n",
                 fid, placements[i].tigID, tig->ufpath.size(), placements[i].position.bgn, placements[i].position.end, placements[i].fCoverage, erate);

      //  Keep the placement with the lowest error rate.
      if ((b == UINT32_MAX) ||
          (placements[i].errors / placements[i].aligned < placements[b].errors / placements[b].aligned))
        b = i;
    }

    //  If we didn't find a best, b will be invalid; set positions for adding to a new tig.
    //  If we did, save both the position it was placed at, and the tigID it was placed in.

    if (b == UINT32_MAX) {
      if ((enableLog == true) && (logFileFlagSet(LOG_PLACE_UNPLACED)))
        writeLog("frag %8u remains unplaced\n", fid);
      placedPos[fid].bgn = 0;
      placedPos[fid].end = FI->fragmentLength(fid);
    }

    else {
      if ((enableLog == true) && (logFileFlagSet(LOG_PLACE_UNPLACED)))
        writeLog("frag %8u placed tig %6u (%6u reads) at %8u-%8u (cov %7.5f erate %6.4f)\n",
                 fid, placements[b].tigID, unitigs[placements[b].tigID]->ufpath.size(),
                 placements[b].position.bgn, placements[b].position.end,
                 placements[b].fCoverage,
                 placements[b].errors / placements[b].aligned);
      placedTig[fid] = placements[b].tigID;
      placedPos[fid] = placements[b].position;
    }
  }

  //  All reads placed, now just dump them in their correct tigs.

  for (uint32 fid=1; fid<FI->numFragments()+1; fid++) {
    Unitig  *tig = NULL;
    ufNode   frg;

    if (Unitig::fragIn(fid) > 0)
      continue;

    //  If not placed, dump it in a new unitig.  Well, not anymore.  These reads were not placed in
    //  any tig initially, were not allowed to seed a tig, and now, could find no place to go.
    //  They're garbage.  Plus, it screws up the logging above because we don't know the new tig ID
    //  until now.

    if (placedTig[fid] == 0) {
      if (OG->isContained(fid))
        nFailedContained++;
      else
        nFailed++;

      //tig = unitigs.newUnitig(false);
    }

    //  Otherwise, it was placed somewhere, grab the tig.

    else {
      if (OG->isContained(fid))
        nPlacedContained++;
      else
        nPlaced++;

      tig = unitigs[placedTig[fid]];
    }

    //  Regardless, add it to the tig.  Logging for this is above.

    if (tig) {
      frg.ident             = fid;
      frg.contained         = 0;
      frg.parent            = 0;
      frg.ahang             = 0;
      frg.bhang             = 0;
      frg.position          = placedPos[fid];

      tig->addFrag(frg, 0, false);
    }
  }

  //  Cleanup.

  delete [] placedPos;
  delete [] placedTig;

  writeLog("placeContains()-- Placed %u contained reads and %u unplaced reads.\n",
           nPlacedContained, nPlaced);
  writeLog("placeContains()-- Failed to place %u contained reads (too high error suspected) and %u unplaced reads (lack of overlaps suspected).\n",
           nFailedContained, nFailed);

  //  But wait!  All the tigs need to be sorted.  Well, not really _all_, but the hard ones to sort
  //  are big, and those quite likely had reads added to them, so it's really not worth the effort
  //  of tracking which ones need sorting, since the ones that don't need it are trivial to sort.

  for (uint32 ti=1; ti<unitigs.size(); ti++) {
    Unitig *utg = unitigs[ti];

    if (utg)
      utg->sort();
  }
}
//  Verify read-to-unitig accounting: count unitigs and reads, detect reads in
//  unitig zero or missing from every unitig, and log a log2 length histogram.
//  Asserts that no live read has been lost.
//
//  NOTE(review): 'noUnitig' is not declared in this function -- presumably a
//  file-scope sentinel (0xffffffff elsewhere in this code); confirm.
//
void
checkUnitigMembership(UnitigVector &unitigs) {
  int nutg = 0;   //  Number of (non-NULL) unitigs seen.
  int nfrg = 0;   //  Number of reads seen across all unitigs.

  writeLog("checkUnitigMembership()-- numfrags=%d\n", FI->numFragments());

  uint32 *inUnitig   = new uint32 [FI->numFragments()+1];
  uint32  logSizeMax = 0;
  uint32  logSize[64] = {0};   //  Histogram of unitig lengths, bucketed by log2.

  //  Mark every read as unplaced.
  for (uint32 i=0; i<FI->numFragments()+1; i++)
    inUnitig[i] = noUnitig;

  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig  *tig = unitigs[ti];
    int32    len = 0;

    if (tig) {
      nutg++;

      for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
        ufNode  *frg = &tig->ufpath[fi];
        nfrg++;

        if (frg->ident > FI->numFragments())
          writeLog("HUH?  ident=%d numfrags=%d\n", frg->ident, FI->numFragments());

        inUnitig[frg->ident] = ti;

        //  Track the largest coordinate as the unitig length.
        len = MAX(len, frg->position.bgn);
        len = MAX(len, frg->position.end);
      }

      //  log10(len)/log10(2) == log2(len); bucket the length histogram.
      //  NOTE(review): len == 0 would make log10() return -inf and the cast
      //  undefined -- presumably positions guarantee len > 0; confirm.
      uint32  ls = (uint32)(log10(len) / log10(2));
      logSizeMax = (logSizeMax < ls) ? ls : logSizeMax;
      logSize[ls]++;
    }
  }

  int lost = 0;    //  Live reads in no unitig.
  int found = 0;   //  Live reads properly placed.

  for (uint32 i=0; i<FI->numFragments()+1; i++) {
    if (FI->fragmentLength(i) > 0) {
      if (inUnitig[i] == 0) {
        writeLog("ERROR frag %d is in unitig 0!\n", i);
      } else if (inUnitig[i] != noUnitig) {
        found++;
      } else {
        writeLog("ERROR frag %d disappeared!\n", i);
        lost++;
      }
    }
  }

  writeLog("checkUnitigMembership()-- nutg=%d nfrg=%d lost=%d found=%d\n",
           nutg, nfrg, lost, found);

  writeLog("checkUnitigMembership()-- log2 length histogram:\n");
  for (uint32 i=5; i<=logSizeMax; i++)
    writeLog("checkUnitigMembership()-- %2u (%9u-%9u) %u\n",
             i, (uint32)1 << i, (uint32)1 << (i+1), logSize[i]);

  assert(lost == 0);

  delete [] inUnitig;
}
// For every unitig, report the best overlaps contained in the // unitig, and all overlaps contained in the unitig. // // Wow, this is ancient. // void writeOverlapsUsed(UnitigVector &unitigs, char *prefix) { char N[FILENAME_MAX]; sprintf(N, "%s.unused.best.edges", prefix); FILE *F = fopen(N, "w"); for (uint32 ti=0; ti<unitigs.size(); ti++) { Unitig *tig = unitigs[ti]; Unitig *ovl = NULL; char tyt = 'C'; if (tig == NULL) continue; if (tig->_isUnassembled) tyt = 'U'; if (tig->_isBubble) tyt = 'B'; if (tig->_isRepeat) tyt = 'R'; if (tig->_isCircular) tyt = 'O'; for (uint32 fi=0; fi<tig->ufpath.size(); fi++) { ufNode *frg = &tig->ufpath[fi]; ufNode *oth = NULL; // Report the unused best edge BestEdgeOverlap *be5 = OG->getBestEdgeOverlap(frg->ident, false); uint32 rd5 = (be5 == NULL) ? 0 : be5->fragId(); Unitig *tg5 = (be5 == NULL) ? NULL : unitigs[Unitig::fragIn(rd5)]; char ty5 = 'C'; if ((tg5 != NULL) && (tg5->tigID() != tig->tigID())) { uint32 ord = Unitig::pathPosition(rd5); ufNode *oth = &tg5->ufpath[ord]; if (tig->_isUnassembled) ty5 = 'U'; if (tig->_isBubble) ty5 = 'B'; if (tig->_isRepeat) ty5 = 'R'; if (tig->_isCircular) ty5 = 'O'; fprintf(F, "tig %7u %c read %8u at %9u %-9u %c' -- %8d %-8d -- tig %7u %c read %8u at %9u %-9u %c'\n", tig->tigID(), tyt, frg->ident, frg->position.bgn, frg->position.end, '5', be5->ahang(), be5->bhang(), tg5->tigID(), ty5, oth->ident, oth->position.bgn, oth->position.end, (be5->frag3p() == false) ? '5' : '3'); } BestEdgeOverlap *be3 = OG->getBestEdgeOverlap(frg->ident, true); uint32 rd3 = (be3 == NULL) ? 0 : be3->fragId(); Unitig *tg3 = (be3 == NULL) ? 
NULL : unitigs[Unitig::fragIn(rd3)]; char ty3 = 'C'; if ((tg3 != NULL) && (tg3->tigID() != tig->tigID())) { uint32 ord = Unitig::pathPosition(rd3); ufNode *oth = &tg3->ufpath[ord]; if (tig->_isUnassembled) ty3 = 'U'; if (tig->_isBubble) ty3 = 'B'; if (tig->_isRepeat) ty3 = 'R'; if (tig->_isCircular) ty3 = 'O'; fprintf(F, "tig %7u %c read %8u at %9u %-9u %c' -- %8d %-8d -- tig %7u %c read %8u at %9u %-9u %c'\n", tig->tigID(), tyt, frg->ident, frg->position.bgn, frg->position.end, '3', be3->ahang(), be3->bhang(), tg3->tigID(), ty3, oth->ident, oth->position.bgn, oth->position.end, (be3->frag3p() == false) ? '5' : '3'); } } } fclose(F); }
void writeUnitigsToStore(UnitigVector &unitigs, char *fileprefix, char *tigStorePath, uint32 frg_count_target, bool isFinal) { uint32 utg_count = 0; uint32 frg_count = 0; uint32 prt_count = 1; char filename[FILENAME_MAX] = {0}; uint32 *partmap = new uint32 [unitigs.size()]; // This code closely follows that in AS_CGB_unitigger.c::output_the_chunks() if (isFinal) checkUnitigMembership(unitigs); // Open up the initial output file sprintf(filename, "%s.iidmap", fileprefix); FILE *iidm = fopen(filename, "w"); assert(NULL != iidm); sprintf(filename, "%s.partitioning", fileprefix); FILE *part = fopen(filename, "w"); assert(NULL != part); sprintf(filename, "%s.partitioningInfo", fileprefix); FILE *pari = fopen(filename, "w"); assert(NULL != pari); // Step through all the unitigs once to build the partition mapping and IID mapping. tgStore *tigStore = new tgStore(tigStorePath); tgTig *tig = new tgTig; for (uint32 tigID=0, ti=0; ti<unitigs.size(); ti++) { Unitig *utg = unitigs[ti]; if ((utg == NULL) || (utg->getNumFrags() == 0)) continue; assert(utg->getLength() > 0); // Convert the bogart tig to a tgTig and save to the store. unitigToTig(tig, (isFinal) ? tigID : ti, utg); tigID++; tigStore->insertTig(tig, false); // Increment the partition if the current one is too large. if ((frg_count + utg->getNumFrags() >= frg_count_target) && (frg_count > 0)) { fprintf(pari, "Partition %d has %d unitigs and %d fragments.\n", prt_count, utg_count, frg_count); prt_count++; utg_count = 0; frg_count = 0; } // Note that the tig is included in this partition. utg_count += 1; frg_count += utg->getNumFrags(); // Map the tig to a partition, and log both the tig-to-partition map and the partition-to-read map. 
fprintf(iidm, "bogart "F_U32" -> tig "F_U32" (in partition "F_U32" with "F_U32" frags)\n", utg->id(), utg->tigID(), prt_count, utg->getNumFrags()); for (uint32 fragIdx=0; fragIdx<utg->getNumFrags(); fragIdx++) fprintf(part, "%d\t%d\n", prt_count, utg->ufpath[fragIdx].ident); } fprintf(pari, "Partition %d has %d unitigs and %d fragments.\n", // Don't forget to log the last partition! prt_count, utg_count, frg_count); fclose(pari); fclose(part); fclose(iidm); delete tig; delete tigStore; }
//  Build the list of intersection points: places where a read's best edge
//  points to a different unitig (or to a non-overlapping read in the SAME
//  unitig -- a self intersection), and is not "confirmed" by a reciprocal
//  overlapping placement.  The finished list is sorted by the intersected
//  fragment id and indexed by isectsMap/isectsNum.
//
intersectionList::intersectionList(UnitigVector &unitigs) {

  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig             *tig = unitigs[ti];

    if (tig == NULL)
      continue;

    //  One evidence slot per read in this tig; flags accumulate as later
    //  reads confirm earlier ones.
    intersectionEvidence *evidence = new intersectionEvidence [tig->ufpath.size()];

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode  *frg = &tig->ufpath[fi];

      if (OG->isContained(frg->ident))
        continue;

      //  For my best overlap, the ID of the unitig that the overlapping fragment is in.

      evidence[fi].edge5 = *OG->getBestEdgeOverlap(frg->ident, false);
      evidence[fi].edge3 = *OG->getBestEdgeOverlap(frg->ident, true);

      evidence[fi].frag5tig = tig->fragIn(evidence[fi].edge5.fragId());
      evidence[fi].frag3tig = tig->fragIn(evidence[fi].edge3.fragId());

      //  Do NOT initialize these!  An earlier fragment could have already confirmed an end.
      //  Properly, only the 5' end of a forward fragment (or 3' end of a reverse fragment) can be
      //  confirmed already (otherwise the tig is nonsense), but we don't yet check that.
      //
      //evidence[fi].frag5confirmed = false;
      //evidence[fi].frag3confirmed = false;

      //  But, because the path could be promiscuous, not every overlap to a different tig is bad.
      //
      //  If my best overlap is to a different tig, but there is an overlapping fragment (in the
      //  unitig placement) with a best edge to me, I'm still good.  The BOG build this unitig using
      //  the edge from the other fragment to me.
      //
      //  If the fragments do not overlap in the layout (yet the best edge still exists) that is a
      //  self-intersection.
      //
      //  The two blocks are identical, except for 'edge3' and 'edge5'.

      if (evidence[fi].frag5tig == tig->id()) {
        //  NOTE: this inner 'ti' deliberately shadows the outer loop index;
        //  it is the path position of the TARGET fragment in this same tig.
        uint32   ti  = tig->pathPosition(evidence[fi].edge5.fragId());
        ufNode  *trg = &tig->ufpath[ti];

        uint32   minf = (frg->position.bgn < frg->position.end) ? frg->position.bgn : frg->position.end;
        uint32   maxf = (frg->position.bgn < frg->position.end) ? frg->position.end : frg->position.bgn;

        uint32   mint = (trg->position.bgn < trg->position.end) ? trg->position.bgn : trg->position.end;
        uint32   maxt = (trg->position.bgn < trg->position.end) ? trg->position.end : trg->position.bgn;

        //  If they overlap, mark (the target's end) as confirmed, else
        //  remember a self intersection.

        if (((minf < mint) && (mint < maxf)) ||  //  t begins inside f
            ((mint < minf) && (minf < maxt))) {  //  f begins inside t
          if (evidence[fi].edge5.frag3p())
            evidence[ti].frag3confirmed = true;
          else
            evidence[ti].frag5confirmed = true;

        } else {
          evidence[fi].frag5self = true;

          //  Not the correct place to report this.  Some of these get confirmed by later fragments.
          //writeLog("BUG1 F: %d,%d T %d,%d\n", minf, maxf, mint, maxt);
          //writeLog("INTERSECT from unitig %d frag %d end %d TO unitig %d frag %d end %d (SELF)\n",
          //        tig->id(), frg->ident, 5, evidence[fi].frag5tig, evidence[fi].edge5.fragId(), evidence[fi].edge5.frag3p() ? 3 : 5);
        }
      }

      if (evidence[fi].frag3tig == tig->id()) {
        //  Same logic for the 3' edge; inner 'ti' again shadows on purpose.
        uint32   ti  = tig->pathPosition(evidence[fi].edge3.fragId());
        ufNode  *trg = &tig->ufpath[ti];

        uint32   minf = (frg->position.bgn < frg->position.end) ? frg->position.bgn : frg->position.end;
        uint32   maxf = (frg->position.bgn < frg->position.end) ? frg->position.end : frg->position.bgn;

        uint32   mint = (trg->position.bgn < trg->position.end) ? trg->position.bgn : trg->position.end;
        uint32   maxt = (trg->position.bgn < trg->position.end) ? trg->position.end : trg->position.bgn;

        if (((minf < mint) && (mint < maxf)) ||  //  t begins inside f
            ((mint < minf) && (minf < maxt))) {  //  f begins inside t
          if (evidence[fi].edge3.frag3p())
            evidence[ti].frag3confirmed = true;
          else
            evidence[ti].frag5confirmed = true;

        } else {
          evidence[fi].frag3self = true;

          //  Not the correct place to report this.  Some of these get confirmed by later fragments.
          //writeLog("BUG2 F: %d,%d T %d,%d\n", minf, maxf, mint, maxt);
          //writeLog("INTERSECT from unitig %d frag %d end %d TO unitig %d frag %d end %d (SELF)\n",
          //        tig->id(), frg->ident, 3, evidence[fi].frag3tig, evidence[fi].edge3.fragId(), evidence[fi].edge3.frag3p() ? 3 : 5);
        }
      }
    }

    //
    //  Build the list.
    //

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode  *frg = &tig->ufpath[fi];

      //  Unconfirmed edge to a different tig: a real intersection.
      if ((evidence[fi].frag5tig != 0) &&
          (evidence[fi].frag5tig != tig->id()) &&
          (evidence[fi].frag5confirmed == false))
        isects.push_back(intersectionPoint(evidence[fi].edge5, frg->ident, false, false));

      //  Unconfirmed, non-overlapping edge within the same tig: self intersection.
      if ((evidence[fi].frag5tig == tig->id()) &&
          (evidence[fi].frag5self == true) &&
          (evidence[fi].frag5confirmed == false))
        isects.push_back(intersectionPoint(evidence[fi].edge5, frg->ident, false, true));

      if ((evidence[fi].frag3tig != 0) &&
          (evidence[fi].frag3tig != tig->id()) &&
          (evidence[fi].frag3confirmed == false))
        isects.push_back(intersectionPoint(evidence[fi].edge3, frg->ident, true, false));

      if ((evidence[fi].frag3tig == tig->id()) &&
          (evidence[fi].frag3self == true) &&
          (evidence[fi].frag3confirmed == false))
        isects.push_back(intersectionPoint(evidence[fi].edge3, frg->ident, true, true));
    }

    delete [] evidence;
  }

  //  Sort the intersections by the ID of the intersected fragment, then build an index into the array.

  std::sort(isects.begin(), isects.end());

  //  Terminate the intersection list with a sentinal intersection.  This is CRITICAL
  //  to the way we iterate over intersections.

  isects.push_back(intersectionPoint(BestEdgeOverlap(), 0, true, true));

  //  Build a map from fragment id to the first intersection in the list.

  for (uint32 i=0; i<isects.size(); i++) {
    isectsNum[isects[i].isectFrg]++;

    if (isectsMap.find(isects[i].isectFrg) == isectsMap.end())
      isectsMap[isects[i].isectFrg] = i;
  }
}
void reportUnitigs(UnitigVector &unitigs, const char *prefix, const char *name, uint64 genomeSize) { // Generate n50. Assumes unitigs have been 'classified' already. vector<uint32> unassembledLength; vector<uint32> bubbleLength; vector<uint32> repeatLength; vector<uint32> circularLength; vector<uint32> contigLength; for (uint32 ti=0; ti<unitigs.size(); ti++) { Unitig *utg = unitigs[ti]; if (utg == NULL) continue; if (utg->_isUnassembled) { unassembledLength.push_back(utg->getLength()); } else if (utg->_isBubble) { bubbleLength.push_back(utg->getLength()); } else if (utg->_isRepeat) { repeatLength.push_back(utg->getLength()); } else if (utg->_isCircular) { circularLength.push_back(utg->getLength()); } else { contigLength.push_back(utg->getLength()); } } char N[FILENAME_MAX]; sprintf(N, "%s.sizes", getLogFilePrefix()); errno = 0; FILE *F = fopen(N, "w"); if (errno == 0) { reportN50(F, unassembledLength, "UNASSEMBLED", genomeSize); reportN50(F, bubbleLength, "BUBBLE", genomeSize); reportN50(F, repeatLength, "REPEAT", genomeSize); reportN50(F, circularLength, "CIRCULAR", genomeSize); reportN50(F, contigLength, "CONTIGS", genomeSize); fclose(F); } if (logFileFlagSet(LOG_INTERMEDIATE_UNITIGS) == 0) return; // Dump to an intermediate store. char tigStorePath[FILENAME_MAX]; sprintf(tigStorePath, "%s.tigStore", getLogFilePrefix()); fprintf(stderr, "Creating intermediate tigStore '%s'\n", tigStorePath); uint32 numFragsT = 0; uint32 numFragsP = 0; uint64 utgLen = 0; // Compute average frags per partition. for (uint32 ti=0; ti<unitigs.size(); ti++) { Unitig *utg = unitigs[ti]; if (utg == NULL) continue; numFragsT += utg->ufpath.size(); if (utg->ufpath.size() > 2) utgLen += utg->getLength(); } if (utgLen < 16 * 1024 * 1024) numFragsP = numFragsT / 7; else if (utgLen < 64 * 1024 * 1024) numFragsP = numFragsT / 63; else numFragsP = numFragsT / 127; // Dump the unitigs to an intermediate store. 
setParentAndHang(unitigs); writeUnitigsToStore(unitigs, tigStorePath, tigStorePath, numFragsP, false); }
//  Write all unitigs to a tigStore at 'tigStorePath', also emitting three
//  report files under 'fileprefix':
//    .iidmap           -- maps internal unitig id to output IUM id and partition
//    .partitioning     -- one line per fragment: partition and fragment id
//    .partitioningInfo -- per-partition unitig/fragment counts
//
//  'frg_count_target' caps (approximately) the number of fragments assigned
//  to a single partition.  If 'isFinal', unitigs are renumbered densely
//  (skipping empty slots) and membership is sanity checked first.
void writeUnitigsToStore(UnitigVector &unitigs, char *fileprefix, char *tigStorePath, uint32 frg_count_target, bool isFinal) {
  uint32 utg_count = 0;
  uint32 frg_count = 0;
  uint32 prt_count = 1;
  char filename[FILENAME_MAX] = {0};

  //  partmap[tigid] records which partition each (possibly renumbered) unitig
  //  landed in; initialized below to all-ones bits (i.e., "unassigned").
  uint32 *partmap = new uint32 [unitigs.size()];

  // This code closely follows that in AS_CGB_unitigger.c::output_the_chunks()

  if (isFinal)
    checkUnitigMembership(unitigs);

  // Open up the initial output file

  sprintf(filename, "%s.iidmap", fileprefix);
  FILE *iidm = fopen(filename, "w");
  assert(NULL != iidm);

  sprintf(filename, "%s.partitioning", fileprefix);
  FILE *part = fopen(filename, "w");
  assert(NULL != part);

  sprintf(filename, "%s.partitioningInfo", fileprefix);
  FILE *pari = fopen(filename, "w");
  assert(NULL != pari);

  // Step through all the unitigs once to build the partition mapping and IID mapping.

  memset(partmap, 0xff, sizeof(uint32) * unitigs.size());

  for (uint32 iumiid=0, ti=0; ti<unitigs.size(); ti++) {
    Unitig *utg = unitigs[ti];
    uint32 nf = (utg) ? utg->getNumFrags() : 0;

    //  Skip empty slots and empty unitigs.
    if ((utg == NULL) || (nf == 0))
      continue;

    assert(utg->getLength() > 0);
    assert(nf == utg->ufpath.size());

    //  If this unitig would push the current partition over the fragment
    //  target (and the partition isn't empty), close it out and start a new one.
    if ((frg_count + nf >= frg_count_target) && (frg_count > 0)) {
      fprintf(pari, "Partition %d has %d unitigs and %d fragments.\n", prt_count, utg_count, frg_count);
      prt_count++;
      utg_count = 0;
      frg_count = 0;
    }

    //  Final output renumbers unitigs densely; intermediate output keeps
    //  the original index.
    uint32 tigid = (isFinal) ? iumiid : ti;

    assert(tigid < unitigs.size());
    partmap[tigid] = prt_count;

    fprintf(iidm, "Unitig "F_U32" == IUM "F_U32" (in partition "F_U32" with "F_U32" frags)\n", utg->id(), (tigid), partmap[(tigid)], nf);

    for (uint32 fragIdx=0; fragIdx<nf; fragIdx++) {
      ufNode *f = &utg->ufpath[fragIdx];
      fprintf(part, "%d\t%d\n", prt_count, f->ident);
    }

    utg_count += 1;
    frg_count += nf;
    iumiid++;
  }

  //  Close out the last (possibly partial) partition.
  fprintf(pari, "Partition %d has %d unitigs and %d fragments.\n", prt_count, utg_count, frg_count);
  fclose(pari);
  fclose(part);
  fclose(iidm);

  // Step through all the unitigs a second time, converting each to a tgTig
  // and inserting it into the store.

  tgStore *tigStore = new tgStore(tigStorePath);
  tgTig *tig = new tgTig;

  for (uint32 iumiid=0, ti=0; ti<unitigs.size(); ti++) {
    Unitig *utg = unitigs[ti];
    uint32 nf = (utg) ? utg->getNumFrags() : 0;

    if ((utg == NULL) || (nf == 0))
      continue;

    unitigToTig(tig, (isFinal) ? iumiid : ti, utg);
    tigStore->insertTig(tig, false);
    iumiid++;
  }

  delete tig;
  delete tigStore;
  delete [] partmap;
}
// Decides if a unitig is unassembled. The other classifications (isBubble, isCircular, isRepeat) // are made when the type is processed (e.g., when bubbles are popped). // // A unitig is unassembled if: // 1) it has fewer than R reads (R=2) // 2) it is shorter than S bases (S=1000) // 3) a single read spans at least fraction F of the lenth (F=1.0) // 4) at least fraction F of the unitig is below read depth D (F=1.0, D=2) // void classifyUnitigsAsUnassembled(UnitigVector &unitigs, uint32 fewReadsNumber, uint32 tooShortLength, double spanFraction, double lowcovFraction, uint32 lowcovDepth) { uint32 nTooFew = 0; uint32 nShort = 0; uint32 nSingle = 0; uint32 nCoverage = 0; uint32 nContig = 0; uint64 bTooFew = 0; uint64 bShort = 0; uint64 bSingle = 0; uint64 bCoverage = 0; uint64 bContig = 0; char N[FILENAME_MAX]; sprintf(N, "%s.unassembled", getLogFilePrefix()); errno = 0; FILE *F = fopen(N, "w"); if (errno) F = NULL; for (uint32 ti=0; ti<unitigs.size(); ti++) { Unitig *utg = unitigs[ti]; if (utg == NULL) continue; utg->_isUnassembled = false; // Rule 1. Too few reads. if (utg->ufpath.size() < fewReadsNumber) { fprintf(F, "unitig "F_U32" unassembled - too few reads ("F_U64" < "F_U32")\n", ti, utg->ufpath.size(), fewReadsNumber); utg->_isUnassembled = true; nTooFew += 1; bTooFew += utg->getLength(); continue; } // Rule 2. Short. if (utg->getLength() < tooShortLength) { fprintf(F, "unitig "F_U32" unassembled - too short ("F_U32" < "F_U32")\n", ti, utg->getLength(), tooShortLength); utg->_isUnassembled = true; nShort += 1; bShort += utg->getLength(); continue; } // Rule 3. Single read spans large fraction of tig. 
for (uint32 oi=0; oi<utg->ufpath.size(); oi++) { ufNode *frg = &utg->ufpath[oi]; int frgbgn = MIN(frg->position.bgn, frg->position.end); int frgend = MAX(frg->position.bgn, frg->position.end); if (frgend - frgbgn > utg->getLength() * spanFraction) { fprintf(F, "unitig "F_U32" unassembled - single read spans unitig (read "F_U32" "F_U32"-"F_U32" spans fraction %f > %f\n", ti, frg->ident, frg->position.bgn, frg->position.end, (double)(frgend - frgbgn) / utg->getLength(), spanFraction); utg->_isUnassembled = true; nSingle += 1; bSingle += utg->getLength(); break; } } if (utg->_isUnassembled) continue; // Rule 4. Low coverage. intervalList<int32> IL; for (uint32 oi=0; oi<utg->ufpath.size(); oi++) { ufNode *frg = &utg->ufpath[oi]; int frgbgn = MIN(frg->position.bgn, frg->position.end); int frgend = MAX(frg->position.bgn, frg->position.end); IL.add(frgbgn, frgend - frgbgn); } intervalList<int32> ID(IL); uint32 basesLow = 0; uint32 basesHigh = 0; for (uint32 ii=0; ii<ID.numberOfIntervals(); ii++) if (ID.depth(ii) < lowcovDepth) basesLow += ID.hi(ii) - ID.lo(ii) + 1; else basesHigh += ID.hi(ii) - ID.lo(ii) + 1; double lowcov = (double)basesLow / (basesLow + basesHigh); if (lowcov >= lowcovFraction) { fprintf(F, "Unitig "F_U32" unassembled - low coverage (%.4f > %.4f at < "F_U32"x coverage)\n", ti, lowcov, lowcovFraction, lowcovDepth); utg->_isUnassembled = true; nCoverage += 1; bCoverage += utg->getLength(); continue; } // Otherwise, unitig is assembled! 
nContig += 1; bContig += utg->getLength(); } writeStatus("classifyAsUnassembled()-- %6u tigs %11lu bases -- too few reads\n", nTooFew, bTooFew); writeStatus("classifyAsUnassembled()-- %6u tigs %11lu bases -- too short\n", nShort, bShort); writeStatus("classifyAsUnassembled()-- %6u tigs %11lu bases -- single spanning read\n", nSingle, bSingle); writeStatus("classifyAsUnassembled()-- %6u tigs %11lu bases -- low coverage\n", nCoverage, bCoverage); writeStatus("classifyAsUnassembled()-- %6u tigs %11lu bases -- acceptable contigs\n", nContig, bContig); }
//  Scan every non-trivial unitig and decide if it could be a bubble sitting
//  on some larger unitig: a tig is a potential bubble if (roughly) every
//  non-contained read in it has an overlap to the same larger tig.  For each
//  candidate, the larger tig ids are appended to potentialBubbles[ti].
//
//  (The unused per-loop OpenMP block-size computations were removed; the
//  thread count is still reported.)
void findPotentialBubbles(UnitigVector &unitigs, BubTargetList &potentialBubbles) {
  uint32 tiLimit      = unitigs.size();
  uint32 tiNumThreads = omp_get_max_threads();

  writeStatus("\n");
  writeStatus("bubbleDetect()-- working on "F_U32" unitigs, with "F_U32" threads.\n", tiLimit, tiNumThreads);

  for (uint32 ti=0; ti<tiLimit; ti++) {
    Unitig *tig = unitigs[ti];

    if ((tig == NULL) ||               //  Not a tig, ignore it.
        (tig->ufpath.size() == 1))     //  Singleton, handled elsewhere.
      continue;

    uint32 nonContainedReads = 0;
    bool validBubble = true;

    //  tigOlapsTo[t] counts how many of our reads have at least one overlap
    //  to tig t.
    map<uint32,uint32> tigOlapsTo;

    uint32 fiLimit = tig->ufpath.size();

    for (uint32 fi=0; (validBubble == true) && (fi<fiLimit); fi++) {
      uint32 rid = tig->ufpath[fi].ident;

      if (OG->isContained(rid) == true)   //  Don't need to check contained reads.  If their container
        continue;                         //  passes the tests below, the contained read will too.

      nonContainedReads++;

      uint32 ovlLen = 0;
      BAToverlap *ovl = OC->getOverlaps(rid, AS_MAX_ERATE, ovlLen);

      set<uint32> readOlapsTo;

      for (uint32 oi=0; oi<ovlLen; oi++) {
        uint32 ovlTigID = Unitig::fragIn(ovl[oi].b_iid);
        Unitig *ovlTig = unitigs[ovlTigID];

        //  Skip this overlap if it is to an unplaced read, to a singleton tig, to ourself,
        //  or to a unitig that is shorter than us.  We can not pop this tig as a bubble
        //  in any of those cases.

        if ((ovlTigID == 0) ||
            (ovlTig == NULL) ||
            (ovlTig->ufpath.size() == 1) ||
            (ovlTig->id() == tig->id()) ||
            (ovlTig->getLength() < tig->getLength()))
          continue;

        //  Otherwise, remember that we had an overlap to ovlTig.

        //writeLog("tig %u read %u overlap to tig %u read %u\n",
        //         tig->id(), rid, ovlTigID, ovl[oi].b_iid);

        readOlapsTo.insert(ovlTigID);
      }

      //writeLog("tig %8u read %8u has %u olaps\n", tig->id(), rid, readOlapsTo.size());

      //  Transfer the per-read counts to the per-unitig counts: add one to the counter for each tig
      //  that we have overlaps to.

      for (set<uint32>::iterator it=readOlapsTo.begin(); it != readOlapsTo.end(); ++it)
        tigOlapsTo[*it]++;

      //  Decide if we're a valid potential bubble.  If tig id (in it->first) has overlaps to every
      //  read we've seen so far (nonContainedReads), we're still a valid bubble.
      //
      //  To _attempt_ to have differences in the bubble, we'll accept it if 3/4 of the reads
      //  have overlaps.

      validBubble = false;

      for (map<uint32,uint32>::iterator it=tigOlapsTo.begin(); it != tigOlapsTo.end(); ++it)
        if (it->second >= BUBBLE_READ_FRACTION * nonContainedReads)
          validBubble = true;

      //  If we've not seen that many reads, pretend it's a valid bubble.  It'll get screened out later.

      if (nonContainedReads < 16)
        validBubble = true;
    }

    //  If not validBubble, report.

#if 0
    if (validBubble == false) {
      writeLog("notValidBubble tig %8d expects %6u reads\n", tig->id(), nonContainedReads);
      for (map<uint32,uint32>::iterator it=tigOlapsTo.begin(); it != tigOlapsTo.end(); ++it)
        writeLog("  to tig %8u overlaps %6u\n", it->first, it->second);
    }
#endif

    //  If validBubble, then there is a tig that every dovetail read has at least one overlap to.
    //  Save those tigs in potentialBubbles.

    uint32 nTigs = 0;

    if (validBubble) {
      for (map<uint32,uint32>::iterator it=tigOlapsTo.begin(); it != tigOlapsTo.end(); ++it)
        if (it->second >= BUBBLE_READ_FRACTION * nonContainedReads)
          nTigs++;
    }

    //  ALWAYS log potential bubbles.

    if (nTigs > 0) {
      writeLog("\n");
      writeLog("potential bubble tig %8u length %9u nReads %7u to %3u tigs:\n",
               tig->id(), tig->getLength(), tig->ufpath.size(), nTigs);

      for (map<uint32,uint32>::iterator it=tigOlapsTo.begin(); it != tigOlapsTo.end(); ++it) {
        if (it->second >= BUBBLE_READ_FRACTION * nonContainedReads) {
          Unitig *dest = unitigs[it->first];

          writeLog(" tig %8u length %9u nReads %7u\n", dest->id(), dest->getLength(), dest->ufpath.size());

          potentialBubbles[ti].push_back(dest->id());
        }
      }
    }
  }

  flushLog();
}
void popBubbles(UnitigVector &unitigs, double deviationBubble) { BubTargetList potentialBubbles; findPotentialBubbles(unitigs, potentialBubbles); writeStatus("popBubbles()-- Found "F_SIZE_T" potential bubbles.\n", potentialBubbles.size()); //if (potentialBubbles.size() == 0) // return; writeLog("\n"); writeLog("Found "F_SIZE_T" potential bubbles.\n", potentialBubbles.size()); writeLog("\n"); vector<overlapPlacement> *placed = findBubbleReadPlacements(unitigs, potentialBubbles, deviationBubble); // We now have, in 'placed', a list of all the places that each read could be placed. Decide if there is a _single_ // place for each bubble to be popped. uint32 tiLimit = unitigs.size(); //uint32 tiNumThreads = omp_get_max_threads(); //uint32 tiBlockSize = (tiLimit < 100000 * tiNumThreads) ? tiNumThreads : tiLimit / 99999; // Clear flags. for (uint32 ti=0; ti<tiLimit; ti++) { if (unitigs[ti]) { unitigs[ti]->_isBubble = false; unitigs[ti]->_isRepeat = false; } } // In parallel, process the placements. for (uint32 ti=0; ti<tiLimit; ti++) { if (potentialBubbles.count(ti) == 0) // Not a potential bubble continue; // Scan the bubble, decide if there are _ANY_ read placements. Log appropriately. Unitig *bubble = unitigs[ti]; bool hasPlacements = false; for (uint32 fi=0; fi<bubble->ufpath.size(); fi++) { uint32 readID = bubble->ufpath[fi].ident; if (placed[readID].size() > 0) hasPlacements = true; } if (hasPlacements == false) writeLog("potential bubble %u had no valid placements (all were not contained in target tig)\n", ti); else writeLog("potential bubble %u\n", ti); // Split the placements into piles for each target and build an interval list for each target. // For each read in the tig, convert the vector of placements into interval lists, one list per target tig. 
map<uint32, intervalList<uint32> *> targetIntervals; for (uint32 fi=0; fi<bubble->ufpath.size(); fi++) { uint32 readID = bubble->ufpath[fi].ident; for (uint32 pp=0; pp<placed[readID].size(); pp++) { uint32 tid = placed[readID][pp].tigID; assert(placed[readID][pp].frgID > 0); uint32 bgn = (placed[readID][pp].position.bgn < placed[readID][pp].position.end) ? placed[readID][pp].position.bgn : placed[readID][pp].position.end; uint32 end = (placed[readID][pp].position.bgn < placed[readID][pp].position.end) ? placed[readID][pp].position.end : placed[readID][pp].position.bgn; if (targetIntervals[tid] == NULL) targetIntervals[tid] = new intervalList<uint32>; //writeLog("read %u -> tig %u intervals %u-%u\n", readID, tid, bgn, end); targetIntervals[tid]->add(bgn, end-bgn); } } vector<candidatePop *> targets; // Squish the intervals. Create new candidatePops for each interval that isn't too big or // small. Assign each overlapPlacements to the correct candidatePop. for (map<uint32, intervalList<uint32> *>::iterator it=targetIntervals.begin(); it != targetIntervals.end(); ++it) { uint32 targetID = it->first; intervalList<uint32> *IL = it->second; IL->merge(); // Discard intervals that are significantly too small or large. Save the ones that are // nicely sized. Logging here isn't terribly useful, it's just repeated (out of order) later // when we try to make sense of the read alignments. for (uint32 ii=0; ii<IL->numberOfIntervals(); ii++) { if ((IL->hi(ii) - IL->lo(ii) < 0.75 * bubble->getLength()) || // Too small! (1.25 * bubble->getLength() < IL->hi(ii) - IL->lo(ii))) { // Too big! 
writeLog("tig %8u length %9u -> target %8u piece %2u position %9u-%9u length %8u - size mismatch, discarded\n", bubble->id(), bubble->getLength(), targetID, ii, IL->lo(ii), IL->hi(ii), IL->hi(ii) - IL->lo(ii)); continue; } writeLog("tig %8u length %9u -> target %8u piece %2u position %9u-%9u length %8u\n", bubble->id(), bubble->getLength(), targetID, ii, IL->lo(ii), IL->hi(ii), IL->hi(ii) - IL->lo(ii)); targets.push_back(new candidatePop(bubble, unitigs[targetID], IL->lo(ii), IL->hi(ii))); } delete IL; } targetIntervals.clear(); // If no targets, nothing to do. if (targets.size() == 0) continue; // Run through the placements again, and assign them to the correct target. // // For each read: // For each acceptable placement: // For each target location: // If the placement is for this target, save it. for (uint32 fi=0; fi<bubble->ufpath.size(); fi++) { uint32 readID = bubble->ufpath[fi].ident; for (uint32 pp=0; pp<placed[readID].size(); pp++) { uint32 tid = placed[readID][pp].tigID; uint32 bgn = (placed[readID][pp].position.bgn < placed[readID][pp].position.end) ? placed[readID][pp].position.bgn : placed[readID][pp].position.end; uint32 end = (placed[readID][pp].position.bgn < placed[readID][pp].position.end) ? placed[readID][pp].position.end : placed[readID][pp].position.bgn; for (uint32 tt=0; tt<targets.size(); tt++) if ((targets[tt]->target->id() == tid) && (targets[tt]->bgn < end) && (bgn < targets[tt]->end)) targets[tt]->placed.push_back(placed[readID][pp]); } } // Count the number of targets that have all the reads (later: in the correct order, etc, etc). Remove those // that don't. uint32 nTargets = 0; set<uint32> tigReads; // Reads in the bubble tig. set<uint32> tgtReads; // Reads in the bubble that have a placement in the target. // Remove duplicate placements from each target. for (uint32 tt=0; tt<targets.size(); tt++) { candidatePop *t = targets[tt]; // Detect duplicates, keep the one with lower error. 
There are a lot of duplicate // placements, logging isn't terribly useful. for (uint32 aa=0; aa<t->placed.size(); aa++) { for (uint32 bb=0; bb<t->placed.size(); bb++) { if ((aa == bb) || (t->placed[aa].frgID != t->placed[bb].frgID) || (t->placed[aa].frgID == 0) || (t->placed[bb].frgID == 0)) continue; if (t->placed[aa].errors / t->placed[aa].aligned < t->placed[bb].errors / t->placed[bb].aligned) { #ifdef SHOW_MULTIPLE_PLACEMENTS writeLog("duplicate read alignment for tig %u read %u - better %u-%u %.4f - worse %u-%u %.4f\n", t->placed[aa].tigID, t->placed[aa].frgID, t->placed[aa].position.bgn, t->placed[aa].position.end, t->placed[aa].errors / t->placed[aa].aligned, t->placed[bb].position.bgn, t->placed[bb].position.end, t->placed[bb].errors / t->placed[bb].aligned); #endif t->placed[bb] = overlapPlacement(); } else { #ifdef SHOW_MULTIPLE_PLACEMENTS writeLog("duplicate read alignment for tig %u read %u - better %u-%u %.4f - worse %u-%u %.4f\n", t->placed[aa].tigID, t->placed[aa].frgID, t->placed[bb].position.bgn, t->placed[bb].position.end, t->placed[bb].errors / t->placed[bb].aligned, t->placed[aa].position.bgn, t->placed[aa].position.end, t->placed[aa].errors / t->placed[aa].aligned); #endif t->placed[aa] = overlapPlacement(); } } } // Get rid of any now-empty entries. for (uint32 aa=t->placed.size(); aa--; ) { if (t->placed[aa].frgID == 0) { t->placed[aa] = t->placed.back(); t->placed.pop_back(); } } } // Make a set of the reads in the bubble. We'll compare each target against this to decide if all reads are placed. for (uint32 fi=0; fi<bubble->ufpath.size(); fi++) tigReads.insert(bubble->ufpath[fi].ident); uint32 nOrphan = 0; // Full coverage; bubble can be popped. uint32 orphanTarget = 0; uint32 nBubble = 0; // Partial coverage, bubble cannot be popped. 
uint32 bubbleTarget = 0; for (uint32 tt=0; tt<targets.size(); tt++) { tgtReads.clear(); for (uint32 op=0; op<targets[tt]->placed.size(); op++) { if (logFileFlagSet(LOG_BUBBLE_DETAIL)) writeLog("tig %8u length %9u -> target %8u piece %2u position %9u-%9u length %8u - read %7u at %9u-%9u\n", bubble->id(), bubble->getLength(), targets[tt]->target->id(), tt, targets[tt]->bgn, targets[tt]->end, targets[tt]->end - targets[tt]->bgn, targets[tt]->placed[op].frgID, targets[tt]->placed[op].position.bgn, targets[tt]->placed[op].position.end); assert(targets[tt]->placed[op].frgID > 0); tgtReads.insert(targets[tt]->placed[op].frgID); } // Count the number of consecutive reads from the 5' or 3' end of the bubble that are placed // in the target. // // Also, count the number of reads in the bubble that are placed in the target. Likely the // same as n5 + n3. uint32 n5 = 0; uint32 n3 = 0; uint32 nt = 0; for (uint32 fi=0; fi<bubble->ufpath.size(); fi++) if (tgtReads.count(bubble->ufpath[fi].ident) > 0) n5++; else break; for (uint32 fi=bubble->ufpath.size(); fi-->0; ) if (tgtReads.count(bubble->ufpath[fi].ident) > 0) n3++; else break; for (uint32 fi=0; fi<bubble->ufpath.size(); fi++) if (tgtReads.count(bubble->ufpath[fi].ident) > 0) nt++; // Report now, before we nuke targets[tt] for being not a bubble! if ((nt == bubble->ufpath.size()) || ((n5 > 0) && (n3 > 0))) writeLog("tig %8u length %9u -> target %8u piece %2u position %9u-%9u length %8u - expected %3"F_SIZE_TP" reads, had %3"F_SIZE_TP" reads. n5=%3u n3=%3u nt=%3u\n", bubble->id(), bubble->getLength(), targets[tt]->target->id(), tt, targets[tt]->bgn, targets[tt]->end, targets[tt]->end - targets[tt]->bgn, tigReads.size(), tgtReads.size(), n5, n3, nt); // Decide if this is a bubble, orphan from construction, or repeat. if (nt == bubble->ufpath.size()) { nOrphan++; orphanTarget = tt; } else if ((n5 > 0) && (n3 > 0)) { nBubble++; bubbleTarget = tt; } } // If no placements, pbbbt. 
if (nOrphan + nBubble == 0) { //writeLog("tig %8u length %8u reads %6u had no bubble or orphan placements.\n", bubble->id(), bubble->getLength(), bubble->ufpath.size()); continue; } // If multiple orphan and/or bubble placements, it's a repeat. if (nOrphan + nBubble > 1) { writeLog("tig %8u length %8u reads %6u - repeat - %u orphan %u bubble placements.\n", bubble->id(), bubble->getLength(), bubble->ufpath.size(), nOrphan, nBubble); writeLog("\n"); bubble->_isRepeat = true; continue; } // If a bubble placement, mark it as a bubble so it can be skipped during repeat detection. if (nBubble > 0) { writeLog("tig %8u length %8u reads %6u - bubble\n", bubble->id(), bubble->getLength(), bubble->ufpath.size()); writeLog("\n"); bubble->_isBubble = true; continue; } // Otherwise, it's an orphan, move the reads to the proper place. writeLog("tig %8u length %8u reads %6u - orphan\n", bubble->id(), bubble->getLength(), bubble->ufpath.size()); for (uint32 op=0, tt=orphanTarget; op<targets[tt]->placed.size(); op++) { ufNode frg; frg.ident = targets[tt]->placed[op].frgID; frg.contained = 0; frg.parent = 0; frg.ahang = 0; frg.bhang = 0; frg.position.bgn = targets[tt]->placed[op].position.bgn; frg.position.end = targets[tt]->placed[op].position.end; writeLog("move read %u from tig %u to tig %u %u-%u\n", frg.ident, bubble->id(), targets[tt]->target->id(), frg.position.bgn, frg.position.end); targets[tt]->target->addFrag(frg, 0, false); } writeLog("\n"); unitigs[bubble->id()] = NULL; delete bubble; } // Over all bubbles writeLog("\n"); // Needed if no bubbles are popped. delete [] placed; // Sort reads in all the tigs. Overkill, but correct. for (uint32 ti=0; ti<tiLimit; ti++) { Unitig *tig = unitigs[ti]; if ((tig == NULL) || // Not a tig, ignore it. (tig->ufpath.size() == 1)) // Singleton, already sorted. continue; tig->sort(); } }
void markRepeatReads(UnitigVector &unitigs, double deviationRepeat, uint32 confusedAbsolute, double confusedPercent) { uint32 tiLimit = unitigs.size(); uint32 numThreads = omp_get_max_threads(); uint32 blockSize = (tiLimit < 100000 * numThreads) ? numThreads : tiLimit / 99999; writeLog("repeatDetect()-- working on "F_U32" unitigs, with "F_U32" threads.\n", tiLimit, numThreads); vector<olapDat> repeatOlaps; // Overlaps to reads promoted to tig coords intervalList<int32> tigMarksR; // Marked repeats based on reads, filtered by spanning reads intervalList<int32> tigMarksU; // Non-repeat invervals, just the inversion of tigMarksR for (uint32 ti=0; ti<tiLimit; ti++) { Unitig *tig = unitigs[ti]; if (tig == NULL) continue; if (tig->ufpath.size() == 1) continue; vector<olapDat> repeats; writeLog("Annotating repeats in reads for tig %u/%u.\n", ti, tiLimit); // Clear out all the existing marks. They're not for this tig. // Analyze overlaps for each read. For each overlap to a read not in this tig, or not // overlapping in this tig, and of acceptable error rate, add the overlap to repeatOlaps. repeatOlaps.clear(); uint32 fiLimit = tig->ufpath.size(); uint32 numThreads = omp_get_max_threads(); uint32 blockSize = (fiLimit < 100 * numThreads) ? numThreads : fiLimit / 99; #pragma omp parallel for if(fiLimit > 100) schedule(dynamic, blockSize) for (uint32 fi=0; fi<fiLimit; fi++) annotateRepeatsOnRead(unitigs, tig, &tig->ufpath[fi], deviationRepeat, repeatOlaps); writeLog("Annotated with %lu overlaps.\n", repeatOlaps.size()); // Merge marks for the same read into the largest possible. 
sort(repeatOlaps.begin(), repeatOlaps.end(), olapDatByEviRid); #ifdef SHOW_ANNOTATE for (uint32 ii=0; ii<repeatOlaps.size(); ii++) if (repeatOlaps[ii].tigbgn < 1000000) writeLog("repeatOlaps[%u] %u-%u from tig %u read %u RAW\n", ii, repeatOlaps[ii].tigbgn, repeatOlaps[ii].tigend, repeatOlaps[ii].eviTid, repeatOlaps[ii].eviRid); flushLog(); #endif for (uint32 dd=0, ss=1; ss<repeatOlaps.size(); ss++) { assert(repeatOlaps[dd].eviRid <= repeatOlaps[ss].eviRid); // If different evidence reads, close the destination olap, set up // for a new destination. if (repeatOlaps[dd].eviRid != repeatOlaps[ss].eviRid) { dd = ss; continue; } // If the destination ends before the source begins, there is no overlap between the // two regions. Close dd, set up for a new dd. if (repeatOlaps[dd].tigend <= repeatOlaps[ss].tigbgn) { dd = ss; continue; } // Otherwise, there must be an overlap. Extend the destination region, erase the source // region. repeatOlaps[dd].tigbgn = min(repeatOlaps[ss].tigbgn, repeatOlaps[dd].tigbgn); repeatOlaps[dd].tigend = max(repeatOlaps[ss].tigend, repeatOlaps[dd].tigend); repeatOlaps[ss].tigbgn = UINT32_MAX; repeatOlaps[ss].tigend = UINT32_MAX; repeatOlaps[ss].eviTid = UINT32_MAX; repeatOlaps[ss].eviRid = UINT32_MAX; } // Sort overlaps again. This pushes all those 'erased' regions to the end of the list, which // we can then just pop off. sort(repeatOlaps.begin(), repeatOlaps.end(), olapDatByEviRid); for (uint32 ii=repeatOlaps.size(); ii--; ) if (repeatOlaps[ii].eviTid == UINT32_MAX) repeatOlaps.pop_back(); // For logging, sort by coordinate sort(repeatOlaps.begin(), repeatOlaps.end()); #ifdef SHOW_ANNOTATE for (uint32 ii=0; ii<repeatOlaps.size(); ii++) writeLog("repeatOlaps[%d] %u-%u from tig %u read %u MERGED\n", ii, repeatOlaps[ii].tigbgn, repeatOlaps[ii].tigend, repeatOlaps[ii].eviTid, repeatOlaps[ii].eviRid); #endif // Make a new set of intervals based on all the detected repeats. 
tigMarksR.clear(); for (uint32 bb=0, ii=0; ii<repeatOlaps.size(); ii++) tigMarksR.add(repeatOlaps[ii].tigbgn, repeatOlaps[ii].tigend - repeatOlaps[ii].tigbgn); // Collapse these markings Collapse all the read markings to intervals on the unitig, merging those that overlap // significantly. writeLog("Merge marks.\n"); tigMarksR.merge(REPEAT_OVERLAP_MIN); // Scan reads, discard any mark that is contained in a read // // We don't need to filterShort() after every one is removed, but it's simpler to do it Right Now than // to track if it is needed. writeLog("Scan reads to discard spanned repeats.\n"); for (uint32 fi=0; fi<tig->ufpath.size(); fi++) { ufNode *frg = &tig->ufpath[fi]; bool frgfwd = (frg->position.bgn < frg->position.end); int32 frglo = (frgfwd) ? frg->position.bgn : frg->position.end; int32 frghi = (frgfwd) ? frg->position.end : frg->position.bgn; bool discarded = false; for (uint32 ri=0; ri<tigMarksR.numberOfIntervals(); ri++) { bool spanLo = false; bool spanHi = false; // The decision of 'spanned by a read' is broken into two pieces: does the read span the // lower (higher) boundary of the region. To be spanned, the boundary needs to be spanned // by at least MIN_ANCHOR_HANG additional bases (to anchor the read to non-repeat // sequence). // // This is a problem at the start/end of the tig, beacuse no read will extend past the // start/end of the tig. Instead, if the repeat is contained within the first (last) read // with no extension at the respective end, it is spanned. 
if ((frglo == 0) && // Read at start of tig, spans off the high end (tigMarksR.hi(ri) + MIN_ANCHOR_HANG <= frghi)) spanLo = spanHi = true; if ((frghi == tig->getLength()) && // Read at end of tig, spans off the low end (frglo + MIN_ANCHOR_HANG <= tigMarksR.lo(ri))) spanLo = spanHi = true; if (frglo + MIN_ANCHOR_HANG <= tigMarksR.lo(ri)) // Read spanned off the low end spanLo = true; if (tigMarksR.hi(ri) + MIN_ANCHOR_HANG <= frghi) // Read spanned off the high end spanHi = true; if (spanLo && spanHi) { writeLog("discard region %8d:%-8d - contained in read %6u %8d-%8d\n", tigMarksR.lo(ri), tigMarksR.hi(ri), frg->ident, frglo, frghi); tigMarksR.lo(ri) = 0; tigMarksR.hi(ri) = 0; discarded = true; } } if (discarded) tigMarksR.filterShort(1); } // Run through again, looking for the thickest overlap(s) to the remaining regions. // This isn't caring about the end effect noted above. #if 1 writeLog("thickest edges to the repeat regions:\n"); for (uint32 ri=0; ri<tigMarksR.numberOfIntervals(); ri++) { uint32 t5 = UINT32_MAX, l5 = 0, t5bgn, t5end; uint32 t3 = UINT32_MAX, l3 = 0, t3bgn, t3end; for (uint32 fi=0; fi<tig->ufpath.size(); fi++) { ufNode *frg = &tig->ufpath[fi]; bool frgfwd = (frg->position.bgn < frg->position.end); int32 frglo = (frgfwd) ? frg->position.bgn : frg->position.end; int32 frghi = (frgfwd) ? frg->position.end : frg->position.bgn; bool discarded = false; // Overlap off the 5' end of the region. if (frglo <= tigMarksR.lo(ri) && (tigMarksR.lo(ri) <= frghi)) { uint32 olap = frghi - tigMarksR.lo(ri); if (l5 < olap) { l5 = olap; t5 = fi; t5bgn = frglo; // Easier than recomputing it later on... t5end = frghi; } } // Overlap off the 3' end of the region. 
if (frglo <= tigMarksR.hi(ri) && (tigMarksR.hi(ri) <= frghi)) { uint32 olap = tigMarksR.hi(ri) - frglo; if (l3 < olap) { l3 = olap; t3 = fi; t3bgn = frglo; t3end = frghi; } } if (frglo <= tigMarksR.lo(ri) && (tigMarksR.hi(ri) <= frghi)) { writeLog("saved region %8d:%-8d - closest read %6u (%+6d) %8d:%-8d (%+6d) (contained)\n", tigMarksR.lo(ri), tigMarksR.hi(ri), frg->ident, tigMarksR.lo(ri) - frglo, frglo, frghi, frghi - tigMarksR.hi(ri)); } } if (t5 != UINT32_MAX) writeLog("saved region %8d:%-8d - closest 5' read %6u (%+6d) %8d:%-8d (%+6d)\n", tigMarksR.lo(ri), tigMarksR.hi(ri), tig->ufpath[t5].ident, tigMarksR.lo(ri) - t5bgn, t5bgn, t5end, t5end - tigMarksR.hi(ri)); if (t3 != UINT32_MAX) writeLog("saved region %8d:%-8d - closest 3' read %6u (%+6d) %8d:%-8d (%+6d)\n", tigMarksR.lo(ri), tigMarksR.hi(ri), tig->ufpath[t3].ident, tigMarksR.lo(ri) - t3bgn, t3bgn, t3end, t3end - tigMarksR.hi(ri)); } #endif // Scan reads. If a read intersects a repeat interval, and the best edge for that read // is entirely in the repeat region, decide if there is a near-best edge to something // not in this tig. // // A region with no such near-best edges is _probably_ correct. writeLog("search for confused edges:\n"); uint32 *isConfused = new uint32 [tigMarksR.numberOfIntervals()]; memset(isConfused, 0, sizeof(uint32) * tigMarksR.numberOfIntervals()); for (uint32 fi=0; fi<tig->ufpath.size(); fi++) { ufNode *rdA = &tig->ufpath[fi]; uint32 rdAid = rdA->ident; bool rdAfwd = (rdA->position.bgn < rdA->position.end); int32 rdAlo = (rdAfwd) ? rdA->position.bgn : rdA->position.end; int32 rdAhi = (rdAfwd) ? 
rdA->position.end : rdA->position.bgn; double sc = (rdAhi - rdAlo) / (double)FI->fragmentLength(rdAid); if ((OG->isContained(rdAid) == true) || (OG->isSuspicious(rdAid) == true)) continue; for (uint32 ri=0; ri<tigMarksR.numberOfIntervals(); ri++) { uint32 rMin = tigMarksR.lo(ri); uint32 rMax = tigMarksR.hi(ri); if ((rdAhi < rMin) || // Read ends before the region (rMax < rdAlo)) // Read starts after the region continue; // -> don't care about this read! // Compute the position (in the tig) of the best overlaps. int32 tig5bgn=0, tig5end=0; int32 tig3bgn=0, tig3end=0; // Instead of using the best edge - which might not be the edge used in the unitig - // we need to scan the layout to return the previous/next dovetail // Put this in a function - what to return if no best overlap? BestEdgeOverlap *b5 = OG->getBestEdgeOverlap(rdAid, false); BestEdgeOverlap *b3 = OG->getBestEdgeOverlap(rdAid, true); // If the best edge is to a read not in this tig, there is nothing to compare against. // Is this confused by default? Possibly. The unitig was constructed somehow, and that // must then be the edge coming into us. We'll pick it up later. bool b5use = true; bool b3use = true; if (b5->fragId() == 0) b5use = false; if (b3->fragId() == 0) b3use = false; if ((b5use) && (Unitig::fragIn(b5->fragId()) != tig->id())) b5use = false; if ((b3use) && (Unitig::fragIn(b3->fragId()) != tig->id())) b3use = false; // The best edge read is in this tig. If they don't overlap, again, nothing to compare // against. if (b5use) { ufNode *rdB = &tig->ufpath[Unitig::pathPosition(b5->fragId())]; uint32 rdBid = rdB->ident; bool rdBfwd = (rdB->position.bgn < rdB->position.end); int32 rdBlo = (rdBfwd) ? rdB->position.bgn : rdB->position.end; int32 rdBhi = (rdBfwd) ? 
rdB->position.end : rdB->position.bgn; if ((rdAhi < rdBlo) || (rdBhi < rdAlo)) b5use = false; } if (b3use) { ufNode *rdB = &tig->ufpath[Unitig::pathPosition(b3->fragId())]; uint32 rdBid = rdB->ident; bool rdBfwd = (rdB->position.bgn < rdB->position.end); int32 rdBlo = (rdBfwd) ? rdB->position.bgn : rdB->position.end; int32 rdBhi = (rdBfwd) ? rdB->position.end : rdB->position.bgn; if ((rdAhi < rdBlo) || (rdBhi < rdAlo)) b3use = false; } // If we can use this edge, compute the placement of the overlap on the unitig. // Call #1; if (b5use) { int32 bgn=0, end=0; olapToReadCoords(rdA, b5->ahang(), b5->bhang(), bgn, end); tig5bgn = (rdAfwd) ? (rdAlo + sc * bgn) : (rdAhi - sc * end); tig5end = (rdAfwd) ? (rdAlo + sc * end) : (rdAhi - sc * bgn); assert(tig5bgn < tig5end); if (tig5bgn < 0) tig5bgn = 0; if (tig5end > tig->getLength()) tig5end = tig->getLength(); } // Call #2 if (b3use) { int32 bgn=0, end=0; olapToReadCoords(rdA, b3->ahang(), b3->bhang(), bgn, end); tig3bgn = (rdAfwd) ? (rdAlo + sc * bgn) : (rdAhi - sc * end); tig3end = (rdAfwd) ? (rdAlo + sc * end) : (rdAhi - sc * bgn); assert(tig3bgn < tig3end); if (tig3bgn < 0) tig3bgn = 0; if (tig3end > tig->getLength()) tig3end = tig->getLength(); } // If either of the 5' or 3' overlaps (or both!) are in the repeat region, we need to check for // close overlaps on that end. uint32 len5 = 0; uint32 len3 = 0; if ((rMin < tig5bgn) && (tig5end < rMax) && (b5use)) len5 = FI->overlapLength(rdAid, b5->fragId(), b5->ahang(), b5->bhang()); else b5use = false; if ((rMin < tig3bgn) && (tig3end < rMax) && (b3use)) len3 = FI->overlapLength(rdAid, b3->fragId(), b3->ahang(), b3->bhang()); else b3use = false; double score5 = len5 * (1 - b5->erate()); double score3 = len3 * (1 - b3->erate()); // Neither of the best edges are in the repeat region; move to the next region and/or read. if (len5 + len3 == 0) continue; // At least one of the best edge overlaps is in the repeat region. 
Scan for other edges // that are of comparable length and quality. uint32 ovlLen = 0; BAToverlap *ovl = OC->getOverlaps(rdAid, AS_MAX_ERATE, ovlLen); for (uint32 oo=0; oo<ovlLen; oo++) { uint32 rdBid = ovl[oo].b_iid; uint32 tgBid = Unitig::fragIn(rdBid); // If the read is in a singleton, skip. These are unassembled crud. if ((tgBid == 0) || (unitigs[tgBid] == NULL) || (unitigs[tgBid]->ufpath.size() == 1)) continue; // If the read is in an annotated bubble, skip. if (unitigs[tgBid]->_isBubble) continue; // Skip if this overlap is the best we're trying to match. if ((rdBid == b5->fragId()) || (rdBid == b3->fragId())) continue; // Skip if this overlap is crappy quality if (OG->isOverlapBadQuality(ovl[oo])) continue; // Skip if the read is contained or suspicious. if ((OG->isContained(rdBid) == true) || (OG->isSuspicious(rdBid) == true)) continue; // Skip if the overlap isn't dovetail. bool ovl5 = ovl[oo].AEndIs5prime(); bool ovl3 = ovl[oo].AEndIs3prime(); if ((ovl5 == false) && (ovl3 == false)) continue; // Skip if we're not using this overlap if ((ovl5 == true) && (b5use == false)) continue; if ((ovl3 == true) && (b3use == false)) continue; uint32 rdBpos = unitigs[tgBid]->pathPosition(rdBid); ufNode *rdB = &unitigs[tgBid]->ufpath[rdBpos]; bool rdBfwd = (rdB->position.bgn < rdB->position.end); int32 rdBlo = (rdBfwd) ? rdB->position.bgn : rdB->position.end; int32 rdBhi = (rdBfwd) ? rdB->position.end : rdB->position.bgn; // If the overlap is to a read in a different tig, or // the overlap is to a read in the same tig, but we don't overlap in the tig, check lengths. // Otherwise, the overlap is present in the tig, and can't be confused. if ((tgBid == tig->id()) && (rdBlo <= rdAhi) && (rdAlo <= rdBhi)) continue; uint32 len = FI->overlapLength(rdAid, ovl[oo].b_iid, ovl[oo].a_hang, ovl[oo].b_hang); double score = len * (1 - ovl[oo].erate); // Compute percent difference. 
double ad5 = fabs(score - score5); double ad3 = fabs(score - score3); double pd5 = 200 * ad5 / (score + score5); double pd3 = 200 * ad3 / (score + score3); // Skip if this overlap is vastly worse than the best. if ((ovl5 == true) && ((ad5 >= confusedAbsolute) || (pd3 > confusedPercent))) { writeLog("tig %7u read %8u pos %7u-%-7u NOT confused by 5' edge to read %8u - best edge read %8u len %6u erate %.4f score %8.2f - alt edge len %6u erate %.4f score %8.2f - absdiff %8.2f percdiff %8.4f\n", tig->id(), rdAid, rdAlo, rdAhi, rdBid, b5->fragId(), len5, b5->erate(), score5, len, ovl[oo].erate, score, ad5, pd5); continue; } if ((ovl3 == true) && ((ad3 >= confusedAbsolute) || (pd3 > confusedPercent))) { writeLog("tig %7u read %8u pos %7u-%-7u NOT confused by 3' edge to read %8u - best edge read %8u len %6u erate %.4f score %8.2f - alt edge len %6u erate %.4f score %8.2f - absdiff %8.2f percdiff %8.4f\n", tig->id(), rdAid, rdAlo, rdAhi, rdBid, b3->fragId(), len3, b3->erate(), score3, len, ovl[oo].erate, score, ad3, pd3); continue; } // Potential confusion! if (ovl5 == true) writeLog("tig %7u read %8u pos %7u-%-7u IS confused by 5' edge to read %8u - best edge read %8u len %6u erate %.4f score %8.2f - alt edge len %6u erate %.4f score %8.2f - absdiff %8.2f percdiff %8.4f\n", tig->id(), rdAid, rdAlo, rdAhi, rdBid, b5->fragId(), len5, b5->erate(), score5, len, ovl[oo].erate, score, ad5, pd5); if (ovl3 == true) writeLog("tig %7u read %8u pos %7u-%-7u IS confused by 3' edge to read %8u - best edge read %8u len %6u erate %.4f score %8.2f - alt edge len %6u erate %.4f score %8.2f - absdiff %8.2f percdiff %8.4f\n", tig->id(), rdAid, rdAlo, rdAhi, rdBid, b3->fragId(), len3, b3->erate(), score3, len, ovl[oo].erate, score, ad3, pd3); isConfused[ri]++; } } // Over all marks (ri) } // Over all reads (fi) // Scan all the regions, and delete any that have no confusion. 
{ bool discarded = false; for (uint32 ri=0; ri<tigMarksR.numberOfIntervals(); ri++) { if (isConfused[ri] == 0) { writeLog("discard region %8d:%-8d - no confusion in best edges\n", tigMarksR.lo(ri), tigMarksR.hi(ri)); tigMarksR.lo(ri) = 0; tigMarksR.hi(ri) = 0; discarded = true; } else { writeLog("saved region %8d:%-8d - %u best edges are potentially confused\n", tigMarksR.lo(ri), tigMarksR.hi(ri), isConfused[ri]); } } if (discarded) tigMarksR.filterShort(1); } delete [] isConfused; // Scan reads, join any marks that have their junctions spanned by a sufficiently large amount. // // If the read spans this junction be the usual amount, merge the intervals. // // The intervals can be overlapping (by up to REPEAT_OVERLAP_MIN (x2?) bases. For this junction // to be spanned, the read must span from min-ROM to max+ROM, not just hi(ri-1) to lo(ri). // // We DO need to filterShort() after every merge, otherwise, we'd have an empty bogus interval // in the middle of our list, which could be preventing some other merge. OK, we could // // Anything that gets merged is now no longer a true repeat. It's unique, just bordered by repeats. // We can't track this through the indices (because we delete things). We track it with a set of // begin coordinates. set<int32> nonRepeatIntervals; writeLog("Scan reads to merge repeat regions.\n"); for (uint32 fi=0; fi<tig->ufpath.size(); fi++) { ufNode *frg = &tig->ufpath[fi]; bool frgfwd = (frg->position.bgn < frg->position.end); int32 frglo = (frgfwd) ? frg->position.bgn : frg->position.end; int32 frghi = (frgfwd) ? 
frg->position.end : frg->position.bgn; bool merged = false; for (uint32 ri=1; ri<tigMarksR.numberOfIntervals(); ri++) { uint32 rMin = min(tigMarksR.hi(ri-1), tigMarksR.lo(ri)); uint32 rMax = max(tigMarksR.hi(ri-1), tigMarksR.lo(ri)); if ((frglo + MIN_ANCHOR_HANG <= rMin) && (rMax + MIN_ANCHOR_HANG <= frghi)) { writeLog("merge regions %8d:%-8d and %8d:%-8d - junction contained in read %6u %5d-%5d\n", tigMarksR.lo(ri-1), tigMarksR.hi(ri-1), tigMarksR.lo(ri), tigMarksR.hi(ri), frg->ident, frglo, frghi); tigMarksR.lo(ri) = tigMarksR.lo(ri-1); tigMarksR.lo(ri-1) = 0; // CRITICAL to delete this interval (and not ri) because the next tigMarksR.hi(ri-1) = 0; // iteration will be using ri-1 (== ri here) and ri (== ri+1). merged = true; nonRepeatIntervals.insert(tigMarksR.lo(ri)); } } if (merged) tigMarksR.filterShort(1); } // Extend the regions by MIN_ANCHOR_HANG. This makes checking for reads that span and are // anchored in the next region easier. It also solved a quirk when the first/last repeat // region doesn't extend to the end of the sequence: // 0-183 unique (created from inversion below, but useless and incorrect) // 183-9942 repeat for (uint32 ii=0; ii<tigMarksR.numberOfIntervals(); ii++) { tigMarksR.lo(ii) = max<int32>(tigMarksR.lo(ii) - MIN_ANCHOR_HANG, 0); tigMarksR.hi(ii) = min<int32>(tigMarksR.hi(ii) + MIN_ANCHOR_HANG, tig->getLength()); } // Find the non-repeat intervals. tigMarksU = tigMarksR; tigMarksU.invert(0, tig->getLength()); // Create the list of intervals we'll use to make new unitigs. // // The repeat intervals are extended by MIN_ANCHOR_HANG, and then any read fully contained in one of // these is moved here. // // The non-repeat intervals are shortened by the same amount, and any read that intersects one // is moved there. // // Does order matter? Not sure. The repeat intervals are first, then the formerly repeat // merged intervals, then the unique intervals. Splitting might depend on the repeats being // first. 
writeLog("Make breakpoints.\n"); vector<breakPointCoords> BP; for (uint32 ii=0; ii<tigMarksR.numberOfIntervals(); ii++) if (nonRepeatIntervals.count(tigMarksR.lo(ii)) == 0) BP.push_back(breakPointCoords(ti, tigMarksR.lo(ii), tigMarksR.hi(ii), true)); for (uint32 ii=0; ii<tigMarksR.numberOfIntervals(); ii++) if (nonRepeatIntervals.count(tigMarksR.lo(ii)) != 0) BP.push_back(breakPointCoords(ti, tigMarksR.lo(ii), tigMarksR.hi(ii), true)); for (uint32 ii=0; ii<tigMarksU.numberOfIntervals(); ii++) { BP.push_back(breakPointCoords(ti, tigMarksU.lo(ii), tigMarksU.hi(ii), false)); } // If only one region, the whole unitig was declared repeat. Nothing to do. if (BP.size() == 1) continue; sort(BP.begin(), BP.end()); // Report. writeLog("break tig %u into up to %u pieces:\n", ti, BP.size()); for (uint32 ii=0; ii<BP.size(); ii++) writeLog(" %8d %8d %s (length %d)\n", BP[ii]._bgn, BP[ii]._end, BP[ii]._isRepeat ? "repeat" : "unique", BP[ii]._end - BP[ii]._bgn); // Scan the reads, counting the number of reads that would be placed in each new tig. This is done // because there are a few 'splits' that don't move any reads around. Unitig **newTigs = new Unitig * [BP.size()]; int32 *lowCoord = new int32 [BP.size()]; uint32 *nRepeat = new uint32 [BP.size()]; uint32 *nUnique = new uint32 [BP.size()]; // First call, count the number of tigs we would create if we let it create them. uint32 nTigs = splitUnitigs(unitigs, tig, BP, newTigs, lowCoord, nRepeat, nUnique, false); // Second call, actually create the tigs, if anything would change. if (nTigs > 1) splitUnitigs(unitigs, tig, BP, newTigs, lowCoord, nRepeat, nUnique, true); // Report the tigs created. for (uint32 ii=0; ii<BP.size(); ii++) { int32 rgnbgn = BP[ii]._bgn; int32 rgnend = BP[ii]._end; bool repeat = BP[ii]._isRepeat; if (nRepeat[ii] + nUnique[ii] == 0) writeLog("For tig %5u %s region %8d %8d - %6u/%6u repeat/unique reads - no new unitig created.\n", ti, (repeat == true) ? 
"repeat" : "unique", rgnbgn, rgnend, nRepeat[ii], nUnique[ii]); else if (nTigs > 1) writeLog("For tig %5u %s region %8d %8d - %6u/%6u reads repeat/unique - unitig %5u created.\n", ti, (repeat == true) ? "repeat" : "unique", rgnbgn, rgnend, nRepeat[ii], nUnique[ii], newTigs[ii]->id()); else writeLog("For tig %5u %s region %8d %8d - %6u/%6u repeat/unique reads - unitig %5u remains unchanged.\n", ti, (repeat == true) ? "repeat" : "unique", rgnbgn, rgnend, nRepeat[ii], nUnique[ii], tig->id()); } // Cleanup. delete [] newTigs; delete [] lowCoord; delete [] nRepeat; delete [] nUnique; // Remove the old unitig....if we made new ones. if (nTigs > 1) { delete tig; unitigs[ti] = NULL; } } }
//  Try to grow each unitig by pulling in unplaced mates of its fragments.
//
//  For every non-trivial unitig, collect the unitig's fragments plus any mate fragments that are
//  not yet in any unitig.  The unitig is then torn down and rebuilt from scratch using a
//  BestOverlapGraph/ChunkGraph restricted to just that fragment set, so the mates can participate
//  in the layout.
//
//  Side effects: temporarily replaces the global OG and CG graphs (restored before the next
//  unitig is processed), deletes and recreates unitigs in 'unitigs', and sets LOG_POPULATE_UNITIG.
void extendByMates(UnitigVector &unitigs, double erateGraph) {

  //logFileFlags |= LOG_CHUNK_GRAPH;
  logFileFlags |= LOG_POPULATE_UNITIG;

  writeLog("==> EXTENDING UNITIGS WITH MATE PAIRS.\n");

  //  Snapshot the size; unitigs created inside the loop (by populateUnitig) are not re-processed.
  uint32 tiMax = unitigs.size();

  for (uint32 ti=0; ti<tiMax; ti++) {
    Unitig *target = unitigs[ti];

    if (target == NULL)
      continue;

    if (target->ufpath.size() < 2)   //  Skip singletons; nothing to extend.
      continue;

    //  Build a list of all the fragments in this unitig, and any mates that are not in a unitig.

    uint32 extraMates = 0;

    for (uint32 fi=0; fi<target->ufpath.size(); fi++) {
      uint32 fid = target->ufpath[fi].ident;
      uint32 mid = FI->mateIID(fid);   //  0 means the fragment has no mate.

      if ((mid != 0) && (Unitig::fragIn(mid) == 0))
        extraMates++;
    }

    writeLog("\n");
    writeLog("unitig "F_U32" of size "F_SIZE_T" with "F_U32" extra fragments via mates\n",
             ti, target->ufpath.size(), extraMates);

    if (extraMates == 0)   //  Nothing new to add; leave the unitig alone.
      continue;

    //  Build a set of the fragments in this unitig plus their mates, and a set of just the mates.

    set<uint32> frags;
    set<uint32> mates;

    for (uint32 fi=0; fi<target->ufpath.size(); fi++) {
      uint32 fid = target->ufpath[fi].ident;
      uint32 mid = FI->mateIID(fid);

      frags.insert(fid);

      if ((mid != 0) && (Unitig::fragIn(mid) == 0)) {
        writeLog(" mate frag "F_U32"\n", mid);
        frags.insert(mid);
        mates.insert(mid);
      }
    }

    //  Now, remove all the unitig fragments from the unitig so we can reconstruct it with the
    //  additional mated fragments.  Note that this loop cannot be combined with the last, since
    //  the test for 'additional mate' is 'not in the same unitig' -- and if we remove the
    //  fragments too early, we can't distinguish 'additional' from 'included'.
    //
    //  NOTE(review): this iterates forward over ufpath while calling removeFrag() on each entry;
    //  presumably removeFrag() only clears the fragment-to-unitig mapping and does not compact
    //  ufpath, otherwise entries would be skipped -- TODO confirm against Unitig::removeFrag().

    for (uint32 fi=0; fi<target->ufpath.size(); fi++)
      target->removeFrag(target->ufpath[fi].ident);

    unitigs[ti] = NULL;
    delete target;

    //  Build a new BOG for just those fragments - in particular, only overlaps within the set are
    //  used for the BOG.  The globals OG and CG are swapped out and restored at the bottom of
    //  the loop; any callee that reads them during this window sees the restricted graphs.

    BestOverlapGraph *OGsave = OG;
    ChunkGraph       *CGsave = CG;

    OG = new BestOverlapGraph(erateGraph, &frags);
    CG = new ChunkGraph(&frags);

    uint32 numTigs = unitigs.size();   //  Everything at index >= numTigs afterwards is newly built.

    //  Build new unitigs.  There should only be one new unitig constructed, but that isn't
    //  guaranteed.  No new unitigs are built if they are seeded from the mate fragments.  This
    //  isn't ideal -- we'd like to allow the first unitig (supposedly the longest) to start from
    //  a mate fragment.  However, consider the not-so-rare case where the original unitig is two
    //  backbone fragments and lots of contains.  Those contains contribute mate pairs that all
    //  assemble together, giving a longer path than the original unitig.  We don't want to
    //  assemble the mated fragments yet (we'll wait until we get the rest of the fragments that
    //  could assemble together).

    for (uint32 fi = CG->nextFragByChunkLength(); fi > 0; fi=CG->nextFragByChunkLength()) {
      if ((Unitig::fragIn(fi) != 0) ||
          (mates.count(fi) > 0))
        //  Fragment already in a unitig, or is an additional mate that we don't want
        //  to seed from.
        continue;

      populateUnitig(unitigs, fi);
    }

    //  Report what was constructed

    if (unitigs.size() - numTigs > 1)
      writeLog("WARNING: mate extension split a unitig.\n");

    for (uint32 newTigs=numTigs; newTigs<unitigs.size(); newTigs++) {
      Unitig *tig = unitigs[newTigs];

      if (tig == NULL)
        continue;

      //  NOTE(review): this is a different overload than placeContainsUsingBestOverlaps(unitigs)
      //  seen earlier in the file; presumably it restricts containment placement to 'frags' --
      //  TODO confirm its declaration.
      placeContainsUsingBestOverlaps(tig, &frags);

      writeLog(" new tig "F_U32" with "F_SIZE_T" fragments\n", tig->id(), tig->ufpath.size());
    }

    //  Discard the restricted graphs and restore the full ones.

    delete OG;
    delete CG;

    OG = OGsave;
    CG = CGsave;
  }
}
void breakUnitigs(UnitigVector &unitigs, char *output_prefix, bool enableIntersectionBreaking) { writeLog("==> BREAKING UNITIGS.\n"); intersectionList *ilist = new intersectionList(unitigs); // Stop when we've seen all current unitigs. Replace tiMax // in the for loop below with unitigs.size() to recursively // split unitigs. uint32 tiMax = unitigs.size(); for (uint32 ti=0; ti<tiMax; ti++) { Unitig *tig = unitigs[ti]; if (tig == NULL) continue; vector<breakPoint> breaks; for (uint32 fi=0; fi<tig->ufpath.size(); fi++) { ufNode *frg = &tig->ufpath[fi]; intersectionPoint *isect = ilist->getIntersection(frg->ident, 0); if (isect == NULL) continue; for (; isect->isectFrg == frg->ident; isect++) { assert(tig->id() == Unitig::fragIn(isect->isectFrg)); // Grab the invading unitig Unitig *inv = unitigs[Unitig::fragIn(isect->invadFrg)]; assert(inv->id() == Unitig::fragIn(isect->invadFrg)); // Grab the best edges off the invading fragment. BestEdgeOverlap *best5 = OG->getBestEdgeOverlap(isect->invadFrg, false); BestEdgeOverlap *best3 = OG->getBestEdgeOverlap(isect->invadFrg, true); // Check if the incoming tig is a spur, and we should just ignore it immediately if ((inv->ufpath.size() == 1) && ((best5->fragId() == 0) || (best3->fragId() == 0))) { if (logFileFlagSet(LOG_INTERSECTION_BREAKING)) writeLog("unitig %d frag %d end %c' into unitig %d frag %d end %c' -- IS A SPUR, skip it\n", inv->id(), isect->invadFrg, isect->invad3p ? '3' : '5', tig->id(), isect->isectFrg, isect->isect3p ? '3' : '5'); continue; } // Keep only significant intersections if ((inv->getLength() > MIN_BREAK_LENGTH) && (inv->ufpath.size() > MIN_BREAK_FRAGS)) { if (logFileFlagSet(LOG_INTERSECTION_BREAKING)) writeLog("unitig %d frag %d end %c' into unitig %d frag %d end %c'\n", inv->id(), isect->invadFrg, isect->invad3p ? '3' : '5', tig->id(), isect->isectFrg, isect->isect3p ? 
'3' : '5'); breaks.push_back(breakPoint(isect->isectFrg, isect->isect3p, true, false)); } } // Over all incoming fragments // If this is the last fragment, terminate the break point list with a 'fakeEnd' (in AS_BAT_Breaking.cc) break point // at the end of the unitig. if ((fi+1 == tig->ufpath.size()) && (breaks.size() > 0)) { breaks.push_back(breakPoint(frg->ident, (frg->position.bgn < frg->position.end), true, false)); } } // Over all fragments in the unitig if (breaks.size() == 0) continue; // Report where breaks occur. 'breaks' is a list, not a vector. #if 0 // We've lost the fields in breaks[i] -- but the reports above aren't updated yet. if (logFileFlagSet(LOG_INTERSECTION_BREAKING) || logFileFlagSet(LOG_MATE_SPLIT_COVERAGE_PLOT)) for (uint32 i=0; i<breaks.size(); i++) writeLog("BREAK unitig %d at position %d,%d from inSize %d inFrags %d.\n", tig->id(), breaks[i].fragPos.bgn, breaks[i].fragPos.end, breaks[i].inSize, breaks[i].inFrags); #endif // Actually do the breaking. if (enableIntersectionBreaking) breakUnitigAt(unitigs, tig, breaks, true); breaks.clear(); } // Over all unitigs }
void placeZombies(UnitigVector &unitigs, double erate) { writeLog("==> SEARCHING FOR ZOMBIES\n"); uint32 *inUnitig = new uint32 [FI->numFragments()+1]; int numZombies = 0; // Mark fragments as dead, then unmark them if they are in a real living unitig. for (uint32 i=0; i<FI->numFragments()+1; i++) inUnitig[i] = noUnitig; for (uint32 ti=0; ti<unitigs.size(); ti++) { Unitig *utg = unitigs[ti]; if (utg == NULL) continue; for (uint32 fi=0; fi<utg->ufpath.size(); fi++) inUnitig[utg->ufpath[fi].ident] = utg->id(); } // For anything not in a living unitig, reload the overlaps and find a new container. // (NOT IMPLEMENTED - for now we just move these to new singleton unitigs). for (uint32 i=0; i<FI->numFragments()+1; i++) { if (FI->fragmentLength(i) == 0) // Deleted fragment continue; if (inUnitig[i] != noUnitig) // Valid fragment in a unitig continue; Unitig *utg = unitigs.newUnitig(false); ufNode frg; frg.ident = i; frg.contained = 0; frg.parent = 0; frg.ahang = 0; frg.bhang = 0; frg.position.bgn = 0; frg.position.end = FI->fragmentLength(i); frg.containment_depth = 0; utg->addFrag(frg, 0, false); writeLog("placeZombies()-- unitig %d created from zombie fragment %d\n", utg->id(), i); numZombies++; } writeLog("RESURRECTED %d ZOMBIE FRAGMENT%s.\n", numZombies, (numZombies != 1) ? "s" : ""); delete [] inUnitig; }
void popMateBubbles(UnitigVector &unitigs) { uint32 nBubblePopped = 0; uint32 nBubbleTooBig = 0; uint32 nBubbleConflict = 0; writeLog("==> SEARCHING FOR MATE BUBBLES\n"); // For each unitig, if all (or most) of the external mates are to a single other unitig (not // counting singletons), then this is a potential bubble popping unitig. // // At present, this is exploratory only. for (uint32 ti=0; ti<unitigs.size(); ti++) { Unitig *tig = unitigs[ti]; if ((tig == NULL) || (tig->ufpath.size() == 0)) // No tig here. continue; if ((tig->getLength() > 1000) || (tig->ufpath.size() >= 3000)) // Tig too big. continue; //if ((tig->getLength() < 150) || // (tig->ufpath.size() < 5)) // // Tig too small. // continue; uint32 *lkg = new uint32 [tig->ufpath.size()]; uint32 lkgLen = 0; uint32 lkgExt = 0; for (uint32 fi=0; fi<tig->ufpath.size(); fi++) { ufNode *frg = &tig->ufpath[fi]; int32 frgID = frg->ident; int32 matID = FI->mateIID(frgID); uint32 mtigID = 0; Unitig *mtig = 0L; if (matID == 0) // No mate. continue; mtigID = tig->fragIn(matID); mtig = unitigs[mtigID]; if (mtigID == tig->id()) // Mate is not external. continue; lkgExt++; if (mtig->ufpath.size() < 2) // Mate is in singleton. continue; lkg[lkgLen++] = mtigID; } if (lkgLen == 0) // No external mates. continue; sort(lkg, lkg+lkgLen); uint32 last = lkg[0]; uint32 lcnt = 1; for (uint32 i=1; i<lkgLen; i++) { if (last != lkg[i]) { if ((lcnt > 3)) writeLog("popMateBubble()-- tig %d len %d might pop bubble in tig %u (%u mates in there out of %d external mates)\n", tig->id(), tig->getLength(), last, lcnt, lkgExt); last = lkg[i]; lcnt = 0; } lcnt++; } if ((lcnt > 3)) writeLog("popMateBubble()-- tig %d len %d might pop bubble in tig %u (%u mates in there out of %d external mates)\n", tig->id(), tig->getLength(), last, lcnt, lkgExt); delete [] lkg; } }
// For every unitig, report the best overlaps contained in the // unitig, and all overlaps contained in the unitig. void reportOverlapsUsed(UnitigVector &unitigs, const char *prefix, const char *name) { if (logFileFlagSet(LOG_OVERLAPS_USED) == 0) return; char ovlPath[FILENAME_MAX]; sprintf(ovlPath, "%s.%03u.%s.overlaps", prefix, logFileOrder, name); FILE *F = fopen(ovlPath, "w"); if (F == NULL) return; for (uint32 ti=0; ti<unitigs.size(); ti++) { Unitig *utg = unitigs[ti]; if (utg == NULL) continue; for (uint32 fi=0; fi<utg->ufpath.size(); fi++) { ufNode *frg = &utg->ufpath[fi]; // Where is our best overlap? Contained or dovetail? BestEdgeOverlap *bestedge5 = OG->getBestEdgeOverlap(frg->ident, false); BestEdgeOverlap *bestedge3 = OG->getBestEdgeOverlap(frg->ident, true); uint32 bestident5 = 0; uint32 bestident3 = 0; if (bestedge5) bestident5 = bestedge5->fragId(); if (bestedge3) bestident3 = bestedge3->fragId(); // Now search ahead, reporting any overlap to any fragment. // for (uint32 oi=fi+1; oi<utg->ufpath.size(); oi++) { ufNode *ooo = &utg->ufpath[oi]; int frgbgn = MIN(frg->position.bgn, frg->position.end); int frgend = MAX(frg->position.bgn, frg->position.end); int ooobgn = MIN(ooo->position.bgn, ooo->position.end); int oooend = MAX(ooo->position.bgn, ooo->position.end); if ((frgbgn <= ooobgn) && (ooobgn + 40 < frgend)) { BestContainment *bestcont = OG->getBestContainer(ooo->ident); uint32 bestident = 0; if (bestcont->isContained) bestident = bestcont->container; bool isBest = ((frg->ident == bestident) || (ooo->ident == bestident5) || (ooo->ident == bestident3)); fprintf(F, "%d\t%d%s\n", frg->ident, ooo->ident, (isBest) ? ((bestident) ? "\tbc" : "\tbe") : ""); } if (frgend < ooobgn) break; } } } fclose(F); }