//  Place every read that is not currently in a tig (Unitig::fragIn(fid) == 0) into an existing
//  tig, using all of the read's overlaps.
//
//  The work is done in three steps:
//
//    1) In parallel, compute all placements of each unplaced read with placeFragUsingOverlaps()
//       and remember the acceptable placement with the lowest error rate.  A placement is
//       rejected if it covers less than 99% of the read, if the target tig is missing or holds a
//       single read, or if overlapConsistentWithTig() returns less than 0.5 for it.
//
//    2) Serially, add each read that has a remembered placement to its tig.  Reads with no
//       acceptable placement are counted as failures and are NOT put into a new tig.
//
//    3) Sort every tig, since any of them may have received new reads.
//
//  Parameters:
//    unitigs - the tigs to place reads into; modified (reads added, tigs sorted).
//    prefix  - not used by this function.
//
void
placeUnplacedUsingAllOverlaps(UnitigVector &unitigs,
                              const char   *prefix) {
  uint32  fiLimit    = FI->numFragments();
  uint32  numThreads = omp_get_max_threads();
  uint32  blockSize  = (fiLimit < 100 * numThreads) ? numThreads : fiLimit / 99;

  //  Results of the parallel search, indexed by read ID (the loops below start at ID 1, hence
  //  the +1).  placedTig[fid] == 0 means no acceptable placement was found for read fid.

  uint32       *placedTig = new uint32      [fiLimit + 1];
  SeqInterval  *placedPos = new SeqInterval [fiLimit + 1];

  memset(placedTig, 0, sizeof(uint32)      * (fiLimit + 1));
  memset(placedPos, 0, sizeof(SeqInterval) * (fiLimit + 1));

  //  Just some logging.  Count the number of reads we try to place.

  uint32   nToPlaceContained = 0;
  uint32   nToPlace          = 0;
  uint32   nPlacedContained  = 0;
  uint32   nPlaced           = 0;
  uint32   nFailedContained  = 0;
  uint32   nFailed           = 0;

  for (uint32 fid=1; fid<fiLimit+1; fid++) {
    if (Unitig::fragIn(fid) != 0)
      continue;

    if (OG->isContained(fid))
      nToPlaceContained++;
    else
      nToPlace++;
  }

  //  numThreads is unsigned, so it is printed with %u.
  writeLog("placeContains()-- placing %u contained and %u unplaced reads, with %u threads.\n",
           nToPlaceContained, nToPlace, numThreads);

  //  Do the placing!  Each iteration only writes placedTig[fid] and placedPos[fid] for its own fid.

#pragma omp parallel for schedule(dynamic, blockSize)
  for (uint32 fid=1; fid<fiLimit+1; fid++) {
    bool  enableLog = true;

    if (Unitig::fragIn(fid) > 0)
      continue;

    //  Place the read.

    vector<overlapPlacement>   placements;

    placeFragUsingOverlaps(unitigs, AS_MAX_ERATE, NULL, fid, placements);

    //  Search the placements for the lowest error rate placement that is acceptable.

    uint32   b         = UINT32_MAX;   //  Index of the best placement, UINT32_MAX if none yet.
    double   bestErate = 0.0;          //  Error rate of placements[b]; only valid once b is set.

    for (uint32 i=0; i<placements.size(); i++) {
      Unitig *tig = unitigs[placements[i].tigID];

      if (placements[i].fCoverage < 0.99)   //  Ignore partially placed reads.
        continue;

      if (tig == NULL)                      //  Ignore placements in tigs that don't exist.
        continue;

      if (tig->ufpath.size() == 1)          //  Ignore placements in singletons.
        continue;

      //  size() is a size_t; convert it explicitly so it matches the %u used when logging.
      uint32  tigReads = (uint32)tig->ufpath.size();

      bool    isFwd = (placements[i].position.bgn < placements[i].position.end);
      uint32  bgn   = (isFwd) ? placements[i].position.bgn : placements[i].position.end;
      uint32  end   = (isFwd) ? placements[i].position.end : placements[i].position.bgn;

      double  erate = placements[i].errors / placements[i].aligned;

      if (tig->overlapConsistentWithTig(5.0, bgn, end, erate) < 0.5) {
        if ((enableLog == true) && (logFileFlagSet(LOG_PLACE_UNPLACED)))
          writeLog("frag %8u tested tig %6u (%6u reads) at %8u-%8u (cov %7.5f erate %6.4f) - HIGH ERROR\n",
                   fid, placements[i].tigID, tigReads, placements[i].position.bgn, placements[i].position.end, placements[i].fCoverage, erate);
        continue;
      }

      if ((enableLog == true) && (logFileFlagSet(LOG_PLACE_UNPLACED)))
        writeLog("frag %8u tested tig %6u (%6u reads) at %8u-%8u (cov %7.5f erate %6.4f)\n",
                 fid, placements[i].tigID, tigReads, placements[i].position.bgn, placements[i].position.end, placements[i].fCoverage, erate);

      //  Keep the first acceptable placement, then only replace it with a strictly better one.
      if ((b == UINT32_MAX) || (erate < bestErate)) {
        b         = i;
        bestErate = erate;
      }
    }

    //  If we didn't find a best, b will be invalid; set positions for adding to a new tig.
    //  If we did, save both the position it was placed at, and the tigID it was placed in.

    if (b == UINT32_MAX) {
      if ((enableLog == true) && (logFileFlagSet(LOG_PLACE_UNPLACED)))
        writeLog("frag %8u remains unplaced\n", fid);

      placedPos[fid].bgn = 0;
      placedPos[fid].end = FI->fragmentLength(fid);
    }

    else {
      if ((enableLog == true) && (logFileFlagSet(LOG_PLACE_UNPLACED)))
        writeLog("frag %8u placed tig %6u (%6u reads) at %8u-%8u (cov %7.5f erate %6.4f)\n",
                 fid, placements[b].tigID, (uint32)unitigs[placements[b].tigID]->ufpath.size(), placements[b].position.bgn, placements[b].position.end, placements[b].fCoverage, bestErate);

      placedTig[fid] = placements[b].tigID;
      placedPos[fid] = placements[b].position;
    }
  }

  //  All reads placed, now just dump them in their correct tigs.

  for (uint32 fid=1; fid<fiLimit+1; fid++) {
    Unitig  *tig = NULL;

    if (Unitig::fragIn(fid) > 0)
      continue;

    //  If not placed, dump it in a new unitig.  Well, not anymore.  These reads were not placed in
    //  any tig initially, were not allowed to seed a tig, and now, could find no place to go.
    //  They're garbage.  Plus, it screws up the logging above because we don't know the new tig ID
    //  until now.

    if (placedTig[fid] == 0) {
      if (OG->isContained(fid))
        nFailedContained++;
      else
        nFailed++;

      //tig = unitigs.newUnitig(false);
    }

    //  Otherwise, it was placed somewhere, grab the tig.

    else {
      if (OG->isContained(fid))
        nPlacedContained++;
      else
        nPlaced++;

      tig = unitigs[placedTig[fid]];
    }

    //  Add it to the tig, if there is one.  Logging for this is above.

    if (tig) {
      ufNode  frg;

      frg.ident     = fid;
      frg.contained = 0;
      frg.parent    = 0;
      frg.ahang     = 0;
      frg.bhang     = 0;
      frg.position  = placedPos[fid];

      tig->addFrag(frg, 0, false);
    }
  }

  //  Cleanup.

  delete [] placedPos;
  delete [] placedTig;

  writeLog("placeContains()-- Placed %u contained reads and %u unplaced reads.\n",
           nPlacedContained, nPlaced);
  writeLog("placeContains()-- Failed to place %u contained reads (too high error suspected) and %u unplaced reads (lack of overlaps suspected).\n",
           nFailedContained, nFailed);

  //  But wait!  All the tigs need to be sorted.  Well, not really _all_, but the hard ones to sort
  //  are big, and those quite likely had reads added to them, so it's really not worth the effort
  //  of tracking which ones need sorting, since the ones that don't need it are trivial to sort.

  for (uint32 ti=1; ti<unitigs.size(); ti++) {
    Unitig  *utg = unitigs[ti];

    if (utg)
      utg->sort();
  }
}
//  For every non-contained read that sits in a tig listed in potentialBubbles, compute all
//  placements of the read and keep those that put it, fully and at low enough error, into one of
//  the candidate target tigs listed for its own tig.
//
//  A placement is discarded when:
//    - it is in the read's own tig;
//    - the target tig is tig 0, is missing, or holds a single read, or the placement covers less
//      than 99% of the read;
//    - the target tig is not listed in potentialBubbles[] for the read's tig;
//    - overlapConsistentWithTig(deviationBubble, ...) returns less than 0.5 for it.
//
//  Parameters:
//    unitigs          - all tigs, indexed by tig ID.
//    potentialBubbles - for each potential bubble tig ID, the tig IDs it could be placed into.
//    deviationBubble  - passed through to overlapConsistentWithTig().
//
//  Returns a new[]-allocated array of FI->numFragments()+1 vectors, indexed by read ID, holding
//  the surviving placements of each read.  This function does not free it; releasing it with
//  delete[] is presumably the caller's job.
//
vector<overlapPlacement> *
findBubbleReadPlacements(UnitigVector  &unitigs,
                         BubTargetList &potentialBubbles,
                         double         deviationBubble) {
  uint32  fiLimit      = FI->numFragments();
  uint32  fiNumThreads = omp_get_max_threads();
  uint32  fiBlockSize  = (fiLimit < 1000 * fiNumThreads) ? fiNumThreads : fiLimit / 999;

  vector<overlapPlacement>   *placed = new vector<overlapPlacement> [fiLimit + 1];

  //  Read IDs run from 1 to fiLimit inclusive (placed[] is sized fiLimit+1 for that reason), so
  //  the loop must include fiLimit or the last read is never examined.

#pragma omp parallel for schedule(dynamic, fiBlockSize)
  for (uint32 fi=1; fi<fiLimit+1; fi++) {
    uint32  rdAtigID = Unitig::fragIn(fi);

    if ((rdAtigID == 0) ||                           //  Read not placed in a tig, ignore it.
        (OG->isContained(fi)) ||                     //  Read is contained, ignore it.
        (potentialBubbles.count(rdAtigID) == 0))     //  Read isn't in a potential bubble, ignore it.
      continue;

    Unitig  *rdAtig = unitigs[rdAtigID];
    ufNode  *rdA    = &rdAtig->ufpath[ Unitig::pathPosition(fi) ];

    //  NOTE(review): the overlaps returned here are not used by this function.  The call is kept
    //  because its side effects (if any) are not visible from here -- confirm before removing it.
    uint32  ovlLen = 0;
    OC->getOverlaps(rdA->ident, AS_MAX_ERATE, ovlLen);

    //  The candidate target tigs for this read's tig.  The entry exists (count() != 0 above), so
    //  this lookup does not add one.  It is the same for every placement of this read.
    vector<uint32>  &pbubbles = potentialBubbles[rdAtigID];

    //  Compute all placements for this read.

    vector<overlapPlacement>   placements;

    placeFragUsingOverlaps(unitigs, AS_MAX_ERATE, NULL, rdA->ident, placements);

    //  Weed out placements that aren't for bubbles, or that are for bubbles but are poor quality.  Or are to ourself!

    for (uint32 pi=0; pi<placements.size(); pi++) {
      uint32   rdBtigID = placements[pi].tigID;
      Unitig  *rdBtig   = unitigs[rdBtigID];

      //  Number of reads in the target tig, for logging only.  rdBtig can be NULL, so never
      //  dereference it directly in a log call; report zero reads in that case.
      uint32   rdBtigReads = (rdBtig != NULL) ? (uint32)rdBtig->ufpath.size() : 0;

      bool     isFwd = (placements[pi].position.bgn < placements[pi].position.end);
      uint32   lo    = (isFwd) ? placements[pi].position.bgn : placements[pi].position.end;
      uint32   hi    = (isFwd) ? placements[pi].position.end : placements[pi].position.bgn;

      double   erate = placements[pi].errors / placements[pi].aligned;

      //  Ignore the placement if it is to ourself.

      if (rdAtigID == rdBtigID)
        continue;

      //  Ignore the placement if it is to a non-tig / singleton read, or if it didn't place the
      //  read fully.

      if ((rdBtigID == 0) ||
          (rdBtig   == NULL) ||
          (rdBtig->ufpath.size() == 1) ||
          (placements[pi].fCoverage < 0.99)) {
        if (logFileFlagSet(LOG_BUBBLE_DETAIL))
          writeLog("tig %6u frag %8u -> tig %6u %6u reads at %8u-%8u (cov %7.5f erate %6.4f) - PARTIALLY PLACED\n",
                   rdAtigID, placements[pi].frgID, placements[pi].tigID, rdBtigReads, placements[pi].position.bgn, placements[pi].position.end, placements[pi].fCoverage, erate);
        continue;
      }

      //  Ignore the placement if it isn't to one of our bubble-popping candidate unitigs.

      bool  isCandidate = false;

      for (uint32 pb=0; pb<pbubbles.size(); pb++) {
        if (pbubbles[pb] == rdBtigID) {
          isCandidate = true;
          break;
        }
      }

      if (isCandidate == false) {
        if (logFileFlagSet(LOG_BUBBLE_DETAIL))
          writeLog("tig %6u frag %8u -> tig %6u %6u reads at %8u-%8u (cov %7.5f erate %6.4f) - NOT CANDIDATE TIG\n",
                   rdAtigID, placements[pi].frgID, placements[pi].tigID, rdBtigReads, placements[pi].position.bgn, placements[pi].position.end, placements[pi].fCoverage, erate);
        continue;
      }

      //  Ignore the placement if it is too diverged from the destination tig.

      if (rdBtig->overlapConsistentWithTig(deviationBubble, lo, hi, erate) < 0.5) {
        if (logFileFlagSet(LOG_BUBBLE_DETAIL))
          writeLog("tig %6u frag %8u -> tig %6u %6u reads at %8u-%8u (cov %7.5f erate %6.4f) - HIGH ERROR\n",
                   rdAtigID, placements[pi].frgID, placements[pi].tigID, rdBtigReads, placements[pi].position.bgn, placements[pi].position.end, placements[pi].fCoverage, erate);
        continue;
      }

      //  Good placement!

      if (logFileFlagSet(LOG_BUBBLE_DETAIL))
        writeLog("tig %6u frag %8u -> tig %6u %6u reads at %8u-%8u (cov %7.5f erate %6.4f)\n",
                 rdAtigID, placements[pi].frgID, placements[pi].tigID, rdBtigReads, placements[pi].position.bgn, placements[pi].position.end, placements[pi].fCoverage, erate);

      //  Each iteration only appends to its own placed[fi].
      placed[fi].push_back(placements[pi]);
    }
  }

  return(placed);
}