//  Allocate a new Unitig, assign it the next sequential ID, and store it in
//  the block-allocated pointer list.  ID assignment and list update are
//  serialized with an OpenMP critical section, so this is safe to call from
//  parallel regions.
//
//  Returns the new (empty) Unitig.  Ownership stays with the UnitigVector.
//
Unitig *
UnitigVector::newUnitig(bool verbose) {
  Unitig  *u = new Unitig();

#pragma omp critical
  {
    u->_id = _totalUnitigs++;

    if (verbose)
      writeLog("Creating Unitig %d\n", u->_id);

    //  If the current block is full, allocate a new one.
    if (_blockNext >= _blockSize) {
      assert(_numBlocks < _maxBlocks);

      _blocks[_numBlocks] = new Unitig * [_blockSize];

      //  Fixed: the elements are 'Unitig *', not 'Unitig **' (same size on
      //  common platforms, but the old sizeof was semantically wrong).
      memset(_blocks[_numBlocks], 0, sizeof(Unitig *) * _blockSize);

      _numBlocks++;
      _blockNext = 0;
    }

    _blocks[_numBlocks-1][_blockNext++] = u;

    //  The rest are just sanity checks.

    assert((u->id() / _blockSize) == (_numBlocks - 1));
    assert((u->id() % _blockSize) == (_blockNext - 1));

    assert(operator[](u->id()) == u);
  }

  return(u);
};
void UnitigVector::computeArrivalRate(const char *prefix, const char *label) { uint32 tiLimit = size(); uint32 numThreads = omp_get_max_threads(); uint32 blockSize = (tiLimit < 100000 * numThreads) ? numThreads : tiLimit / 99999; fprintf(stderr, "Computing arrival rates for %u unitigs using %u threads.\n", tiLimit, numThreads); vector<int32> hist[6]; //#pragma omp parallel for schedule(dynamic, blockSize) for (uint32 ti=0; ti<tiLimit; ti++) { Unitig *tig = operator[](ti); if (tig == NULL) continue; if (tig->ufpath.size() == 1) continue; tig->computeArrivalRate(prefix, label, hist); } for (uint32 ii=1; ii<6; ii++) { char N[FILENAME_MAX]; sprintf(N, "%s.arrivalRate.%u.dat", prefix, ii); FILE *F = fopen(N, "w"); for (uint32 jj=0; jj<hist[ii].size(); jj++) fprintf(F, "%d\n", hist[ii][jj]); fclose(F); } }
//  Give every surviving (non-deleted) read that was never placed into a
//  unitig its own singleton unitig.  When promotion is disabled, such reads
//  are instead marked as ignored and dropped from the assembly.
//
void
promoteToSingleton(UnitigVector &unitigs, bool enablePromoteToSingleton) {

  for (uint32 fid=1; fid<=FI->numFragments(); fid++) {
    if (Unitig::fragIn(fid) != 0)        //  Already in a unitig.
      continue;

    if (FI->fragmentLength(fid) == 0)    //  Deleted read.
      continue;

    if (enablePromoteToSingleton == false) {
      writeLog("promoteToSingleton()-- Repeat fragment "F_U32" removed from assembly.\n", fid);
      FI->markAsIgnore(fid);
      continue;
    }

    //  Build a one-read unitig spanning the whole read.

    Unitig  *tig = unitigs.newUnitig(false);
    ufNode   read;

    read.ident             = fid;
    read.contained         = 0;
    read.parent            = 0;
    read.ahang             = 0;
    read.bhang             = 0;
    read.position.bgn      = 0;
    read.position.end      = FI->fragmentLength(fid);
    read.containment_depth = 0;

    tig->addFrag(read, 0, false);
  }
}
//  For any singleton unitig, eject the read and delete the unitig.  Eventually,
//  we will stop making singleton unitigs.
//
void
breakSingletonTigs(UnitigVector &unitigs) {
  uint32  removed = 0;

  for (uint32 ti=1; ti<unitigs.size(); ti++) {
    Unitig  *utg = unitigs[ti];

    if (utg == NULL)
      continue;

    if (utg->ufpath.size() > 1)
      continue;

    unitigs[ti] = NULL;                       //  Remove the unitig from the list
    utg->removeFrag(utg->ufpath[0].ident);    //  Eject the read
    delete utg;                               //  Reclaim space
    removed++;                                //  Count
  }

  //  Bug fix: the pluralization tests were inverted ('!= 1' printed the "s"
  //  suffix only when the count WAS one).
  writeLog("Removed %u read%s from %u singleton unitig%s.\n",
           removed, (removed == 1) ? "" : "s",
           removed, (removed == 1) ? "" : "s");
}
//  Place every unplaced, non-deleted read into its own new singleton unitig.
//  (Behaves like the two-argument overload with promotion always enabled.)
//
void
promoteToSingleton(UnitigVector &unitigs) {
  for (uint32 fi=1; fi<=FI->numFragments(); fi++) {
    if (Unitig::fragIn(fi) != 0)       //  Placed already
      continue;

    if (FI->fragmentLength(fi) == 0)   //  Deleted.
      continue;

    Unitig  *utg = unitigs.newUnitig(false);
    ufNode   frag;

    frag.ident             = fi;
    frag.contained         = 0;
    frag.parent            = 0;
    frag.ahang             = 0;
    frag.bhang             = 0;
    frag.position.bgn      = 0;
    frag.position.end      = FI->fragmentLength(fi);
    frag.containment_depth = 0;   //  Bug fix: was left uninitialized here; the other overload sets it.

    utg->addFrag(frag, 0, false);
  }
}
//  Sanity check:  every non-deleted read must be placed in exactly one unitig,
//  and no read ID may exceed the number of reads.  Aborts (assert) on failure.
//
void
checkUnitigMembership(UnitigVector &unitigs) {
  uint32 *inUnitig = new uint32 [FI->numFragments()+1];
  uint32  noUnitig = 0xffffffff;

  //  All reads start off not placed in a unitig.
  for (uint32 i=0; i<FI->numFragments()+1; i++)
    inUnitig[i] = noUnitig;

  //  Over all unitigs, remember where each read is.

  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig  *tig = unitigs[ti];

    if (tig == NULL)
      continue;

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode  *frg = &tig->ufpath[fi];

      //  Format fix: fi is uint32, so %u, not %d.
      if (frg->ident > FI->numFragments())
        fprintf(stderr, "tig %u ufpath[%u] ident %u more than number of reads %u\n",
                tig->id(), fi, frg->ident, FI->numFragments());

      if (inUnitig[frg->ident] != noUnitig)
        fprintf(stderr, "tig %u ufpath[%u] ident %u placed multiple times\n",
                tig->id(), fi, frg->ident);

      assert(frg->ident <= FI->numFragments());   //  Can't be out of range.
      assert(inUnitig[frg->ident] == noUnitig);   //  Read must be not placed yet.

      inUnitig[frg->ident] = ti;
    }
  }

  //  Find any read not placed in a unitig.

  for (uint32 i=0; i<FI->numFragments()+1; i++) {
    if (FI->fragmentLength(i) == 0)   //  Deleted read.
      continue;

    assert(inUnitig[i] != 0);         //  There shouldn't be a unitig 0.
    assert(inUnitig[i] != noUnitig);  //  The read should be in a unitig.
  }

  delete [] inUnitig;
}
void UnitigVector::reportErrorProfiles(const char *prefix, const char *label) { uint32 tiLimit = size(); uint32 numThreads = omp_get_max_threads(); uint32 blockSize = (tiLimit < 100000 * numThreads) ? numThreads : tiLimit / 99999; for (uint32 ti=0; ti<tiLimit; ti++) { Unitig *tig = operator[](ti); if (tig == NULL) continue; if (tig->ufpath.size() == 1) continue; tig->reportErrorProfile(prefix, label); } }
//  Create a fresh unitig and copy the given run of fragments into it, shifting
//  coordinates so the first fragment starts at position zero.
//
static
void
makeNewUnitig(UnitigVector &unitigs,
              uint32        splitFragsLen,
              ufNode       *splitFrags) {
  Unitig  *newtig = unitigs.newUnitig(false);

  if (logFileFlagSet(LOG_MATE_SPLIT_DISCONTINUOUS))
    writeLog("splitDiscontinuous()-- new tig "F_U32" with "F_U32" fragments (starting at frag "F_U32").\n",
             newtig->id(), splitFragsLen, splitFrags[0].ident);

  int  offset = -MIN(splitFrags[0].position.bgn, splitFrags[0].position.end);

  //  The first fragment anchors the new tig.  This should already be true,
  //  but we force it still.
  splitFrags[0].contained = 0;

  for (uint32 ff=0; ff<splitFragsLen; ff++)
    newtig->addFrag(splitFrags[ff], offset, false);
}
//  Iteratively place contained fragments into the unitig of their best
//  container.  Iterates until a pass places nothing: a containee's container
//  may itself be a containee that only gets placed on a later pass.
//
void
placeContainsUsingBestOverlaps(UnitigVector &unitigs) {
  uint32   fragsPlaced  = 1;   //  Non-zero so the loop runs at least once.
  uint32   fragsPending = 0;

  logFileFlags &= ~LOG_PLACE_FRAG;   //  Suppress per-fragment placement logging here.

  while (fragsPlaced > 0) {
    fragsPlaced  = 0;
    fragsPending = 0;

    writeLog("==> PLACING CONTAINED FRAGMENTS\n");

    for (uint32 fid=1; fid<FI->numFragments()+1; fid++) {
      BestContainment *bestcont = OG->getBestContainer(fid);
      Unitig          *utg;

      if (bestcont->isContained == false)   //  Not a contained fragment.
        continue;

      if (Unitig::fragIn(fid) != 0)         //  Containee already placed.
        continue;

      if (Unitig::fragIn(bestcont->container) == 0) {   //  Container not placed (yet).
        fragsPending++;
        continue;
      }

      utg = unitigs[Unitig::fragIn(bestcont->container)];
      utg->addContainedFrag(fid, bestcont, logFileFlagSet(LOG_INITIAL_CONTAINED_PLACEMENT));

      if (utg->id() != Unitig::fragIn(fid))
        writeLog("placeContainsUsingBestOverlaps()-- FAILED to add frag %d to unitig %d.\n", fid, bestcont->container);
      assert(utg->id() == Unitig::fragIn(fid));

      fragsPlaced++;
    }

    writeLog("==> PLACING CONTAINED FRAGMENTS - placed %d fragments; still need to place %d\n",
             fragsPlaced, fragsPending);

    //  If nothing was placed but reads remain pending, their containers form
    //  an unresolvable cycle ("zombies"); stop trying.
    if ((fragsPlaced == 0) && (fragsPending > 0)) {
      writeLog("Stopping contained fragment placement due to zombies.\n");
      fragsPlaced  = 0;
      fragsPending = 0;
    }
  }

  //  Placements were appended to the ends of tigs; restore sorted order.
  for (uint32 ti=1; ti<unitigs.size(); ti++) {
    Unitig *utg = unitigs[ti];

    if (utg)
      utg->sort();
  }
}
//  Dump the current unitigs to an intermediate tigStore -- but only when
//  LOG_INTERMEDIATE_UNITIGS logging is enabled.
//
void
reportUnitigs(UnitigVector &unitigs, const char *prefix, const char *name) {

  if (logFileFlagSet(LOG_INTERMEDIATE_UNITIGS) == 0)
    return;

  //  Count reads over all tigs, and the total length of non-trivial tigs, to
  //  decide how many reads go into each store partition.

  uint32  totalReads  = 0;
  uint64  totalLength = 0;

  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig  *utg = unitigs[ti];

    if (utg == NULL)
      continue;

    totalReads += utg->ufpath.size();

    if (utg->ufpath.size() > 2)
      totalLength += utg->getLength();
  }

  //  Smaller assemblies get fewer reads per partition.

  uint32  divisor = 127;

  if      (totalLength < 16 * 1024 * 1024)
    divisor = 7;
  else if (totalLength < 64 * 1024 * 1024)
    divisor = 63;

  uint32  readsPerPartition = totalReads / divisor;

  char  tigStorePath[FILENAME_MAX];
  sprintf(tigStorePath, "%s.%03u.%s.tigStore", prefix, logFileOrder, name);

  //  Failing to do this results in consensus running about 40 times slower.
  //  Three hours instead of five minutes.
  setParentAndHang(unitigs);

  writeUnitigsToStore(unitigs, tigStorePath, tigStorePath, readsPerPartition, false);
}
void UnitigVector::computeErrorProfiles(const char *prefix, const char *label) { uint32 tiLimit = size(); uint32 numThreads = omp_get_max_threads(); uint32 blockSize = (tiLimit < 100000 * numThreads) ? numThreads : tiLimit / 99999; fprintf(stderr, "Computing error profiles for %u unitigs using %u threads.\n", tiLimit, numThreads); //#pragma omp parallel for schedule(dynamic, blockSize) for (uint32 ti=0; ti<tiLimit; ti++) { Unitig *tig = operator[](ti); if (tig == NULL) continue; if (tig->ufpath.size() == 1) continue; tig->computeErrorProfile(prefix, label); } fprintf(stderr, "Computing error profiles - FINISHED.\n"); }
//  For every unitig, report the best overlaps contained in the
//  unitig, and all overlaps contained in the unitig.
//
//  Wow, this is ancient.
//
//  NOTE(review): everything that would actually write a record is inside
//  '#if 0' blocks, so at present this only creates an EMPTY
//  '<fileprefix>.unused.ovl' file.  The dead code is kept for reference.
//
void
writeOverlapsUsed(UnitigVector &unitigs,
                  char         *fileprefix) {
  char  filename[FILENAME_MAX] = {0};

#if 0
  GenericMesg  pmesg;
  OverlapMesg  omesg;
#endif

  sprintf(filename, "%s.unused.ovl", fileprefix);

  FILE *file = fopen(filename, "w");
  assert(file != NULL);

#if 0
  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig  *utg = unitigs[ti];

    if (utg == NULL)
      continue;

    for (uint32 fi=0; fi<utg->ufpath.size(); fi++) {
      ufNode  *frg = &utg->ufpath[fi];

      //  Where is our best overlap?  Contained or dovetail?

      BestEdgeOverlap *bestedge5 = OG->getBestEdgeOverlap(frg->ident, false);
      BestEdgeOverlap *bestedge3 = OG->getBestEdgeOverlap(frg->ident, true);

      int  bestident5 = 0;
      int  bestident3 = 0;

      if (bestedge5) {
        bestident5 = bestedge5->fragId();

        //  Emit the 5' best edge only if it leaves this unitig.
        if ((bestident5 > 0) && (utg->fragIn(bestident5) != utg->id())) {
          omesg.aifrag          = frg->ident;
          omesg.bifrag          = bestident5;
          omesg.ahg             = bestedge5->ahang();
          omesg.bhg             = bestedge5->bhang();
          omesg.orientation.setIsUnknown();
          omesg.overlap_type    = AS_DOVETAIL;
          omesg.quality         = 0.0;
          omesg.min_offset      = 0;
          omesg.max_offset      = 0;
          omesg.polymorph_ct    = 0;
          omesg.alignment_trace = NULL;
#ifdef AS_MSG_USE_OVL_DELTA
          omesg.alignment_delta = NULL;
#endif

          //  This overlap is off of the 5' end of this fragment.
          if (bestedge5->frag3p() == false)
            omesg.orientation.setIsOuttie();
          if (bestedge5->frag3p() == true)
            omesg.orientation.setIsAnti();

          pmesg.t = MESG_OVL;
          pmesg.m = &omesg;

          WriteProtoMesg_AS(file, &pmesg);
        }
      }

      if (bestedge3) {
        bestident3 = bestedge3->fragId();

        //  Emit the 3' best edge only if it leaves this unitig.
        if ((bestident3 > 0) && (utg->fragIn(bestident3) != utg->id())) {
          omesg.aifrag          = frg->ident;
          omesg.bifrag          = bestident3;
          omesg.ahg             = bestedge3->ahang();
          omesg.bhg             = bestedge3->bhang();
          omesg.orientation.setIsUnknown();
          omesg.overlap_type    = AS_DOVETAIL;
          omesg.quality         = 0.0;
          omesg.min_offset      = 0;
          omesg.max_offset      = 0;
          omesg.polymorph_ct    = 0;
          omesg.alignment_trace = NULL;
#ifdef AS_MSG_USE_OVL_DELTA
          omesg.alignment_delta = NULL;
#endif

          //  This overlap is off of the 3' end of this fragment.
          if (bestedge3->frag3p() == false)
            omesg.orientation.setIsNormal();
          if (bestedge3->frag3p() == true)
            omesg.orientation.setIsInnie();

          pmesg.t = MESG_OVL;
          pmesg.m = &omesg;

          WriteProtoMesg_AS(file, &pmesg);
        }
      }
    }
  }
#endif

  fclose(file);
}
//  After splitting and ejecting some contains, check for discontinuous unitigs.
//
//  A unitig is discontinuous when consecutive reads fail to overlap by at
//  least 'minOverlap' bases.  Each contiguous piece becomes a new unitig,
//  except that a lone unmated contained read is moved back to its container.
//
void
splitDiscontinuousUnitigs(UnitigVector &unitigs, uint32 minOverlap) {

  writeLog("==> SPLIT DISCONTINUOUS\n");

  uint32    numTested  = 0;
  uint32    numSplit   = 0;
  uint32    numCreated = 0;

  uint32    splitFragsLen = 0;
  uint32    splitFragsMax = 0;
  ufNode   *splitFrags    = NULL;

  //  First pass: sort each tig, track the largest read count (to size the
  //  scratch array), and shift coordinates so each tig starts at zero.

  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig  *tig = unitigs[ti];

    if ((tig == NULL) || (tig->ufpath.size() < 2))
      continue;

    //  Unitig must be sorted.  Someone upstream is screwing this up.
    tig->sort();

    //  We'll want to build an array of new fragments to split out.  This can be up
    //  to the size of the largest unitig.
    splitFragsMax = MAX(splitFragsMax, tig->ufpath.size());

    //  Check that the unitig starts at position zero.  Not critical for the next loop, but
    //  needs to be done sometime.
    int32  minPos = MIN(tig->ufpath[0].position.bgn, tig->ufpath[0].position.end);

    if (minPos == 0)
      continue;

    writeLog("splitDiscontinuous()-- tig "F_U32" offset messed up; reset by "F_S32".\n", tig->id(), minPos);

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode  *frg = &tig->ufpath[fi];

      frg->position.bgn -= minPos;
      frg->position.end -= minPos;
    }
  }

  splitFrags = new ufNode [splitFragsMax];

  //  Now, finally, we can check for gaps in unitigs.

  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig  *tig = unitigs[ti];

    if ((tig == NULL) || (tig->ufpath.size() < 2))
      continue;

    //  We don't expect many unitigs to be broken, so we'll do a first quick pass to just
    //  test if it is.

    int32  maxEnd   = MAX(tig->ufpath[0].position.bgn, tig->ufpath[0].position.end);
    bool   isBroken = false;

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode  *frg = &tig->ufpath[fi];

      int32  bgn = MIN(frg->position.bgn, frg->position.end);
      int32  end = MAX(frg->position.bgn, frg->position.end);

      //  No overlap of at least minOverlap bases to what came before: broken.
      if (bgn > maxEnd - minOverlap) {
        isBroken = true;
        break;
      }

      maxEnd = MAX(maxEnd, end);
    }

    numTested++;

    if (isBroken == false)
      continue;

    numSplit++;

    //  Dang, busted unitig.  Fix it up.

    splitFragsLen = 0;
    maxEnd        = 0;

    if (logFileFlagSet(LOG_MATE_SPLIT_DISCONTINUOUS))
      writeLog("splitDiscontinuous()-- discontinuous tig "F_U32" with "F_SIZE_T" fragments broken into:\n",
               tig->id(), tig->ufpath.size());

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode  *frg = &tig->ufpath[fi];

      int32  bgn = MIN(frg->position.bgn, frg->position.end);
      int32  end = MAX(frg->position.bgn, frg->position.end);

      //  Good thick overlap exists to this fragment, save it.

      if (bgn <= maxEnd - minOverlap) {
        assert(splitFragsLen < splitFragsMax);
        splitFrags[splitFragsLen++] = *frg;
        maxEnd = MAX(maxEnd, end);
        continue;
      }

      //  No thick overlap found.  We need to break right here before the current fragment.

      //  If there is exactly one fragment, and it's contained, and it's not mated, move it to the
      //  container.  (This has a small positive benefit over just making every read a singleton).
      //
      if ((splitFragsLen == 1) &&
          (FI->mateIID(splitFrags[0].ident) == 0) &&
          (splitFrags[0].contained != 0)) {
        Unitig  *dangler = unitigs[tig->fragIn(splitFrags[0].contained)];

        //  If the parent isn't in a unitig, we must have shattered the repeat unitig it was in.
        //  Do the same here.

        if (dangler == NULL) {
          if (logFileFlagSet(LOG_MATE_SPLIT_DISCONTINUOUS))
            writeLog("splitDiscontinuous()-- singleton frag "F_U32" shattered.\n",
                     splitFrags[0].ident);
          Unitig::removeFrag(splitFrags[0].ident);

        } else {
          assert(dangler->id() == tig->fragIn(splitFrags[0].contained));

          if (logFileFlagSet(LOG_MATE_SPLIT_DISCONTINUOUS))
            writeLog("splitDiscontinuous()-- old tig "F_U32" with "F_SIZE_T" fragments (contained frag "F_U32" moved here).\n",
                     dangler->id(), dangler->ufpath.size() + 1, splitFrags[0].ident);

          BestContainment  *bestcont = OG->getBestContainer(splitFrags[0].ident);

          assert(bestcont->isContained == true);

          dangler->addContainedFrag(splitFrags[0].ident, bestcont, false);
          dangler->bubbleSortLastFrag();

          assert(dangler->id() == Unitig::fragIn(splitFrags[0].ident));
        }
      }

      //  Otherwise, make an entirely new unitig for these fragments.
      else {
        numCreated++;
        makeNewUnitig(unitigs, splitFragsLen, splitFrags);
        tig = unitigs[ti];   //  Re-fetch; presumably creating a tig can invalidate the pointer -- TODO confirm.
      }

      //  Done with the split, save the current fragment.  This resets everything.

      splitFragsLen = 0;
      splitFrags[splitFragsLen++] = *frg;
      maxEnd = end;
    }

    //  If we did any splitting, then the length of the frags in splitFrags will be less than the length
    //  of the path in the current unitig.  Make a final new unitig for the remaining fragments.
    //
    if (splitFragsLen != tig->ufpath.size()) {
      numCreated++;
      makeNewUnitig(unitigs, splitFragsLen, splitFrags);

      delete unitigs[ti];
      unitigs[ti] = NULL;
    }
  }

  writeLog("splitDiscontinuous()-- Tested "F_U32" unitigs, split "F_U32" into "F_U32" new unitigs.\n",
           numTested, numSplit, numCreated);

  delete [] splitFrags;
}
//  For each read in a potential-bubble tig, compute every place the read could
//  also go in some other tig.  Returns an array of placement vectors indexed
//  by read ID, filtered to full-coverage, low-error placements in candidate
//  bubble-target tigs.  Caller owns (and must delete []) the returned array.
//
vector<overlapPlacement> *
findBubbleReadPlacements(UnitigVector  &unitigs,
                         BubTargetList &potentialBubbles,
                         double         deviationBubble) {
  uint32  fiLimit      = FI->numFragments();
  uint32  fiNumThreads = omp_get_max_threads();
  uint32  fiBlockSize  = (fiLimit < 1000 * fiNumThreads) ? fiNumThreads : fiLimit / 999;

  vector<overlapPlacement>  *placed = new vector<overlapPlacement> [fiLimit + 1];

#pragma omp parallel for schedule(dynamic, fiBlockSize)
  for (uint32 fi=0; fi<fiLimit; fi++) {
    uint32  rdAtigID = Unitig::fragIn(fi);

    if ((rdAtigID == 0) ||                        //  Read not placed in a tig, ignore it.
        (OG->isContained(fi)) ||                  //  Read is contained, ignore it.
        (potentialBubbles.count(rdAtigID) == 0))  //  Read isn't in a potential bubble, ignore it.
      continue;

    Unitig  *rdAtig = unitigs[rdAtigID];
    ufNode  *rdA    = &rdAtig->ufpath[ Unitig::pathPosition(fi) ];
    bool     rdAfwd = (rdA->position.bgn < rdA->position.end);
    int32    rdAlo  = (rdAfwd) ? rdA->position.bgn : rdA->position.end;
    int32    rdAhi  = (rdAfwd) ? rdA->position.end : rdA->position.bgn;

    uint32       ovlLen = 0;
    BAToverlap  *ovl    = OC->getOverlaps(rdA->ident, AS_MAX_ERATE, ovlLen);

    set<uint32>  intersections;

    //if ((fi % 100) == 0)
    //  fprintf(stderr, "findBubbleReadPlacements()-- read %8u with %6u overlaps - %6.2f%% finished.\r",
    //          rdA->ident, ovlLen, 100.0 * fi / fiLimit);

    //  Compute all placements for this read.

    vector<overlapPlacement>  placements;

    placeFragUsingOverlaps(unitigs, AS_MAX_ERATE, NULL, rdA->ident, placements);

    //  Weed out placements that aren't for bubbles, or that are for bubbles but are poor quality. Or are to ourself!

    for (uint32 pi=0; pi<placements.size(); pi++) {
      uint32   rdBtigID = placements[pi].tigID;
      Unitig  *rdBtig   = unitigs[rdBtigID];

      uint32   lo = (placements[pi].position.bgn < placements[pi].position.end) ? placements[pi].position.bgn : placements[pi].position.end;
      uint32   hi = (placements[pi].position.bgn < placements[pi].position.end) ? placements[pi].position.end : placements[pi].position.bgn;

      double   erate = placements[pi].errors / placements[pi].aligned;

      //  Ignore the placement if it is to ourself.

      if (rdAtigID == rdBtigID) {
        //writeLog("tig %6u frag %8u -> tig %6u %6u reads at %8u-%8u (cov %7.5f erate %6.4f) - SAME TIG\n",
        //         rdAtigID, placements[pi].frgID, placements[pi].tigID, rdBtig->ufpath.size(), placements[pi].position.bgn, placements[pi].position.end, placements[pi].fCoverage, erate);
        continue;
      }

      //  Ignore the placement if it is to a non-tig / singleton read, or if it didn't place the
      //  read fully.

      if ((rdBtigID == 0) ||
          (rdBtig == NULL) ||
          (rdBtig->ufpath.size() == 1) ||
          (placements[pi].fCoverage < 0.99)) {
        if (logFileFlagSet(LOG_BUBBLE_DETAIL))
          writeLog("tig %6u frag %8u -> tig %6u %6u reads at %8u-%8u (cov %7.5f erate %6.4f) - PARTIALLY PLACED\n",
                   rdAtigID, placements[pi].frgID, placements[pi].tigID, rdBtig->ufpath.size(), placements[pi].position.bgn, placements[pi].position.end, placements[pi].fCoverage, erate);
        continue;
      }

      //  Ignore the placement if it isn't to one of our bubble-popping candidate unitigs.

      bool             dontcare = true;
      vector<uint32>  &pbubbles = potentialBubbles[rdAtigID];

      for (uint32 pb=0; pb<pbubbles.size(); pb++) {
        if (pbubbles[pb] == rdBtigID)
          dontcare = false;
      }

      if (dontcare) {
        if (logFileFlagSet(LOG_BUBBLE_DETAIL))
          writeLog("tig %6u frag %8u -> tig %6u %6u reads at %8u-%8u (cov %7.5f erate %6.4f) - NOT CANDIDATE TIG\n",
                   rdAtigID, placements[pi].frgID, placements[pi].tigID, rdBtig->ufpath.size(), placements[pi].position.bgn, placements[pi].position.end, placements[pi].fCoverage, erate);
        continue;
      }

      //  Ignore the placement if it is too diverged from the destination tig.

      if (rdBtig->overlapConsistentWithTig(deviationBubble, lo, hi, erate) < 0.5) {
        if (logFileFlagSet(LOG_BUBBLE_DETAIL))
          writeLog("tig %6u frag %8u -> tig %6u %6u reads at %8u-%8u (cov %7.5f erate %6.4f) - HIGH ERROR\n",
                   rdAtigID, placements[pi].frgID, placements[pi].tigID, rdBtig->ufpath.size(), placements[pi].position.bgn, placements[pi].position.end, placements[pi].fCoverage, erate);
        continue;
      }

      //  Good placement!

      if (logFileFlagSet(LOG_BUBBLE_DETAIL))
        writeLog("tig %6u frag %8u -> tig %6u %6u reads at %8u-%8u (cov %7.5f erate %6.4f)\n",
                 rdAtigID, placements[pi].frgID, placements[pi].tigID, rdBtig->ufpath.size(), placements[pi].position.bgn, placements[pi].position.end, placements[pi].fCoverage, erate);

      placed[fi].push_back(placements[pi]);
    }
  }

  return(placed);
}
//  Try to place every read that isn't yet in a tig, using all overlaps.
//  Phase 1 (parallel): for each unplaced read, find its best (lowest error)
//  full-coverage placement in a multi-read tig.  Phase 2 (serial): add each
//  placed read to its tig; reads with no acceptable placement are dropped.
//
//  'prefix' is accepted but currently unused.
//
void
placeUnplacedUsingAllOverlaps(UnitigVector &unitigs,
                              const char   *prefix) {
  uint32  fiLimit    = FI->numFragments();
  uint32  numThreads = omp_get_max_threads();
  uint32  blockSize  = (fiLimit < 100 * numThreads) ? numThreads : fiLimit / 99;

  uint32       *placedTig = new uint32      [FI->numFragments() + 1];
  SeqInterval  *placedPos = new SeqInterval [FI->numFragments() + 1];

  memset(placedTig, 0, sizeof(uint32)      * (FI->numFragments() + 1));
  memset(placedPos, 0, sizeof(SeqInterval) * (FI->numFragments() + 1));

  //  Just some logging.  Count the number of reads we try to place.

  uint32  nToPlaceContained = 0;
  uint32  nToPlace          = 0;
  uint32  nPlacedContained  = 0;
  uint32  nPlaced           = 0;
  uint32  nFailedContained  = 0;
  uint32  nFailed           = 0;

  for (uint32 fid=1; fid<FI->numFragments()+1; fid++)
    if (Unitig::fragIn(fid) == 0)
      if (OG->isContained(fid))
        nToPlaceContained++;
      else
        nToPlace++;

  writeLog("placeContains()-- placing %u contained and %u unplaced reads, with %d threads.\n",
           nToPlaceContained, nToPlace, numThreads);

  //  Do the placing!

#pragma omp parallel for schedule(dynamic, blockSize)
  for (uint32 fid=1; fid<FI->numFragments()+1; fid++) {
    bool  enableLog = true;

    if (Unitig::fragIn(fid) > 0)   //  Already placed, skip.
      continue;

    //  Place the read.

    vector<overlapPlacement>  placements;

    placeFragUsingOverlaps(unitigs, AS_MAX_ERATE, NULL, fid, placements);

    //  Search the placements for the highest expected identity placement using all overlaps in the unitig.

    uint32  b = UINT32_MAX;   //  Index of the best placement so far.

    for (uint32 i=0; i<placements.size(); i++) {
      Unitig *tig = unitigs[placements[i].tigID];

      if (placements[i].fCoverage < 0.99)   //  Ignore partially placed reads.
        continue;

      if (tig->ufpath.size() == 1)          //  Ignore placements in singletons.
        continue;

      uint32  bgn = (placements[i].position.bgn < placements[i].position.end) ? placements[i].position.bgn : placements[i].position.end;
      uint32  end = (placements[i].position.bgn < placements[i].position.end) ? placements[i].position.end : placements[i].position.bgn;

      double  erate = placements[i].errors / placements[i].aligned;

      //  Reject placements too diverged from the rest of the tig.
      if (tig->overlapConsistentWithTig(5.0, bgn, end, erate) < 0.5) {
        if ((enableLog == true) && (logFileFlagSet(LOG_PLACE_UNPLACED)))
          writeLog("frag %8u tested tig %6u (%6u reads) at %8u-%8u (cov %7.5f erate %6.4f) - HIGH ERROR\n",
                   fid, placements[i].tigID, tig->ufpath.size(), placements[i].position.bgn, placements[i].position.end, placements[i].fCoverage, erate);
        continue;
      }

      if ((enableLog == true) && (logFileFlagSet(LOG_PLACE_UNPLACED)))
        writeLog("frag %8u tested tig %6u (%6u reads) at %8u-%8u (cov %7.5f erate %6.4f)\n",
                 fid, placements[i].tigID, tig->ufpath.size(), placements[i].position.bgn, placements[i].position.end, placements[i].fCoverage, erate);

      //  Keep the placement with the lowest error rate.
      if ((b == UINT32_MAX) ||
          (placements[i].errors / placements[i].aligned < placements[b].errors / placements[b].aligned))
        b = i;
    }

    //  If we didn't find a best, b will be invalid; set positions for adding to a new tig.
    //  If we did, save both the position it was placed at, and the tigID it was placed in.

    if (b == UINT32_MAX) {
      if ((enableLog == true) && (logFileFlagSet(LOG_PLACE_UNPLACED)))
        writeLog("frag %8u remains unplaced\n", fid);
      placedPos[fid].bgn = 0;
      placedPos[fid].end = FI->fragmentLength(fid);
    }

    else {
      if ((enableLog == true) && (logFileFlagSet(LOG_PLACE_UNPLACED)))
        writeLog("frag %8u placed tig %6u (%6u reads) at %8u-%8u (cov %7.5f erate %6.4f)\n",
                 fid, placements[b].tigID, unitigs[placements[b].tigID]->ufpath.size(),
                 placements[b].position.bgn, placements[b].position.end,
                 placements[b].fCoverage,
                 placements[b].errors / placements[b].aligned);
      placedTig[fid] = placements[b].tigID;
      placedPos[fid] = placements[b].position;
    }
  }

  //  All reads placed, now just dump them in their correct tigs.

  for (uint32 fid=1; fid<FI->numFragments()+1; fid++) {
    Unitig  *tig = NULL;
    ufNode   frg;

    if (Unitig::fragIn(fid) > 0)
      continue;

    //  If not placed, dump it in a new unitig.  Well, not anymore.  These reads were not placed in
    //  any tig initially, were not allowed to seed a tig, and now, could find no place to go.
    //  They're garbage.  Plus, it screws up the logging above because we don't know the new tig ID
    //  until now.

    if (placedTig[fid] == 0) {
      if (OG->isContained(fid))
        nFailedContained++;
      else
        nFailed++;

      //tig = unitigs.newUnitig(false);
    }

    //  Otherwise, it was placed somewhere, grab the tig.

    else {
      if (OG->isContained(fid))
        nPlacedContained++;
      else
        nPlaced++;

      tig = unitigs[placedTig[fid]];
    }

    //  Regardless, add it to the tig.  Logging for this is above.

    if (tig) {
      frg.ident     = fid;
      frg.contained = 0;
      frg.parent    = 0;
      frg.ahang     = 0;
      frg.bhang     = 0;
      frg.position  = placedPos[fid];

      tig->addFrag(frg, 0, false);
    }
  }

  //  Cleanup.

  delete [] placedPos;
  delete [] placedTig;

  writeLog("placeContains()-- Placed %u contained reads and %u unplaced reads.\n", nPlacedContained, nPlaced);
  writeLog("placeContains()-- Failed to place %u contained reads (too high error suspected) and %u unplaced reads (lack of overlaps suspected).\n", nFailedContained, nFailed);

  //  But wait!  All the tigs need to be sorted.  Well, not really _all_, but the hard ones to sort
  //  are big, and those quite likely had reads added to them, so it's really not worth the effort
  //  of tracking which ones need sorting, since the ones that don't need it are trivial to sort.

  for (uint32 ti=1; ti<unitigs.size(); ti++) {
    Unitig  *utg = unitigs[ti];

    if (utg)
      utg->sort();
  }
}
//  Find potential bubble tigs and classify each one by re-placing its reads
//  into candidate target tigs:
//    - orphan:  ALL reads place into a single target; reads are moved there
//               and the bubble tig is deleted.
//    - bubble:  reads from both ends (but not all) place; the tig is flagged
//               '_isBubble' so repeat detection can skip it.
//    - repeat:  multiple orphan/bubble placements; flagged '_isRepeat'.
//
void
popBubbles(UnitigVector &unitigs,
           double        deviationBubble) {

  BubTargetList  potentialBubbles;

  findPotentialBubbles(unitigs, potentialBubbles);

  writeStatus("popBubbles()-- Found "F_SIZE_T" potential bubbles.\n", potentialBubbles.size());

  //if (potentialBubbles.size() == 0)
  //  return;

  writeLog("\n");
  writeLog("Found "F_SIZE_T" potential bubbles.\n", potentialBubbles.size());
  writeLog("\n");

  vector<overlapPlacement>  *placed = findBubbleReadPlacements(unitigs, potentialBubbles, deviationBubble);

  //  We now have, in 'placed', a list of all the places that each read could be placed.  Decide if there is a _single_
  //  place for each bubble to be popped.

  uint32  tiLimit = unitigs.size();
  //uint32  tiNumThreads = omp_get_max_threads();
  //uint32  tiBlockSize  = (tiLimit < 100000 * tiNumThreads) ? tiNumThreads : tiLimit / 99999;

  //  Clear flags.
  for (uint32 ti=0; ti<tiLimit; ti++) {
    if (unitigs[ti]) {
      unitigs[ti]->_isBubble = false;
      unitigs[ti]->_isRepeat = false;
    }
  }

  //  In parallel, process the placements.

  for (uint32 ti=0; ti<tiLimit; ti++) {
    if (potentialBubbles.count(ti) == 0)   //  Not a potential bubble
      continue;

    //  Scan the bubble, decide if there are _ANY_ read placements.  Log appropriately.

    Unitig  *bubble        = unitigs[ti];
    bool     hasPlacements = false;

    for (uint32 fi=0; fi<bubble->ufpath.size(); fi++) {
      uint32  readID = bubble->ufpath[fi].ident;

      if (placed[readID].size() > 0)
        hasPlacements = true;
    }

    if (hasPlacements == false)
      writeLog("potential bubble %u had no valid placements (all were not contained in target tig)\n", ti);
    else
      writeLog("potential bubble %u\n", ti);

    //  Split the placements into piles for each target and build an interval list for each target.
    //  For each read in the tig, convert the vector of placements into interval lists, one list per target tig.

    map<uint32, intervalList<uint32> *>  targetIntervals;

    for (uint32 fi=0; fi<bubble->ufpath.size(); fi++) {
      uint32  readID = bubble->ufpath[fi].ident;

      for (uint32 pp=0; pp<placed[readID].size(); pp++) {
        uint32  tid = placed[readID][pp].tigID;

        assert(placed[readID][pp].frgID > 0);

        uint32  bgn = (placed[readID][pp].position.bgn < placed[readID][pp].position.end) ? placed[readID][pp].position.bgn : placed[readID][pp].position.end;
        uint32  end = (placed[readID][pp].position.bgn < placed[readID][pp].position.end) ? placed[readID][pp].position.end : placed[readID][pp].position.bgn;

        if (targetIntervals[tid] == NULL)
          targetIntervals[tid] = new intervalList<uint32>;

        //writeLog("read %u -> tig %u intervals %u-%u\n", readID, tid, bgn, end);

        targetIntervals[tid]->add(bgn, end-bgn);
      }
    }

    vector<candidatePop *>  targets;
    //  NOTE(review): the candidatePop objects pushed onto 'targets' are never
    //  deleted in this function -- looks like a leak; verify.

    //  Squish the intervals.  Create new candidatePops for each interval that isn't too big or
    //  small.  Assign each overlapPlacements to the correct candidatePop.

    for (map<uint32, intervalList<uint32> *>::iterator it=targetIntervals.begin(); it != targetIntervals.end(); ++it) {
      uint32                 targetID = it->first;
      intervalList<uint32>  *IL       = it->second;

      IL->merge();

      //  Discard intervals that are significantly too small or large.  Save the ones that are
      //  nicely sized.  Logging here isn't terribly useful, it's just repeated (out of order) later
      //  when we try to make sense of the read alignments.

      for (uint32 ii=0; ii<IL->numberOfIntervals(); ii++) {
        if ((IL->hi(ii) - IL->lo(ii) < 0.75 * bubble->getLength()) ||   //  Too small!
            (1.25 * bubble->getLength() < IL->hi(ii) - IL->lo(ii))) {   //  Too big!
          writeLog("tig %8u length %9u -> target %8u piece %2u position %9u-%9u length %8u - size mismatch, discarded\n",
                   bubble->id(), bubble->getLength(),
                   targetID, ii, IL->lo(ii), IL->hi(ii), IL->hi(ii) - IL->lo(ii));
          continue;
        }

        writeLog("tig %8u length %9u -> target %8u piece %2u position %9u-%9u length %8u\n",
                 bubble->id(), bubble->getLength(),
                 targetID, ii, IL->lo(ii), IL->hi(ii), IL->hi(ii) - IL->lo(ii));

        targets.push_back(new candidatePop(bubble, unitigs[targetID], IL->lo(ii), IL->hi(ii)));
      }

      delete IL;
    }

    targetIntervals.clear();

    //  If no targets, nothing to do.

    if (targets.size() == 0)
      continue;

    //  Run through the placements again, and assign them to the correct target.
    //
    //  For each read:
    //    For each acceptable placement:
    //      For each target location:
    //        If the placement is for this target, save it.

    for (uint32 fi=0; fi<bubble->ufpath.size(); fi++) {
      uint32  readID = bubble->ufpath[fi].ident;

      for (uint32 pp=0; pp<placed[readID].size(); pp++) {
        uint32  tid = placed[readID][pp].tigID;

        uint32  bgn = (placed[readID][pp].position.bgn < placed[readID][pp].position.end) ? placed[readID][pp].position.bgn : placed[readID][pp].position.end;
        uint32  end = (placed[readID][pp].position.bgn < placed[readID][pp].position.end) ? placed[readID][pp].position.end : placed[readID][pp].position.bgn;

        for (uint32 tt=0; tt<targets.size(); tt++)
          if ((targets[tt]->target->id() == tid) &&
              (targets[tt]->bgn < end) && (bgn < targets[tt]->end))
            targets[tt]->placed.push_back(placed[readID][pp]);
      }
    }

    //  Count the number of targets that have all the reads (later: in the correct order, etc, etc).  Remove those
    //  that don't.

    uint32  nTargets = 0;

    set<uint32>  tigReads;   //  Reads in the bubble tig.
    set<uint32>  tgtReads;   //  Reads in the bubble that have a placement in the target.

    //  Remove duplicate placements from each target.

    for (uint32 tt=0; tt<targets.size(); tt++) {
      candidatePop *t = targets[tt];

      //  Detect duplicates, keep the one with lower error.  There are a lot of duplicate
      //  placements, logging isn't terribly useful.

      for (uint32 aa=0; aa<t->placed.size(); aa++) {
        for (uint32 bb=0; bb<t->placed.size(); bb++) {
          if ((aa == bb) ||
              (t->placed[aa].frgID != t->placed[bb].frgID) ||
              (t->placed[aa].frgID == 0) ||
              (t->placed[bb].frgID == 0))
            continue;

          if (t->placed[aa].errors / t->placed[aa].aligned < t->placed[bb].errors / t->placed[bb].aligned) {
#ifdef SHOW_MULTIPLE_PLACEMENTS
            writeLog("duplicate read alignment for tig %u read %u - better %u-%u %.4f - worse %u-%u %.4f\n",
                     t->placed[aa].tigID, t->placed[aa].frgID,
                     t->placed[aa].position.bgn, t->placed[aa].position.end, t->placed[aa].errors / t->placed[aa].aligned,
                     t->placed[bb].position.bgn, t->placed[bb].position.end, t->placed[bb].errors / t->placed[bb].aligned);
#endif
            t->placed[bb] = overlapPlacement();   //  Blank out the worse copy.
          } else {
#ifdef SHOW_MULTIPLE_PLACEMENTS
            writeLog("duplicate read alignment for tig %u read %u - better %u-%u %.4f - worse %u-%u %.4f\n",
                     t->placed[aa].tigID, t->placed[aa].frgID,
                     t->placed[bb].position.bgn, t->placed[bb].position.end, t->placed[bb].errors / t->placed[bb].aligned,
                     t->placed[aa].position.bgn, t->placed[aa].position.end, t->placed[aa].errors / t->placed[aa].aligned);
#endif
            t->placed[aa] = overlapPlacement();   //  Blank out the worse copy.
          }
        }
      }

      //  Get rid of any now-empty entries.

      for (uint32 aa=t->placed.size(); aa--; ) {
        if (t->placed[aa].frgID == 0) {
          t->placed[aa] = t->placed.back();
          t->placed.pop_back();
        }
      }
    }

    //  Make a set of the reads in the bubble.  We'll compare each target against this to decide if all reads are placed.

    for (uint32 fi=0; fi<bubble->ufpath.size(); fi++)
      tigReads.insert(bubble->ufpath[fi].ident);

    uint32  nOrphan      = 0;   //  Full coverage; bubble can be popped.
    uint32  orphanTarget = 0;

    uint32  nBubble      = 0;   //  Partial coverage, bubble cannot be popped.
    uint32  bubbleTarget = 0;

    for (uint32 tt=0; tt<targets.size(); tt++) {
      tgtReads.clear();

      for (uint32 op=0; op<targets[tt]->placed.size(); op++) {
        if (logFileFlagSet(LOG_BUBBLE_DETAIL))
          writeLog("tig %8u length %9u -> target %8u piece %2u position %9u-%9u length %8u - read %7u at %9u-%9u\n",
                   bubble->id(), bubble->getLength(),
                   targets[tt]->target->id(), tt, targets[tt]->bgn, targets[tt]->end, targets[tt]->end - targets[tt]->bgn,
                   targets[tt]->placed[op].frgID,
                   targets[tt]->placed[op].position.bgn, targets[tt]->placed[op].position.end);

        assert(targets[tt]->placed[op].frgID > 0);
        tgtReads.insert(targets[tt]->placed[op].frgID);
      }

      //  Count the number of consecutive reads from the 5' or 3' end of the bubble that are placed
      //  in the target.
      //
      //  Also, count the number of reads in the bubble that are placed in the target.  Likely the
      //  same as n5 + n3.

      uint32  n5 = 0;
      uint32  n3 = 0;
      uint32  nt = 0;

      for (uint32 fi=0; fi<bubble->ufpath.size(); fi++)
        if (tgtReads.count(bubble->ufpath[fi].ident) > 0)
          n5++;
        else
          break;

      for (uint32 fi=bubble->ufpath.size(); fi-->0; )
        if (tgtReads.count(bubble->ufpath[fi].ident) > 0)
          n3++;
        else
          break;

      for (uint32 fi=0; fi<bubble->ufpath.size(); fi++)
        if (tgtReads.count(bubble->ufpath[fi].ident) > 0)
          nt++;

      //  Report now, before we nuke targets[tt] for being not a bubble!

      if ((nt == bubble->ufpath.size()) ||
          ((n5 > 0) && (n3 > 0)))
        writeLog("tig %8u length %9u -> target %8u piece %2u position %9u-%9u length %8u - expected %3"F_SIZE_TP" reads, had %3"F_SIZE_TP" reads. n5=%3u n3=%3u nt=%3u\n",
                 bubble->id(), bubble->getLength(),
                 targets[tt]->target->id(), tt, targets[tt]->bgn, targets[tt]->end, targets[tt]->end - targets[tt]->bgn,
                 tigReads.size(),
                 tgtReads.size(), n5, n3, nt);

      //  Decide if this is a bubble, orphan from construction, or repeat.

      if (nt == bubble->ufpath.size()) {
        nOrphan++;
        orphanTarget = tt;
      }

      else if ((n5 > 0) && (n3 > 0)) {
        nBubble++;
        bubbleTarget = tt;
      }
    }

    //  If no placements, pbbbt.

    if (nOrphan + nBubble == 0) {
      //writeLog("tig %8u length %8u reads %6u had no bubble or orphan placements.\n", bubble->id(), bubble->getLength(), bubble->ufpath.size());
      continue;
    }

    //  If multiple orphan and/or bubble placements, it's a repeat.

    if (nOrphan + nBubble > 1) {
      writeLog("tig %8u length %8u reads %6u - repeat - %u orphan %u bubble placements.\n", bubble->id(), bubble->getLength(), bubble->ufpath.size(), nOrphan, nBubble);
      writeLog("\n");
      bubble->_isRepeat = true;
      continue;
    }

    //  If a bubble placement, mark it as a bubble so it can be skipped during repeat detection.

    if (nBubble > 0) {
      writeLog("tig %8u length %8u reads %6u - bubble\n", bubble->id(), bubble->getLength(), bubble->ufpath.size());
      writeLog("\n");
      bubble->_isBubble = true;
      continue;
    }

    //  Otherwise, it's an orphan, move the reads to the proper place.

    writeLog("tig %8u length %8u reads %6u - orphan\n", bubble->id(), bubble->getLength(), bubble->ufpath.size());

    for (uint32 op=0, tt=orphanTarget; op<targets[tt]->placed.size(); op++) {
      ufNode  frg;

      frg.ident        = targets[tt]->placed[op].frgID;
      frg.contained    = 0;
      frg.parent       = 0;
      frg.ahang        = 0;
      frg.bhang        = 0;
      frg.position.bgn = targets[tt]->placed[op].position.bgn;
      frg.position.end = targets[tt]->placed[op].position.end;

      writeLog("move read %u from tig %u to tig %u %u-%u\n",
               frg.ident,
               bubble->id(),
               targets[tt]->target->id(), frg.position.bgn, frg.position.end);

      targets[tt]->target->addFrag(frg, 0, false);
    }

    writeLog("\n");

    unitigs[bubble->id()] = NULL;
    delete bubble;
  }   //  Over all bubbles

  writeLog("\n");   //  Needed if no bubbles are popped.

  delete [] placed;

  //  Sort reads in all the tigs.  Overkill, but correct.

  for (uint32 ti=0; ti<tiLimit; ti++) {
    Unitig  *tig = unitigs[ti];

    if ((tig == NULL) ||             //  Not a tig, ignore it.
        (tig->ufpath.size() == 1))   //  Singleton, already sorted.
      continue;

    tig->sort();
  }
}
//  Scan every multi-read unitig and decide if it could be a bubble: a tig whose
//  (non-contained) reads nearly all have overlaps to some single larger tig.
//
//  For each candidate, the IDs of the possible 'popping targets' are recorded in
//  potentialBubbles[ti], indexed by the bubble tig's position in 'unitigs'.
//
//  Fix over previous revision: removed dead locals tiBlockSize / fiNumThreads /
//  fiBlockSize -- they were computed for an OpenMP schedule that no longer exists
//  and were never read.
void
findPotentialBubbles(UnitigVector    &unitigs,
                     BubTargetList   &potentialBubbles) {
  uint32  tiLimit      = unitigs.size();
  uint32  tiNumThreads = omp_get_max_threads();

  writeStatus("\n");
  writeStatus("bubbleDetect()-- working on "F_U32" unitigs, with "F_U32" threads.\n", tiLimit, tiNumThreads);

  for (uint32 ti=0; ti<tiLimit; ti++) {
    Unitig  *tig = unitigs[ti];

    if ((tig == NULL) ||               //  Not a tig, ignore it.
        (tig->ufpath.size() == 1))     //  Singleton, handled elsewhere.
      continue;

    uint32  nonContainedReads = 0;     //  Reads examined so far (contained reads skipped).
    bool    validBubble       = true;

    map<uint32,uint32>  tigOlapsTo;    //  tig ID -> number of our reads with overlaps to it.

    uint32  fiLimit = tig->ufpath.size();

    for (uint32 fi=0; (validBubble == true) && (fi<fiLimit); fi++) {
      uint32      rid = tig->ufpath[fi].ident;

      if (OG->isContained(rid) == true)   //  Don't need to check contained reads.  If their container
        continue;                         //  passes the tests below, the contained read will too.

      nonContainedReads++;

      uint32      ovlLen = 0;
      BAToverlap *ovl    = OC->getOverlaps(rid, AS_MAX_ERATE, ovlLen);

      set<uint32> readOlapsTo;            //  Distinct tigs this one read overlaps to.

      for (uint32 oi=0; oi<ovlLen; oi++) {
        uint32  ovlTigID = Unitig::fragIn(ovl[oi].b_iid);
        Unitig *ovlTig   = unitigs[ovlTigID];

        //  Skip this overlap if it is to an unplaced read, to a singleton tig, to ourself,
        //  or to a unitig that is shorter than us.  We can not pop this tig as a bubble
        //  in any of those cases.

        if ((ovlTigID == 0) ||
            (ovlTig == NULL) ||
            (ovlTig->ufpath.size() == 1) ||
            (ovlTig->id() == tig->id()) ||
            (ovlTig->getLength() < tig->getLength()))
          continue;

        //  Otherwise, remember that we had an overlap to ovlTig.

        //writeLog("tig %u read %u overlap to tig %u read %u\n",
        //         tig->id(), rid, ovlTigID, ovl[oi].b_iid);

        readOlapsTo.insert(ovlTigID);
      }

      //writeLog("tig %8u read %8u has %u olaps\n", tig->id(), rid, readOlapsTo.size());

      //  Transfer the per-read counts to the per-unitig counts: add one to the counter for each tig
      //  that we have overlaps to.

      for (set<uint32>::iterator it=readOlapsTo.begin(); it != readOlapsTo.end(); ++it)
        tigOlapsTo[*it]++;

      //  Decide if we're a valid potential bubble.  If tig id (in it->first) has overlaps to every
      //  read we've seen so far (nonContainedReads), we're still a valid bubble.
      //
      //  To _attempt_ to have differences in the bubble, we'll accept it if 3/4 of the reads
      //  have overlaps.

      validBubble = false;

      for (map<uint32,uint32>::iterator it=tigOlapsTo.begin(); it != tigOlapsTo.end(); ++it)
        if (it->second >= BUBBLE_READ_FRACTION * nonContainedReads)
          validBubble = true;

      //  If we've not seen that many reads, pretend it's a valid bubble.  It'll get screened out later.

      if (nonContainedReads < 16)
        validBubble = true;
    }

    //  If not validBubble, report.

#if 0
    if (validBubble == false) {
      writeLog("notValidBubble tig %8d expects %6u reads\n", tig->id(), nonContainedReads);

      for (map<uint32,uint32>::iterator it=tigOlapsTo.begin(); it != tigOlapsTo.end(); ++it)
        writeLog("  to tig %8u overlaps %6u\n", it->first, it->second);
    }
#endif

    //  If validBubble, then there is a tig that every dovetail read has at least one overlap to.
    //  Save those tigs in potentialBubbles.

    uint32  nTigs = 0;

    if (validBubble) {
      for (map<uint32,uint32>::iterator it=tigOlapsTo.begin(); it != tigOlapsTo.end(); ++it)
        if (it->second >= BUBBLE_READ_FRACTION * nonContainedReads)
          nTigs++;
    }

    //  ALWAYS log potential bubbles.

    if (nTigs > 0) {
      writeLog("\n");
      writeLog("potential bubble tig %8u length %9u nReads %7u to %3u tigs:\n",
               tig->id(), tig->getLength(), tig->ufpath.size(), nTigs);

      for (map<uint32,uint32>::iterator it=tigOlapsTo.begin(); it != tigOlapsTo.end(); ++it) {
        if (it->second >= BUBBLE_READ_FRACTION * nonContainedReads) {
          Unitig  *dest = unitigs[it->first];

          writeLog(" tig %8u length %9u nReads %7u\n", dest->id(), dest->getLength(), dest->ufpath.size());

          potentialBubbles[ti].push_back(dest->id());
        }
      }
    }
  }

  flushLog();
}
//  Build the list of intersection points: for every read in every unitig, examine its two best
//  edges.  An edge that lands in a different unitig, or lands back in this unitig but without a
//  supporting overlap in the layout, is an intersection.  Edges that do land on a read they
//  overlap in the layout 'confirm' that read's end instead, and confirmed ends are not reported.
intersectionList::intersectionList(UnitigVector &unitigs) {

  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig  *tig = unitigs[ti];

    if (tig == NULL)
      continue;

    //  Per-read scratch space for this tig, indexed by path position.
    intersectionEvidence *evidence = new intersectionEvidence [tig->ufpath.size()];

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode  *frg = &tig->ufpath[fi];

      if (OG->isContained(frg->ident))   //  Contained reads are skipped; their edges aren't checked.
        continue;

      //  For my best overlap, the ID of the unitig that the overlapping fragment is in.

      evidence[fi].edge5 = *OG->getBestEdgeOverlap(frg->ident, false);
      evidence[fi].edge3 = *OG->getBestEdgeOverlap(frg->ident, true);

      evidence[fi].frag5tig = tig->fragIn(evidence[fi].edge5.fragId());
      evidence[fi].frag3tig = tig->fragIn(evidence[fi].edge3.fragId());

      //  Do NOT initialize these!  An earlier fragment could have already confirmed an end.
      //  Properly, only the 5' end of a forward fragment (or 3' end of a reverse fragment) can be
      //  confirmed already (otherwise the tig is nonsense), but we don't yet check that.
      //
      //evidence[fi].frag5confirmed = false;
      //evidence[fi].frag3confirmed = false;

      //  But, because the path could be promiscuous, not every overlap to a different tig is bad.
      //
      //  If my best overlap is to a different tig, but there is an overlapping fragment (in the
      //  unitig placement) with a best edge to me, I'm still good.  The BOG build this unitig using
      //  the edge from the other fragment to me.
      //
      //  If the fragments do not overlap in the layout (yet the best edge still exists) that is a
      //  self-intersection.
      //
      //  The two blocks are identical, except for 'edge3' and 'edge5'.

      if (evidence[fi].frag5tig == tig->id()) {
        //  NOTE: this 'ti' deliberately shadows the unitig index 'ti' above; here it is the
        //  path position of the read our 5' best edge points at.
        uint32   ti  = tig->pathPosition(evidence[fi].edge5.fragId());
        ufNode  *trg = &tig->ufpath[ti];

        //  Layout extents of this read (f) and the edge's target read (t).
        uint32   minf = (frg->position.bgn < frg->position.end) ? frg->position.bgn : frg->position.end;
        uint32   maxf = (frg->position.bgn < frg->position.end) ? frg->position.end : frg->position.bgn;

        uint32   mint = (trg->position.bgn < trg->position.end) ? trg->position.bgn : trg->position.end;
        uint32   maxt = (trg->position.bgn < trg->position.end) ? trg->position.end : trg->position.bgn;

        //  If they overlap, mark as confirmed, else remember an intersection.

        if (((minf < mint) && (mint < maxf)) ||   //  t begins inside f
            ((mint < minf) && (minf < maxt))) {   //  f begins inside t
          if (evidence[fi].edge5.frag3p())
            evidence[ti].frag3confirmed = true;
          else
            evidence[ti].frag5confirmed = true;

        } else {
          evidence[fi].frag5self = true;

          //  Not the correct place to report this.  Some of these get confirmed by later fragments.
          //writeLog("BUG1 F: %d,%d T %d,%d\n", minf, maxf, mint, maxt);
          //writeLog("INTERSECT from unitig %d frag %d end %d TO unitig %d frag %d end %d (SELF)\n",
          //         tig->id(), frg->ident, 5, evidence[fi].frag5tig, evidence[fi].edge5.fragId(), evidence[fi].edge5.frag3p() ? 3 : 5);
        }
      }

      if (evidence[fi].frag3tig == tig->id()) {
        //  Same as above, for the 3' edge.  'ti' again shadows the unitig index.
        uint32   ti  = tig->pathPosition(evidence[fi].edge3.fragId());
        ufNode  *trg = &tig->ufpath[ti];

        uint32   minf = (frg->position.bgn < frg->position.end) ? frg->position.bgn : frg->position.end;
        uint32   maxf = (frg->position.bgn < frg->position.end) ? frg->position.end : frg->position.bgn;

        uint32   mint = (trg->position.bgn < trg->position.end) ? trg->position.bgn : trg->position.end;
        uint32   maxt = (trg->position.bgn < trg->position.end) ? trg->position.end : trg->position.bgn;

        if (((minf < mint) && (mint < maxf)) ||   //  t begins inside f
            ((mint < minf) && (minf < maxt))) {   //  f begins inside t
          if (evidence[fi].edge3.frag3p())
            evidence[ti].frag3confirmed = true;
          else
            evidence[ti].frag5confirmed = true;

        } else {
          evidence[fi].frag3self = true;

          //  Not the correct place to report this.  Some of these get confirmed by later fragments.
          //writeLog("BUG2 F: %d,%d T %d,%d\n", minf, maxf, mint, maxt);
          //writeLog("INTERSECT from unitig %d frag %d end %d TO unitig %d frag %d end %d (SELF)\n",
          //         tig->id(), frg->ident, 3, evidence[fi].frag3tig, evidence[fi].edge3.fragId(), evidence[fi].edge3.frag3p() ? 3 : 5);
        }
      }
    }

    //
    //  Build the list.  An edge becomes an intersection point when it leaves this tig, or stays
    //  in this tig without a layout overlap (a self-intersection) -- but only if the end was not
    //  confirmed by some other read above.
    //

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode  *frg = &tig->ufpath[fi];

      if ((evidence[fi].frag5tig != 0) &&
          (evidence[fi].frag5tig != tig->id()) &&
          (evidence[fi].frag5confirmed == false))
        isects.push_back(intersectionPoint(evidence[fi].edge5, frg->ident, false, false));

      if ((evidence[fi].frag5tig == tig->id()) &&
          (evidence[fi].frag5self == true) &&
          (evidence[fi].frag5confirmed == false))
        isects.push_back(intersectionPoint(evidence[fi].edge5, frg->ident, false, true));

      if ((evidence[fi].frag3tig != 0) &&
          (evidence[fi].frag3tig != tig->id()) &&
          (evidence[fi].frag3confirmed == false))
        isects.push_back(intersectionPoint(evidence[fi].edge3, frg->ident, true, false));

      if ((evidence[fi].frag3tig == tig->id()) &&
          (evidence[fi].frag3self == true) &&
          (evidence[fi].frag3confirmed == false))
        isects.push_back(intersectionPoint(evidence[fi].edge3, frg->ident, true, true));
    }

    delete [] evidence;
  }

  //  Sort the intersections by the ID of the intersected fragment, then build an index into the array.

  std::sort(isects.begin(), isects.end());

  //  Terminate the intersection list with a sentinel intersection.  This is CRITICAL
  //  to the way we iterate over intersections.

  isects.push_back(intersectionPoint(BestEdgeOverlap(), 0, true, true));

  //  Build a map from fragment id to the first intersection in the list (isectsNum counts
  //  how many intersections each fragment has).

  for (uint32 i=0; i<isects.size(); i++) {
    isectsNum[isects[i].isectFrg]++;

    if (isectsMap.find(isects[i].isectFrg) == isectsMap.end())
      isectsMap[isects[i].isectFrg] = i;
  }
}
//  Perform a join previously approved by joinUnitigs_examineEnd():  append part of the 'to'
//  unitig onto the 'fr' unitig (into a new unitig), and move the leftover piece of 'to' into a
//  second new unitig.  The original 'fr' and 'to' unitigs are deleted and NULLed out.
//
//  BUG FIX: the loop copying the 'fr' reads previously read from to->ufpath[ii] while iterating
//  to fr->ufpath.size() -- it copied the wrong reads, and indexed past the end of to->ufpath
//  whenever 'fr' had more reads than 'to'.  It now copies fr->ufpath[ii].
static void
joinUnitigs_append(UnitigVector &unitigs, joinEntry *join) {
  uint32    frId = Unitig::fragIn(join->frFragID);
  uint32    toId = Unitig::fragIn(join->toFragID);

  Unitig   *fr   = unitigs[frId];
  Unitig   *to   = unitigs[toId];

  //  Path position of the join read in 'to'; everything before it goes one way, everything
  //  at/after it goes the other.  (The position of frFragID in 'fr' is not needed: the join
  //  read is required to be the first/last thing in 'fr'.)
  uint32    toIdx = Unitig::pathPosition(join->toFragID);

  //  The 'fr' unitig is assumed to be forward, and assumed to be the one we join to.

  //  Compute the offset for our append.  We just need to compute where the join fragment would
  //  appear in the unitig.  The join fragment MUST be the first thing in the frUnitig.

  //int32 offset = MIN(frF.position.bgn, frF.position.end);

  //  Over all fragments in the frUnitig, add them to either the joinUnitig or the discUnitig.

  Unitig   *joinUnitig = unitigs.newUnitig(false);
  Unitig   *discUnitig = unitigs.newUnitig(false);   //  NOTE: may end up empty, but stays in the vector.

  //  Reverse the 'to' unitig if needed.

  if (join->toFlip)
    to->reverseComplement(true);

  //  If we're joining off the 5' end of the fr unitig, add the to reads first.

  if (join->frFirst == true) {
    uint32  ii = 0;

    for (; ii < toIdx; ii++)
      joinUnitig->addFrag(to->ufpath[ii], 0, false);

    for (; ii < to->ufpath.size(); ii++)
      discUnitig->addFrag(to->ufpath[ii], 0, false);
  }

  //  Now add all the fr unitig reads.  (Fixed: was to->ufpath[ii].)

  for (uint32 ii=0; ii < fr->ufpath.size(); ii++)
    joinUnitig->addFrag(fr->ufpath[ii], 0, false);

  //  If we're not joining off the 5' end, add the to unitig reads last.

  if (join->frFirst == false) {
    uint32  ii = 0;

    for (; ii < toIdx; ii++)
      discUnitig->addFrag(to->ufpath[ii], 0, false);

    for (; ii < to->ufpath.size(); ii++)
      joinUnitig->addFrag(to->ufpath[ii], 0, false);
  }

  //  Delete the donor unitigs.

  delete fr;
  delete to;

  unitigs[frId] = NULL;
  unitigs[toId] = NULL;

  //  And make sure the new unitigs are consistent.

  joinUnitig->sort();
  discUnitig->sort();
}
// Examine the first (few?) fragments of a unitig, evaluate if they indicate a join should be made.
//
//  'fr' is the unitig being examined; 'idx' selects which read from the chosen end (0 = the
//  terminal read); 'frFirstEnd' selects the 5' (true) or 3' (false) end of the tig.  If the best
//  edge off that read leads to a plausible merge, a joinEntry is appended to 'joins' and true is
//  returned; otherwise false.
static bool joinUnitigs_examineEnd(UnitigVector &unitigs, Unitig *fr, uint32 idx, bool frFirstEnd, vector<joinEntry> &joins) {

  //  Index from the front (frFirstEnd) or back of the path.
  uint32  frgIdx = (frFirstEnd) ? (idx) : (fr->ufpath.size() - 1 - idx);
  ufNode *frg    = &fr->ufpath[frgIdx];
  bool    frgRev = (frg->position.end < frg->position.bgn);

  //  Grab the best edge for this end frag.  The last arg requests the 3' end if true.
  //
  //  If we're looking at the first read, we want to get:
  //    5' - if the frag is forward
  //    3' - if the frag is reverse (frgRev == true)
  //
  //  If we're looking at the lat read, we want to get:
  //    5' - if the frag is reverse
  //    3' - if the frag is forward (frgRev == false)
  //
  BestEdgeOverlap *bestEdge = OG->getBestEdgeOverlap(frg->ident, (frgRev == frFirstEnd));

  uint32  tgtId = bestEdge->fragId();
  bool    tgt3p = bestEdge->frag3p();

  if (tgtId == 0)                    //  No best edge?  Skip it.
    return(false);

  //  Grab the unitig for that best edge.

  uint32   toID = fr->fragIn(tgtId);
  Unitig  *to   = unitigs[toID];

  if (to->ufpath.size() == 1)        //  Joining to something teeny?  Don't bother checking further.
    return(false);

  if (to->id() == fr->id())          //  Join to myself?  Nope.
    return(false);

  //  Grab the read we have an edge to, an compute the overlapping length and left over length.

  ufNode  *tgt    = &to->ufpath[to->pathPosition(tgtId)];
  bool     tgtRev = (tgt->position.end < tgt->position.bgn);

  //  If tgt3p (we overlap to the 3' end) is the same as tgtRev (read is reverse) then the unitig is oriented
  //  correctly.  Otherwise, positions need to be reverse-complemented.
  //
  //  The four cases below enumerate (end of fr) x (end of tgt) x (orientation of tgt); 'to' must
  //  be flipped whenever tgt3p == tgtRev, spelled out case by case.

  bool  toFlip = false;

  if ((frFirstEnd == true) && (tgt3p == false) && (tgtRev == false))
    //  source read is at the start, overlap to 5' and the read is forward, need to flip the target unitig
    toFlip = true;

  if ((frFirstEnd == true) && (tgt3p == true) && (tgtRev == true))
    //  source read is at the start, overlap to 3' and the read is reverse, need to flip the target unitig
    toFlip = true;

  if ((frFirstEnd == false) && (tgt3p == false) && (tgtRev == true))
    //  source read is at the end, overlap to 5' and the read is reverse, need to flip the target unitig
    toFlip = true;

  if ((frFirstEnd == false) && (tgt3p == true) && (tgtRev == false))
    //  source read is at the end, overlap to 3' and the read is forward, need to flip the target unitig
    toFlip = true;

  //  Extent of the target read within 'to', remapped into flipped coordinates when needed.

  uint32  toMin = MIN(tgt->position.bgn, tgt->position.end);
  uint32  toMax = MAX(tgt->position.bgn, tgt->position.end);
  uint32  toLen = to->getLength();
  uint32  frLen = fr->getLength();

  if (toFlip) {
    toMin = toLen - MAX(tgt->position.bgn, tgt->position.end);
    toMax = toLen - MIN(tgt->position.bgn, tgt->position.end);
  }

  assert(toMin < toMax);

  //  Our two unitigs are of length frLen and toLen.  We are appending some portion of 'to' onto
  //  'fr', and 'discarding' the rest.  If the 'discarded' piece is larger than the 'fr' unitig, we
  //  don't want to do the join.
  //
  //  We err on the side of the discarded piece.

  uint32  joinLen = 0;
  uint32  discLen = 0;

  if (frFirstEnd == true) {
    joinLen = toMin + frLen;          //  Prepend the start of 'to' onto 'fr'.
    discLen = toLen - toMin;
  } else {
    joinLen = frLen + toLen - toMax;  //  Append the end of 'to' onto 'fr'.
    discLen = toMax;
  }

  //  If the discard is bigger than us, we do damage by joining.

  if (discLen > frLen)
    return(false);

  //  The joined should be much larger and the discarded much smaller.

  uint32  maxLen = MAX(frLen, toLen);
  uint32  minLen = MIN(frLen, toLen);

  double  joinChange = (double)joinLen / maxLen;   //  How much the join grows the bigger tig.
  double  discChange = (double)discLen / minLen;   //  How much of the smaller tig is thrown away.

  bool    isBad = false;

  if ((joinChange < 1.10) ||
      (0.75 < discChange))
    //  Bad if we didn't really change sizes.
    isBad = true;

  if ((1.0 < joinChange) && (discChange < 0.5))
    //  But good if discard is tiny.  This occurs if we merge a small with a big.  The join change
    //  is somewhat small (1.05 say) yet most of the smaller unitig is used.
    isBad = false;

  if (isBad) {
    writeLog("joinUnitigs_examineEnd()-- join unitig %6u (%7ubp) frag %6u %s <-> unitig %6u (%7ubp) frag %6u %s <-> length %5.2f %7u and %5.2f %7u BAD\n",
             fr->id(), fr->getLength(), frg->ident, (frgRev) ? "rev" : "fwd",
             to->id(), to->getLength(), tgt->ident, (tgtRev) ? "rev" : "fwd",
             joinChange, joinLen,
             discChange, discLen);
    return(false);
  }

  //  OK, join.

  writeLog("joinUnitigs_examineEnd()-- join unitig %6u (%7ubp) frag %6u %s <-> unitig %6u (%7ubp) frag %6u %s <-> length %5.2f %7u and %5.2f %7u\n",
           fr->id(), fr->getLength(), frg->ident, (frgRev) ? "rev" : "fwd",
           to->id(), to->getLength(), tgt->ident, (tgtRev) ? "rev" : "fwd",
           joinChange, joinLen,
           discChange, discLen);

  joins.push_back(joinEntry(frg->ident, frFirstEnd, tgt->ident, toFlip, joinLen));

  return(true);
}
//  Find 'zombie' fragments -- live (non-deleted) fragments that belong to no unitig -- and give
//  each one its own singleton unitig so nothing is lost from the assembly.
//
//  The 'erate' parameter is accepted for interface compatibility; the intended scheme of
//  reloading overlaps at that error rate to find a container is not implemented.
void
placeZombies(UnitigVector &unitigs, double erate) {

  writeLog("==> SEARCHING FOR ZOMBIES\n");

  uint32  *fragToTig   = new uint32 [FI->numFragments()+1];
  int      zombieCount = 0;

  //  Assume every fragment is unplaced, then walk the unitigs and record where each
  //  fragment actually lives.

  for (uint32 fid=0; fid<FI->numFragments()+1; fid++)
    fragToTig[fid] = noUnitig;

  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig  *tig = unitigs[ti];

    if (tig == NULL)
      continue;

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++)
      fragToTig[tig->ufpath[fi].ident] = tig->id();
  }

  //  Any live fragment still unplaced is a zombie; resurrect it as a singleton unitig.
  //  (Finding a proper container via reloaded overlaps is NOT IMPLEMENTED.)

  for (uint32 fid=0; fid<FI->numFragments()+1; fid++) {
    if (FI->fragmentLength(fid) == 0)     //  Deleted fragment, nothing to place.
      continue;

    if (fragToTig[fid] != noUnitig)       //  Already in a living unitig.
      continue;

    Unitig  *tig = unitigs.newUnitig(false);

    ufNode   frag;

    frag.ident             = fid;
    frag.contained         = 0;
    frag.parent            = 0;
    frag.ahang             = 0;
    frag.bhang             = 0;
    frag.position.bgn      = 0;
    frag.position.end      = FI->fragmentLength(fid);
    frag.containment_depth = 0;

    tig->addFrag(frag, 0, false);

    writeLog("placeZombies()-- unitig %d created from zombie fragment %d\n", tig->id(), fid);

    zombieCount++;
  }

  writeLog("RESURRECTED %d ZOMBIE FRAGMENT%s.\n", zombieCount, (zombieCount != 1) ? "s" : "");

  delete [] fragToTig;
}
//  For each multi-read unitig, find mates of its reads that are not yet in any unitig, then
//  rebuild the tig from scratch using a BestOverlapGraph/ChunkGraph restricted to the tig's
//  reads plus those extra mates.  Temporarily swaps the global OG and CG pointers.
void extendByMates(UnitigVector &unitigs, double erateGraph) {

  //logFileFlags |= LOG_CHUNK_GRAPH;
  logFileFlags |= LOG_POPULATE_UNITIG;

  writeLog("==> EXTENDING UNITIGS WITH MATE PAIRS.\n");

  //  Snapshot the size; newly created unitigs appended during this loop are not themselves
  //  re-extended.
  uint32  tiMax = unitigs.size();

  for (uint32 ti=0; ti<tiMax; ti++) {
    Unitig  *target = unitigs[ti];

    if (target == NULL)
      continue;

    if (target->ufpath.size() < 2)
      continue;

    //  Build a list of all the fragments in this unitig, and any mates that are not in a unitig.

    uint32  extraMates = 0;

    for (uint32 fi=0; fi<target->ufpath.size(); fi++) {
      uint32  fid = target->ufpath[fi].ident;
      uint32  mid = FI->mateIID(fid);

      if ((mid != 0) && (Unitig::fragIn(mid) == 0))
        extraMates++;
    }

    writeLog("\n");
    writeLog("unitig "F_U32" of size "F_SIZE_T" with "F_U32" extra fragments via mates\n",
             ti, target->ufpath.size(), extraMates);

    if (extraMates == 0)
      continue;

    //  Build a set of the fragments in this unitig plus their mates, and a set of just the mates.

    set<uint32>  frags;
    set<uint32>  mates;

    for (uint32 fi=0; fi<target->ufpath.size(); fi++) {
      uint32  fid = target->ufpath[fi].ident;
      uint32  mid = FI->mateIID(fid);

      frags.insert(fid);

      if ((mid != 0) && (Unitig::fragIn(mid) == 0)) {
        writeLog(" mate frag "F_U32"\n", mid);
        frags.insert(mid);
        mates.insert(mid);
      }
    }

    //  Now, remove all the unitig fragments from the unitig so we can reconstruct it with the
    //  additional mated fragments.  Note that this loop cannot be combined with the last, since
    //  the test for 'additional mate' is 'not in the same unitig' -- and if we remove the
    //  fragments too early, we can't distinguish 'additional' from 'included'.
    //
    //  NOTE(review): this removes entries while iterating by index over the same vector --
    //  assumes removeFrag() leaves indices usable for this pattern; confirm its semantics.

    for (uint32 fi=0; fi<target->ufpath.size(); fi++)
      target->removeFrag(target->ufpath[fi].ident);

    unitigs[ti] = NULL;
    delete target;

    //  Build a new BOG for just those fragments - in particular, only overlaps within the set are
    //  used for the BOG.  The global OG/CG are swapped out and restored at the end of the loop.

    BestOverlapGraph  *OGsave = OG;
    ChunkGraph        *CGsave = CG;

    OG = new BestOverlapGraph(erateGraph, &frags);
    CG = new ChunkGraph(&frags);

    uint32  numTigs = unitigs.size();

    //  Build new unitigs.  There should only be one new unitig constructed, but that isn't
    //  guaranteed.  No new unitigs are built if they are seeded from the mate fragments.  This
    //  isn't ideal -- we'd like to allow the first unitig (supposedly the longest) to start from
    //  a mate fragment.  However, consider the not-so-rare case where the original unitig is two
    //  backbone fragments and lots of contains.  Those contains contribute mate pairs that all
    //  assemble together, giving a longer path than the original unitig.  We don't want to
    //  assemble the mated fragments yet (we'll wait until we get the rest of the fragments that
    //  could assemble together).

    for (uint32 fi = CG->nextFragByChunkLength(); fi > 0; fi=CG->nextFragByChunkLength()) {
      if ((Unitig::fragIn(fi) != 0) ||
          (mates.count(fi) > 0))
        //  Fragment already in a unitig, or is an additional mate that we don't want
        //  to seed from.
        continue;

      populateUnitig(unitigs, fi);
    }

    //  Report what was constructed

    if (unitigs.size() - numTigs > 1)
      writeLog("WARNING:  mate extension split a unitig.\n");

    for (uint32 newTigs=numTigs; newTigs<unitigs.size(); newTigs++) {
      Unitig  *tig = unitigs[newTigs];

      if (tig == NULL)
        continue;

      placeContainsUsingBestOverlaps(tig, &frags);

      writeLog(" new tig "F_U32" with "F_SIZE_T" fragments\n", tig->id(), tig->ufpath.size());
    }

    //  Restore the global graphs.

    delete OG;
    delete CG;

    OG = OGsave;
    CG = CGsave;
  }
}
void reportUnitigs(UnitigVector &unitigs, const char *prefix, const char *name, uint64 genomeSize) { // Generate n50. Assumes unitigs have been 'classified' already. vector<uint32> unassembledLength; vector<uint32> bubbleLength; vector<uint32> repeatLength; vector<uint32> circularLength; vector<uint32> contigLength; for (uint32 ti=0; ti<unitigs.size(); ti++) { Unitig *utg = unitigs[ti]; if (utg == NULL) continue; if (utg->_isUnassembled) { unassembledLength.push_back(utg->getLength()); } else if (utg->_isBubble) { bubbleLength.push_back(utg->getLength()); } else if (utg->_isRepeat) { repeatLength.push_back(utg->getLength()); } else if (utg->_isCircular) { circularLength.push_back(utg->getLength()); } else { contigLength.push_back(utg->getLength()); } } char N[FILENAME_MAX]; sprintf(N, "%s.sizes", getLogFilePrefix()); errno = 0; FILE *F = fopen(N, "w"); if (errno == 0) { reportN50(F, unassembledLength, "UNASSEMBLED", genomeSize); reportN50(F, bubbleLength, "BUBBLE", genomeSize); reportN50(F, repeatLength, "REPEAT", genomeSize); reportN50(F, circularLength, "CIRCULAR", genomeSize); reportN50(F, contigLength, "CONTIGS", genomeSize); fclose(F); } if (logFileFlagSet(LOG_INTERMEDIATE_UNITIGS) == 0) return; // Dump to an intermediate store. char tigStorePath[FILENAME_MAX]; sprintf(tigStorePath, "%s.tigStore", getLogFilePrefix()); fprintf(stderr, "Creating intermediate tigStore '%s'\n", tigStorePath); uint32 numFragsT = 0; uint32 numFragsP = 0; uint64 utgLen = 0; // Compute average frags per partition. for (uint32 ti=0; ti<unitigs.size(); ti++) { Unitig *utg = unitigs[ti]; if (utg == NULL) continue; numFragsT += utg->ufpath.size(); if (utg->ufpath.size() > 2) utgLen += utg->getLength(); } if (utgLen < 16 * 1024 * 1024) numFragsP = numFragsT / 7; else if (utgLen < 64 * 1024 * 1024) numFragsP = numFragsT / 63; else numFragsP = numFragsT / 127; // Dump the unitigs to an intermediate store. 
setParentAndHang(unitigs); writeUnitigsToStore(unitigs, tigStorePath, tigStorePath, numFragsP, false); }
// Decides if a unitig is unassembled. The other classifications (isBubble, isCircular, isRepeat) // are made when the type is processed (e.g., when bubbles are popped). // // A unitig is unassembled if: // 1) it has fewer than R reads (R=2) // 2) it is shorter than S bases (S=1000) // 3) a single read spans at least fraction F of the lenth (F=1.0) // 4) at least fraction F of the unitig is below read depth D (F=1.0, D=2) // void classifyUnitigsAsUnassembled(UnitigVector &unitigs, uint32 fewReadsNumber, uint32 tooShortLength, double spanFraction, double lowcovFraction, uint32 lowcovDepth) { uint32 nTooFew = 0; uint32 nShort = 0; uint32 nSingle = 0; uint32 nCoverage = 0; uint32 nContig = 0; uint64 bTooFew = 0; uint64 bShort = 0; uint64 bSingle = 0; uint64 bCoverage = 0; uint64 bContig = 0; char N[FILENAME_MAX]; sprintf(N, "%s.unassembled", getLogFilePrefix()); errno = 0; FILE *F = fopen(N, "w"); if (errno) F = NULL; for (uint32 ti=0; ti<unitigs.size(); ti++) { Unitig *utg = unitigs[ti]; if (utg == NULL) continue; utg->_isUnassembled = false; // Rule 1. Too few reads. if (utg->ufpath.size() < fewReadsNumber) { fprintf(F, "unitig "F_U32" unassembled - too few reads ("F_U64" < "F_U32")\n", ti, utg->ufpath.size(), fewReadsNumber); utg->_isUnassembled = true; nTooFew += 1; bTooFew += utg->getLength(); continue; } // Rule 2. Short. if (utg->getLength() < tooShortLength) { fprintf(F, "unitig "F_U32" unassembled - too short ("F_U32" < "F_U32")\n", ti, utg->getLength(), tooShortLength); utg->_isUnassembled = true; nShort += 1; bShort += utg->getLength(); continue; } // Rule 3. Single read spans large fraction of tig. 
for (uint32 oi=0; oi<utg->ufpath.size(); oi++) { ufNode *frg = &utg->ufpath[oi]; int frgbgn = MIN(frg->position.bgn, frg->position.end); int frgend = MAX(frg->position.bgn, frg->position.end); if (frgend - frgbgn > utg->getLength() * spanFraction) { fprintf(F, "unitig "F_U32" unassembled - single read spans unitig (read "F_U32" "F_U32"-"F_U32" spans fraction %f > %f\n", ti, frg->ident, frg->position.bgn, frg->position.end, (double)(frgend - frgbgn) / utg->getLength(), spanFraction); utg->_isUnassembled = true; nSingle += 1; bSingle += utg->getLength(); break; } } if (utg->_isUnassembled) continue; // Rule 4. Low coverage. intervalList<int32> IL; for (uint32 oi=0; oi<utg->ufpath.size(); oi++) { ufNode *frg = &utg->ufpath[oi]; int frgbgn = MIN(frg->position.bgn, frg->position.end); int frgend = MAX(frg->position.bgn, frg->position.end); IL.add(frgbgn, frgend - frgbgn); } intervalList<int32> ID(IL); uint32 basesLow = 0; uint32 basesHigh = 0; for (uint32 ii=0; ii<ID.numberOfIntervals(); ii++) if (ID.depth(ii) < lowcovDepth) basesLow += ID.hi(ii) - ID.lo(ii) + 1; else basesHigh += ID.hi(ii) - ID.lo(ii) + 1; double lowcov = (double)basesLow / (basesLow + basesHigh); if (lowcov >= lowcovFraction) { fprintf(F, "Unitig "F_U32" unassembled - low coverage (%.4f > %.4f at < "F_U32"x coverage)\n", ti, lowcov, lowcovFraction, lowcovDepth); utg->_isUnassembled = true; nCoverage += 1; bCoverage += utg->getLength(); continue; } // Otherwise, unitig is assembled! 
nContig += 1; bContig += utg->getLength(); } writeStatus("classifyAsUnassembled()-- %6u tigs %11lu bases -- too few reads\n", nTooFew, bTooFew); writeStatus("classifyAsUnassembled()-- %6u tigs %11lu bases -- too short\n", nShort, bShort); writeStatus("classifyAsUnassembled()-- %6u tigs %11lu bases -- single spanning read\n", nSingle, bSingle); writeStatus("classifyAsUnassembled()-- %6u tigs %11lu bases -- low coverage\n", nCoverage, bCoverage); writeStatus("classifyAsUnassembled()-- %6u tigs %11lu bases -- acceptable contigs\n", nContig, bContig); }
//  For every read in every unitig, set the 'parent' read and the a/b hangs that consensus uses
//  to place the read.  Containment edges are preferred; otherwise the best dovetail edges
//  (5' and 3') are transformed into parent/hang form.
void UnitigGraph::setParentAndHang(void) {

  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig        *utg = unitigs[ti];

    if (utg == NULL)
      continue;

    if (utg->ufpath.size() == 0)
      continue;

    //  Reset parent and hangs for everything.
    //  NOTE(review): the loop starts at fi=1, so the first read's parent/hangs are not cleared
    //  here -- presumably because the first read never gets a parent; confirm intentional.

    for (uint32 fi=1; fi<utg->ufpath.size(); fi++) {
      ufNode *frg = &utg->ufpath[fi];

      frg->parent       = 0;
      frg->ahang        = 0;
      frg->bhang        = 0;
    }

    //  For each fragment, set parent/hangs using the edges.

    for (uint32 fi=0; fi<utg->ufpath.size(); fi++) {
      ufNode *frg  = &utg->ufpath[fi];

      //  If we're contained, gee, I sure hope the container is here!

      BestContainment *bestcont = OG->getBestContainer(frg->ident);

      if ((bestcont) && (utg->fragIn(bestcont->container) == utg->id())) {
        int32   pi   = utg->pathPosition(bestcont->container);
        ufNode *par  = &utg->ufpath[pi];

        frg->parent = bestcont->container;

        //  The hangs assume the container is forward; adjust if not so.

        if (par->position.bgn < par->position.end) {
          frg->ahang  = bestcont->a_hang;
          frg->bhang  = bestcont->b_hang;
        } else {
          frg->ahang  = -bestcont->b_hang;
          frg->bhang  = -bestcont->a_hang;
        }

        continue;
      }

      //  Nope, not contained.  If we don't have a parent set, see if one of our best overlaps
      //  can set it.

      BestEdgeOverlap *bestedge5 = OG->getBestEdgeOverlap(frg->ident, false);
      BestEdgeOverlap *bestedge3 = OG->getBestEdgeOverlap(frg->ident, true);

      if ((bestedge5->fragId()) && (utg->fragIn(bestedge5->fragId()) == utg->id())) {
        int32   pi5  = utg->pathPosition(bestedge5->fragId());
        ufNode *oth  = &utg->ufpath[pi5];

        //  Consensus is expected parent/hangs to be relative to the parent fragment.  This is used
        //  ONLY to place the fragment, not to orient the fragment.  Orientation comes from the
        //  absolute positioning coordinates.
        //
        //  Interestingly, all four overlap transformations are used here.
        //
        //  The inner if tests (on fragment orientation) should be asserts, but due to imprecise
        //  layouts, they are sometimes violated:
        //    A fragment from 271-547 had a 5'overlap to something after it;
        //    the frag after was at 543-272, close enough to a tie to screw up placements
        //

        if (pi5 < fi) {
          //  We have an edge off our 5' end to something before us --> fragment MUST be forward.
          //  Flip the overlap so it is relative to the other fragment.

          if (frg->position.bgn < frg->position.end) {
            frg->parent = bestedge5->fragId();
            frg->ahang  = -bestedge5->ahang();
            frg->bhang  = -bestedge5->bhang();
            assert(frg->ahang >= 0);
          }

        } else {
          //  We have an edge off our 5' end to something after us --> fragment MUST be reverse.
          //  Because our fragment is now reverse, we must reverse the overlap too.
          //  Note this sets parent/hangs on the OTHER read (oth), which follows us in the path.

          if (frg->position.end < frg->position.bgn) {
            oth->parent = frg->ident;
            oth->ahang  = -bestedge5->bhang();
            oth->bhang  = -bestedge5->ahang();
            assert(oth->ahang >= 0);
          }
        }
      }

      if ((bestedge3->fragId()) && (utg->fragIn(bestedge3->fragId()) == utg->id())) {
        int32   pi3  = utg->pathPosition(bestedge3->fragId());
        ufNode *oth  = &utg->ufpath[pi3];

        if (pi3 < fi) {
          //  We have an edge off our 3' end to something before us --> fragment MUST be reverse.
          //  Flip the overlap so it is relative to the other fragment.
          //  Because our fragment is now reverse, we must reverse the overlap too.

          if (frg->position.end < frg->position.bgn) {
            frg->parent = bestedge3->fragId();
            frg->ahang  = bestedge3->bhang();
            frg->bhang  = bestedge3->ahang();
            assert(frg->ahang >= 0);
          }

        } else {
          //  We have an edge off our 3' end to something after us --> fragment MUST be forward.
          //  This is the simplest case, the overlap is already correct.

          if (frg->position.bgn < frg->position.end) {
            oth->parent = frg->ident;
            oth->ahang  = bestedge3->ahang();
            oth->bhang  = bestedge3->bhang();
            assert(oth->ahang >= 0);
          }
        }
      }
    }
  }
}
void writeUnitigsToStore(UnitigVector &unitigs, char *fileprefix, char *tigStorePath, uint32 frg_count_target, bool isFinal) { uint32 utg_count = 0; uint32 frg_count = 0; uint32 prt_count = 1; char filename[FILENAME_MAX] = {0}; uint32 *partmap = new uint32 [unitigs.size()]; // This code closely follows that in AS_CGB_unitigger.c::output_the_chunks() if (isFinal) checkUnitigMembership(unitigs); // Open up the initial output file sprintf(filename, "%s.iidmap", fileprefix); FILE *iidm = fopen(filename, "w"); assert(NULL != iidm); sprintf(filename, "%s.partitioning", fileprefix); FILE *part = fopen(filename, "w"); assert(NULL != part); sprintf(filename, "%s.partitioningInfo", fileprefix); FILE *pari = fopen(filename, "w"); assert(NULL != pari); // Step through all the unitigs once to build the partition mapping and IID mapping. memset(partmap, 0xff, sizeof(uint32) * unitigs.size()); for (uint32 iumiid=0, ti=0; ti<unitigs.size(); ti++) { Unitig *utg = unitigs[ti]; uint32 nf = (utg) ? utg->getNumFrags() : 0; if ((utg == NULL) || (nf == 0)) continue; assert(utg->getLength() > 0); assert(nf == utg->ufpath.size()); if ((frg_count + nf >= frg_count_target) && (frg_count > 0)) { fprintf(pari, "Partition %d has %d unitigs and %d fragments.\n", prt_count, utg_count, frg_count); prt_count++; utg_count = 0; frg_count = 0; } uint32 tigid = (isFinal) ? iumiid : ti; assert(tigid < unitigs.size()); partmap[tigid] = prt_count; fprintf(iidm, "Unitig "F_U32" == IUM "F_U32" (in partition "F_U32" with "F_U32" frags)\n", utg->id(), (tigid), partmap[(tigid)], nf); for (uint32 fragIdx=0; fragIdx<nf; fragIdx++) { ufNode *f = &utg->ufpath[fragIdx]; fprintf(part, "%d\t%d\n", prt_count, f->ident); } utg_count += 1; frg_count += nf; iumiid++; } fprintf(pari, "Partition %d has %d unitigs and %d fragments.\n", prt_count, utg_count, frg_count); fclose(pari); fclose(part); fclose(iidm); // Step through all the unitigs once to build the partition mapping and IID mapping. 
tgStore *tigStore = new tgStore(tigStorePath); tgTig *tig = new tgTig; for (uint32 iumiid=0, ti=0; ti<unitigs.size(); ti++) { Unitig *utg = unitigs[ti]; uint32 nf = (utg) ? utg->getNumFrags() : 0; if ((utg == NULL) || (nf == 0)) continue; unitigToTig(tig, (isFinal) ? iumiid : ti, utg); tigStore->insertTig(tig, false); iumiid++; } delete tig; delete tigStore; delete [] partmap; }
void breakUnitigs(UnitigVector &unitigs, char *output_prefix, bool enableIntersectionBreaking) { writeLog("==> BREAKING UNITIGS.\n"); intersectionList *ilist = new intersectionList(unitigs); // Stop when we've seen all current unitigs. Replace tiMax // in the for loop below with unitigs.size() to recursively // split unitigs. uint32 tiMax = unitigs.size(); for (uint32 ti=0; ti<tiMax; ti++) { Unitig *tig = unitigs[ti]; if (tig == NULL) continue; vector<breakPoint> breaks; for (uint32 fi=0; fi<tig->ufpath.size(); fi++) { ufNode *frg = &tig->ufpath[fi]; intersectionPoint *isect = ilist->getIntersection(frg->ident, 0); if (isect == NULL) continue; for (; isect->isectFrg == frg->ident; isect++) { assert(tig->id() == Unitig::fragIn(isect->isectFrg)); // Grab the invading unitig Unitig *inv = unitigs[Unitig::fragIn(isect->invadFrg)]; assert(inv->id() == Unitig::fragIn(isect->invadFrg)); // Grab the best edges off the invading fragment. BestEdgeOverlap *best5 = OG->getBestEdgeOverlap(isect->invadFrg, false); BestEdgeOverlap *best3 = OG->getBestEdgeOverlap(isect->invadFrg, true); // Check if the incoming tig is a spur, and we should just ignore it immediately if ((inv->ufpath.size() == 1) && ((best5->fragId() == 0) || (best3->fragId() == 0))) { if (logFileFlagSet(LOG_INTERSECTION_BREAKING)) writeLog("unitig %d frag %d end %c' into unitig %d frag %d end %c' -- IS A SPUR, skip it\n", inv->id(), isect->invadFrg, isect->invad3p ? '3' : '5', tig->id(), isect->isectFrg, isect->isect3p ? '3' : '5'); continue; } // Keep only significant intersections if ((inv->getLength() > MIN_BREAK_LENGTH) && (inv->ufpath.size() > MIN_BREAK_FRAGS)) { if (logFileFlagSet(LOG_INTERSECTION_BREAKING)) writeLog("unitig %d frag %d end %c' into unitig %d frag %d end %c'\n", inv->id(), isect->invadFrg, isect->invad3p ? '3' : '5', tig->id(), isect->isectFrg, isect->isect3p ? 
'3' : '5'); breaks.push_back(breakPoint(isect->isectFrg, isect->isect3p, true, false)); } } // Over all incoming fragments // If this is the last fragment, terminate the break point list with a 'fakeEnd' (in AS_BAT_Breaking.cc) break point // at the end of the unitig. if ((fi+1 == tig->ufpath.size()) && (breaks.size() > 0)) { breaks.push_back(breakPoint(frg->ident, (frg->position.bgn < frg->position.end), true, false)); } } // Over all fragments in the unitig if (breaks.size() == 0) continue; // Report where breaks occur. 'breaks' is a list, not a vector. #if 0 // We've lost the fields in breaks[i] -- but the reports above aren't updated yet. if (logFileFlagSet(LOG_INTERSECTION_BREAKING) || logFileFlagSet(LOG_MATE_SPLIT_COVERAGE_PLOT)) for (uint32 i=0; i<breaks.size(); i++) writeLog("BREAK unitig %d at position %d,%d from inSize %d inFrags %d.\n", tig->id(), breaks[i].fragPos.bgn, breaks[i].fragPos.end, breaks[i].inSize, breaks[i].inFrags); #endif // Actually do the breaking. if (enableIntersectionBreaking) breakUnitigAt(unitigs, tig, breaks, true); breaks.clear(); } // Over all unitigs }
void writeUnitigsToStore(UnitigVector &unitigs, char *fileprefix, char *tigStorePath, uint32 frg_count_target, bool isFinal) { uint32 utg_count = 0; uint32 frg_count = 0; uint32 prt_count = 1; char filename[FILENAME_MAX] = {0}; uint32 *partmap = new uint32 [unitigs.size()]; // This code closely follows that in AS_CGB_unitigger.c::output_the_chunks() if (isFinal) checkUnitigMembership(unitigs); // Open up the initial output file sprintf(filename, "%s.iidmap", fileprefix); FILE *iidm = fopen(filename, "w"); assert(NULL != iidm); sprintf(filename, "%s.partitioning", fileprefix); FILE *part = fopen(filename, "w"); assert(NULL != part); sprintf(filename, "%s.partitioningInfo", fileprefix); FILE *pari = fopen(filename, "w"); assert(NULL != pari); // Step through all the unitigs once to build the partition mapping and IID mapping. tgStore *tigStore = new tgStore(tigStorePath); tgTig *tig = new tgTig; for (uint32 tigID=0, ti=0; ti<unitigs.size(); ti++) { Unitig *utg = unitigs[ti]; if ((utg == NULL) || (utg->getNumFrags() == 0)) continue; assert(utg->getLength() > 0); // Convert the bogart tig to a tgTig and save to the store. unitigToTig(tig, (isFinal) ? tigID : ti, utg); tigID++; tigStore->insertTig(tig, false); // Increment the partition if the current one is too large. if ((frg_count + utg->getNumFrags() >= frg_count_target) && (frg_count > 0)) { fprintf(pari, "Partition %d has %d unitigs and %d fragments.\n", prt_count, utg_count, frg_count); prt_count++; utg_count = 0; frg_count = 0; } // Note that the tig is included in this partition. utg_count += 1; frg_count += utg->getNumFrags(); // Map the tig to a partition, and log both the tig-to-partition map and the partition-to-read map. 
fprintf(iidm, "bogart "F_U32" -> tig "F_U32" (in partition "F_U32" with "F_U32" frags)\n", utg->id(), utg->tigID(), prt_count, utg->getNumFrags()); for (uint32 fragIdx=0; fragIdx<utg->getNumFrags(); fragIdx++) fprintf(part, "%d\t%d\n", prt_count, utg->ufpath[fragIdx].ident); } fprintf(pari, "Partition %d has %d unitigs and %d fragments.\n", // Don't forget to log the last partition! prt_count, utg_count, frg_count); fclose(pari); fclose(part); fclose(iidm); delete tig; delete tigStore; }
//  Search small unitigs whose external mates mostly point at a single other
//  unitig; such a tig is a candidate "mate bubble" that could be popped into
//  that other tig.  At present this only logs candidates -- it changes nothing.
//
//  Fixes:
//    - the 'lkg' array was leaked when a tig had no usable external mates
//      (the early 'continue' skipped the 'delete [] lkg' at loop bottom);
//    - 'mtig' is now checked for NULL before dereferencing -- an unplaced
//      mate maps to tig id 0, which has no Unitig (see promoteToSingleton's
//      use of fragIn()==0 for "not placed").  NOTE(review): confirm unplaced
//      mates can actually reach this code in the current pipeline.
void popMateBubbles(UnitigVector &unitigs) {
  uint32 nBubblePopped   = 0;   //  Currently unused; reserved for accounting.
  uint32 nBubbleTooBig   = 0;
  uint32 nBubbleConflict = 0;

  writeLog("==> SEARCHING FOR MATE BUBBLES\n");

  //  For each unitig, if all (or most) of the external mates are to a single other unitig (not
  //  counting singletons), then this is a potential bubble popping unitig.
  //
  //  At present, this is exploratory only.

  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig *tig = unitigs[ti];

    if ((tig == NULL) || (tig->ufpath.size() == 0))
      //  No tig here.
      continue;

    if ((tig->getLength() > 1000) || (tig->ufpath.size() >= 3000))
      //  Tig too big.
      continue;

    //if ((tig->getLength() < 150) ||
    //    (tig->ufpath.size() < 5))
    //  //  Tig too small.
    //  continue;

    //  Collect the tig id of every external, non-singleton mate.

    uint32 *lkg    = new uint32 [tig->ufpath.size()];
    uint32  lkgLen = 0;
    uint32  lkgExt = 0;

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode *frg = &tig->ufpath[fi];

      int32 frgID = frg->ident;
      int32 matID = FI->mateIID(frgID);

      if (matID == 0)
        //  No mate.
        continue;

      uint32  mtigID = tig->fragIn(matID);
      Unitig *mtig   = unitigs[mtigID];

      if (mtigID == tig->id())
        //  Mate is not external.
        continue;

      lkgExt++;

      if ((mtig == NULL) || (mtig->ufpath.size() < 2))
        //  Mate is unplaced or in a singleton.
        continue;

      lkg[lkgLen++] = mtigID;
    }

    if (lkgLen == 0) {
      //  No external mates.  (Free the array -- the early continue used to leak it.)
      delete [] lkg;
      continue;
    }

    //  Sort, then report every other tig that received more than three of
    //  our external mates (scan runs of identical ids).

    sort(lkg, lkg+lkgLen);

    uint32 last = lkg[0];
    uint32 lcnt = 1;

    for (uint32 i=1; i<lkgLen; i++) {
      if (last != lkg[i]) {
        if ((lcnt > 3))
          writeLog("popMateBubble()-- tig %d len %d might pop bubble in tig %u (%u mates in there out of %d external mates)\n",
                   tig->id(), tig->getLength(), last, lcnt, lkgExt);
        last = lkg[i];
        lcnt = 0;
      }
      lcnt++;
    }

    //  Don't forget the final run.
    if ((lcnt > 3))
      writeLog("popMateBubble()-- tig %d len %d might pop bubble in tig %u (%u mates in there out of %d external mates)\n",
               tig->id(), tig->getLength(), last, lcnt, lkgExt);

    delete [] lkg;
  }
}
// For every unitig, report the best overlaps contained in the // unitig, and all overlaps contained in the unitig. // // Wow, this is ancient. // void writeOverlapsUsed(UnitigVector &unitigs, char *prefix) { char N[FILENAME_MAX]; sprintf(N, "%s.unused.best.edges", prefix); FILE *F = fopen(N, "w"); for (uint32 ti=0; ti<unitigs.size(); ti++) { Unitig *tig = unitigs[ti]; Unitig *ovl = NULL; char tyt = 'C'; if (tig == NULL) continue; if (tig->_isUnassembled) tyt = 'U'; if (tig->_isBubble) tyt = 'B'; if (tig->_isRepeat) tyt = 'R'; if (tig->_isCircular) tyt = 'O'; for (uint32 fi=0; fi<tig->ufpath.size(); fi++) { ufNode *frg = &tig->ufpath[fi]; ufNode *oth = NULL; // Report the unused best edge BestEdgeOverlap *be5 = OG->getBestEdgeOverlap(frg->ident, false); uint32 rd5 = (be5 == NULL) ? 0 : be5->fragId(); Unitig *tg5 = (be5 == NULL) ? NULL : unitigs[Unitig::fragIn(rd5)]; char ty5 = 'C'; if ((tg5 != NULL) && (tg5->tigID() != tig->tigID())) { uint32 ord = Unitig::pathPosition(rd5); ufNode *oth = &tg5->ufpath[ord]; if (tig->_isUnassembled) ty5 = 'U'; if (tig->_isBubble) ty5 = 'B'; if (tig->_isRepeat) ty5 = 'R'; if (tig->_isCircular) ty5 = 'O'; fprintf(F, "tig %7u %c read %8u at %9u %-9u %c' -- %8d %-8d -- tig %7u %c read %8u at %9u %-9u %c'\n", tig->tigID(), tyt, frg->ident, frg->position.bgn, frg->position.end, '5', be5->ahang(), be5->bhang(), tg5->tigID(), ty5, oth->ident, oth->position.bgn, oth->position.end, (be5->frag3p() == false) ? '5' : '3'); } BestEdgeOverlap *be3 = OG->getBestEdgeOverlap(frg->ident, true); uint32 rd3 = (be3 == NULL) ? 0 : be3->fragId(); Unitig *tg3 = (be3 == NULL) ? 
NULL : unitigs[Unitig::fragIn(rd3)]; char ty3 = 'C'; if ((tg3 != NULL) && (tg3->tigID() != tig->tigID())) { uint32 ord = Unitig::pathPosition(rd3); ufNode *oth = &tg3->ufpath[ord]; if (tig->_isUnassembled) ty3 = 'U'; if (tig->_isBubble) ty3 = 'B'; if (tig->_isRepeat) ty3 = 'R'; if (tig->_isCircular) ty3 = 'O'; fprintf(F, "tig %7u %c read %8u at %9u %-9u %c' -- %8d %-8d -- tig %7u %c read %8u at %9u %-9u %c'\n", tig->tigID(), tyt, frg->ident, frg->position.bgn, frg->position.end, '3', be3->ahang(), be3->bhang(), tg3->tigID(), ty3, oth->ident, oth->position.bgn, oth->position.end, (be3->frag3p() == false) ? '5' : '3'); } } } fclose(F); }