// Walk every fragment that is neither placed in a unitig nor deleted
// (zero length).  When promotion is enabled, each such fragment becomes
// its own single-fragment unitig spanning [0, fragmentLength).  When it
// is disabled, the fragment is logged and marked as ignored instead.
//
//   unitigs                   - container that receives the new unitigs
//   enablePromoteToSingleton  - true: create singletons; false: mark ignored
//
void
promoteToSingleton(UnitigVector &unitigs, bool enablePromoteToSingleton) {

  for (uint32 readID=1; readID<=FI->numFragments(); readID++) {

    // Nothing to do for reads already in a unitig, or for deleted reads.
    if ((Unitig::fragIn(readID) != 0) ||
        (FI->fragmentLength(readID) == 0))
      continue;

    if (!enablePromoteToSingleton) {
      writeLog("promoteToSingleton()-- Repeat fragment " F_U32 " removed from assembly.\n", readID);
      FI->markAsIgnore(readID);

    } else {
      Unitig  *singleton = unitigs.newUnitig(false);
      ufNode   node;

      node.ident             = readID;
      node.contained         = 0;
      node.parent            = 0;
      node.ahang             = 0;
      node.bhang             = 0;
      node.position.bgn      = 0;
      node.position.end      = FI->fragmentLength(readID);
      node.containment_depth = 0;

      singleton->addFrag(node, 0, false);
    }
  }
}
// Create a new single-fragment unitig for every fragment that is neither
// already placed in a unitig nor deleted (zero length).  The fragment is
// placed forward, spanning [0, fragmentLength).
//
//   unitigs - container that receives the new unitigs
//
void
promoteToSingleton(UnitigVector &unitigs) {

  for (uint32 fi=1; fi<=FI->numFragments(); fi++) {
    if (Unitig::fragIn(fi) != 0)
      // Placed already
      continue;

    if (FI->fragmentLength(fi) == 0)
      // Deleted.
      continue;

    Unitig *utg = unitigs.newUnitig(false);

    // Value-initialize so that any field not explicitly assigned below
    // (e.g. containment_depth, which the two-argument overload sets) is
    // zero rather than indeterminate when the node is copied into the unitig.
    ufNode  frag = ufNode();

    frag.ident        = fi;
    frag.contained    = 0;
    frag.parent       = 0;
    frag.ahang        = 0;
    frag.bhang        = 0;
    frag.position.bgn = 0;
    frag.position.end = FI->fragmentLength(fi);

    utg->addFrag(frag, 0, false);
  }
}
// Build a fresh unitig out of the first splitFragsLen entries of
// splitFrags.  Every fragment is shifted by the same offset, chosen so
// that the lower coordinate of the first fragment lands on zero, and the
// first fragment is forced to be uncontained.
//
// The first entry of splitFrags is read (and modified) unconditionally,
// so callers must pass at least one fragment.
//
static
void
makeNewUnitig(UnitigVector &unitigs,
              uint32        splitFragsLen,
              ufNode       *splitFrags) {
  Unitig  *newTig = unitigs.newUnitig(false);
  ufNode  &lead   = splitFrags[0];

  if (logFileFlagSet(LOG_MATE_SPLIT_DISCONTINUOUS))
    writeLog("splitDiscontinuous()-- new tig " F_U32 " with " F_U32 " fragments (starting at frag " F_U32 ").\n",
             newTig->id(), splitFragsLen, lead.ident);

  int shift = -MIN(lead.position.bgn, lead.position.end);

  // Expected to be uncontained already; enforce it regardless.
  lead.contained = 0;

  for (ufNode *node = splitFrags; node != splitFrags + splitFragsLen; node++)
    newTig->addFrag(*node, shift, false);
}
// Try to place every read that is not yet in a unitig, using all of its
// overlaps.
//
// Phase 1 (parallel): for each unplaced read, compute candidate
// placements, discard those with coverage below 0.99, those in
// single-read tigs, and those whose error rate is not consistent with
// the tig, then remember the surviving placement with the lowest
// errors/aligned ratio.
//
// Phase 2 (serial): add each read with a remembered placement to its tig.
// Reads with no acceptable placement are only counted; they are NOT put
// in a new unitig.
//
// Phase 3: sort every tig, since any of them may have received reads.
//
//   unitigs - the tigs to place into
//   prefix  - not used by this function
//
void
placeUnplacedUsingAllOverlaps(UnitigVector &unitigs,
                              const char   *prefix) {
  uint32  fiLimit    = FI->numFragments();
  uint32  numThreads = omp_get_max_threads();
  uint32  blockSize  = (fiLimit < 100 * numThreads) ? numThreads : fiLimit / 99;

  // Indexed by read id.  placedTig[fid] == 0 means "no placement found".
  uint32       *placedTig = new uint32      [FI->numFragments() + 1];
  SeqInterval  *placedPos = new SeqInterval [FI->numFragments() + 1];

  memset(placedTig, 0, sizeof(uint32)      * (FI->numFragments() + 1));
  memset(placedPos, 0, sizeof(SeqInterval) * (FI->numFragments() + 1));

  // Just some logging.  Count the number of reads we try to place.

  uint32   nToPlaceContained = 0;
  uint32   nToPlace          = 0;
  uint32   nPlacedContained  = 0;
  uint32   nPlaced           = 0;
  uint32   nFailedContained  = 0;
  uint32   nFailed           = 0;

  for (uint32 fid=1; fid<FI->numFragments()+1; fid++)
    if (Unitig::fragIn(fid) == 0) {
      if (OG->isContained(fid))
        nToPlaceContained++;
      else
        nToPlace++;
    }

  // numThreads is unsigned, so it is printed with %u.
  writeLog("placeContains()-- placing %u contained and %u unplaced reads, with %u threads.\n",
           nToPlaceContained, nToPlace, numThreads);

  // Do the placing!

#pragma omp parallel for schedule(dynamic, blockSize)
  for (uint32 fid=1; fid<FI->numFragments()+1; fid++) {
    bool  enableLog = true;

    if (Unitig::fragIn(fid) > 0)
      continue;

    // Place the read.

    vector<overlapPlacement>   placements;

    placeFragUsingOverlaps(unitigs, AS_MAX_ERATE, NULL, fid, placements);

    // Search the placements for the highest expected identity placement using all overlaps in the unitig.

    uint32   b = UINT32_MAX;

    for (uint32 i=0; i<placements.size(); i++) {
      Unitig *tig = unitigs[placements[i].tigID];

      if (placements[i].fCoverage < 0.99)
        // Ignore partially placed reads.
        continue;

      if (tig->ufpath.size() == 1)
        // Ignore placements in singletons.
        continue;

      uint32  bgn   = (placements[i].position.bgn < placements[i].position.end) ? placements[i].position.bgn : placements[i].position.end;
      uint32  end   = (placements[i].position.bgn < placements[i].position.end) ? placements[i].position.end : placements[i].position.bgn;

      double  erate = placements[i].errors / placements[i].aligned;

      // ufpath.size() is a size_t; it is narrowed explicitly to match the
      // %6u conversion in the log format strings below.
      uint32  nReads = (uint32)tig->ufpath.size();

      if (tig->overlapConsistentWithTig(5.0, bgn, end, erate) < 0.5) {
        if ((enableLog == true) && (logFileFlagSet(LOG_PLACE_UNPLACED)))
          writeLog("frag %8u tested tig %6u (%6u reads) at %8u-%8u (cov %7.5f erate %6.4f) - HIGH ERROR\n",
                   fid, placements[i].tigID, nReads, placements[i].position.bgn, placements[i].position.end, placements[i].fCoverage, erate);
        continue;
      }

      if ((enableLog == true) && (logFileFlagSet(LOG_PLACE_UNPLACED)))
        writeLog("frag %8u tested tig %6u (%6u reads) at %8u-%8u (cov %7.5f erate %6.4f)\n",
                 fid, placements[i].tigID, nReads, placements[i].position.bgn, placements[i].position.end, placements[i].fCoverage, erate);

      // Keep the candidate with the lowest error rate seen so far.
      if ((b == UINT32_MAX) ||
          (placements[i].errors / placements[i].aligned < placements[b].errors / placements[b].aligned))
        b = i;
    }

    // If we didn't find a best, b will be invalid; set positions for adding to a new tig.
    // If we did, save both the position it was placed at, and the tigID it was placed in.

    if (b == UINT32_MAX) {
      if ((enableLog == true) && (logFileFlagSet(LOG_PLACE_UNPLACED)))
        writeLog("frag %8u remains unplaced\n", fid);
      placedPos[fid].bgn = 0;
      placedPos[fid].end = FI->fragmentLength(fid);
    }

    else {
      if ((enableLog == true) && (logFileFlagSet(LOG_PLACE_UNPLACED)))
        writeLog("frag %8u placed tig %6u (%6u reads) at %8u-%8u (cov %7.5f erate %6.4f)\n",
                 fid, placements[b].tigID, (uint32)unitigs[placements[b].tigID]->ufpath.size(),
                 placements[b].position.bgn, placements[b].position.end,
                 placements[b].fCoverage,
                 placements[b].errors / placements[b].aligned);
      placedTig[fid] = placements[b].tigID;
      placedPos[fid] = placements[b].position;
    }
  }

  // All reads placed, now just dump them in their correct tigs.

  for (uint32 fid=1; fid<FI->numFragments()+1; fid++) {
    Unitig  *tig = NULL;

    if (Unitig::fragIn(fid) > 0)
      continue;

    // If not placed, dump it in a new unitig.  Well, not anymore.  These reads were not placed in
    // any tig initially, were not allowed to seed a tig, and now, could find no place to go.
    // They're garbage.  Plus, it screws up the logging above because we don't know the new tig ID
    // until now.

    if (placedTig[fid] == 0) {
      if (OG->isContained(fid))
        nFailedContained++;
      else
        nFailed++;

      //tig = unitigs.newUnitig(false);
    }

    // Otherwise, it was placed somewhere, grab the tig.

    else {
      if (OG->isContained(fid))
        nPlacedContained++;
      else
        nPlaced++;

      tig = unitigs[placedTig[fid]];
    }

    // Add the read to the tig it was placed in, if any.  Logging for this is above.

    if (tig) {
      // Value-initialize so fields not assigned below are zero rather
      // than indeterminate when the node is copied into the tig.
      ufNode  frg = ufNode();

      frg.ident     = fid;
      frg.contained = 0;
      frg.parent    = 0;
      frg.ahang     = 0;
      frg.bhang     = 0;
      frg.position  = placedPos[fid];

      tig->addFrag(frg, 0, false);
    }
  }

  // Cleanup.

  delete [] placedPos;
  delete [] placedTig;

  writeLog("placeContains()-- Placed %u contained reads and %u unplaced reads.\n", nPlacedContained, nPlaced);
  writeLog("placeContains()-- Failed to place %u contained reads (too high error suspected) and %u unplaced reads (lack of overlaps suspected).\n", nFailedContained, nFailed);

  // But wait!  All the tigs need to be sorted.  Well, not really _all_, but the hard ones to sort
  // are big, and those quite likely had reads added to them, so it's really not worth the effort
  // of tracking which ones need sorting, since the ones that don't need it are trivial to sort.

  for (uint32 ti=1; ti<unitigs.size(); ti++) {
    Unitig *utg = unitigs[ti];

    if (utg)
      utg->sort();
  }
}
// Join the unitig holding join->frFragID ('fr') with part of the unitig
// holding join->toFragID ('to').
//
// Two new unitigs are created.  joinUnitig receives every read of 'fr'
// plus the reads of 'to' on the joined side of the 'to' fragment;
// discUnitig receives the remaining reads of 'to'.  Both donor unitigs
// are then deleted and their slots in 'unitigs' set to NULL.
//
// The 'fr' unitig is assumed to be forward, and assumed to be the one we
// join to.  'to' is reverse-complemented first when join->toFlip is set.
// All reads are added with offset 0; the final sort() calls are relied
// on to make the new unitigs consistent.
//
static
void
joinUnitigs_append(UnitigVector &unitigs, joinEntry *join) {
  uint32    frId = Unitig::fragIn(join->frFragID);
  uint32    toId = Unitig::fragIn(join->toFragID);

  Unitig   *fr   = unitigs[frId];
  Unitig   *to   = unitigs[toId];

  // Over all fragments in the donor unitigs, add them to either the joinUnitig or the discUnitig.

  Unitig   *joinUnitig = unitigs.newUnitig(false);
  Unitig   *discUnitig = unitigs.newUnitig(false);

  // Reverse the 'to' unitig if needed.

  if (join->toFlip)
    to->reverseComplement(true);

  // Look up the position of the 'to' fragment only AFTER any flip: the
  // loops below index the (possibly reversed) path, so a position taken
  // before the flip could refer to the wrong read.
  // NOTE(review): assumes reverseComplement() keeps the path-position
  // table current -- confirm.

  uint32    toIdx = Unitig::pathPosition(join->toFragID);

  // If we're joining off the 5' end of the fr unitig, add the to reads first.

  if (join->frFirst == true) {
    uint32 ii=0;

    for (; ii < toIdx; ii++)
      joinUnitig->addFrag(to->ufpath[ii], 0, false);

    for (; ii < to->ufpath.size(); ii++)
      discUnitig->addFrag(to->ufpath[ii], 0, false);
  }

  // Now add all the fr unitig reads.  These must come from 'fr'; indexing
  // 'to' here would copy the wrong reads and could run off the end of
  // to->ufpath.

  for (uint32 ii=0; ii < fr->ufpath.size(); ii++)
    joinUnitig->addFrag(fr->ufpath[ii], 0, false);

  // If we're not joining off the 5' end, add the to unitig reads last.

  if (join->frFirst == false) {
    uint32 ii = 0;

    for (; ii < toIdx; ii++)
      discUnitig->addFrag(to->ufpath[ii], 0, false);

    for (; ii < to->ufpath.size(); ii++)
      joinUnitig->addFrag(to->ufpath[ii], 0, false);
  }

  // Delete the donor unitigs.

  delete fr;
  delete to;

  unitigs[frId] = NULL;
  unitigs[toId] = NULL;

  // And make sure the new unitigs are consistent.

  joinUnitig->sort();
  discUnitig->sort();
}
// Find "zombie" fragments -- fragments with non-zero length that are not
// listed in the path of any live unitig -- and give each one a new
// single-fragment unitig spanning [0, fragmentLength).
//
//   unitigs - scanned for membership, and receives the new unitigs
//   erate   - not used by this function
//
void
placeZombies(UnitigVector &unitigs, double erate) {

  writeLog("==> SEARCHING FOR ZOMBIES\n");

  uint32  *owner       = new uint32 [FI->numFragments()+1];
  int      resurrected = 0;

  // Start with every fragment unowned...

  for (uint32 fid=0; fid<FI->numFragments()+1; fid++)
    owner[fid] = noUnitig;

  // ...then record the owning unitig for everything found in a live path.

  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig  *tig = unitigs[ti];

    if (tig != NULL)
      for (uint32 pi=0; pi<tig->ufpath.size(); pi++)
        owner[tig->ufpath[pi].ident] = tig->id();
  }

  // Whatever is still unowned and not deleted gets a singleton unitig.
  // (Finding a new container from overlaps is NOT IMPLEMENTED.)

  for (uint32 fid=0; fid<FI->numFragments()+1; fid++) {
    if ((FI->fragmentLength(fid) == 0) ||    // Deleted fragment
        (owner[fid] != noUnitig))            // Valid fragment in a unitig
      continue;

    Unitig  *singleton = unitigs.newUnitig(false);
    ufNode   node;

    node.ident             = fid;
    node.contained         = 0;
    node.parent            = 0;
    node.ahang             = 0;
    node.bhang             = 0;
    node.position.bgn      = 0;
    node.position.end      = FI->fragmentLength(fid);
    node.containment_depth = 0;

    singleton->addFrag(node, 0, false);

    writeLog("placeZombies()-- unitig %d created from zombie fragment %d\n",
             singleton->id(), fid);

    resurrected++;
  }

  writeLog("RESURRECTED %d ZOMBIE FRAGMENT%s.\n", resurrected, (resurrected == 1) ? "" : "s");

  delete [] owner;
}
// Make sure that contained fragments are in the same unitig
// as their container.  Due to sorting, contained fragments
// can come much later in the unitig:
//
// ------------1
//   -------------2
//      --------------3
//        ----4 (contained in 1, too much error keeps it out of 2 and 3)
//
// So, our first pass is to move contained fragments around.
//
// For every unitig with at least two fragments, each fragment is
// classified into one of five cases (see the CASE comments below) and is
// then either
//   - moved into the unitig that holds its best container,
//   - ejected into a brand new single-fragment unitig, or
//   - left in place (and copied to a scratch list).
// If any fragment left the unitig, the unitig's path is cleared and
// rebuilt from the scratch list, shifted so the first kept fragment
// starts at zero.
//
void
UnitigGraph::moveContains(void) {

  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig  *thisUnitig = unitigs[ti];

    // Skip deleted unitigs and unitigs with fewer than two fragments.
    if ((thisUnitig == NULL) ||
        (thisUnitig->ufpath.size() < 2))
      continue;

    MateLocation positions(thisUnitig);

    // Scratch list of the fragments that stay in this unitig; at most
    // every fragment stays, hence the size.
    ufNode  *frags    = new ufNode [thisUnitig->ufpath.size()];
    uint32   fragsLen = 0;

    if (logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS))
      fprintf(logFile, "moveContain unitig %d\n", thisUnitig->id());

    for (uint32 fi=0; fi<thisUnitig->ufpath.size(); fi++) {
      ufNode  *frg = &thisUnitig->ufpath[fi];

      BestContainment   *bestcont = OG->getBestContainer(frg->ident);
      MateLocationEntry  mloc     = positions.getById(frg->ident);

      // IDs of this fragment, its best container (0 if none) and its mate.
      uint32  thisFrgID = frg->ident;
      uint32  contFrgID = (bestcont) ? bestcont->container : 0;
      uint32  mateFrgID = FI->mateIID(frg->ident);

      // The unitigs those three fragments currently live in.
      uint32  thisUtgID = thisUnitig->fragIn(thisFrgID);
      uint32  contUtgID = thisUnitig->fragIn(contFrgID);
      uint32  mateUtgID = thisUnitig->fragIn(mateFrgID);

      // id1 != 0 -> we found the fragment in the mate happiness table
      // isBad -> and the mate is unhappy.
      //
      // What's id1 vs id2 in MateLocationEntry?  Dunno.  All I
      // know is that if there is no mate present, one of those
      // will be 0.  (Similar test used above too.)
      //
      bool    isMated  = (mateFrgID > 0);
      bool    isGrumpy = ((isMated) && (mloc.mleFrgID1 != 0) && (mloc.mleFrgID2 != 0) && (mloc.isGrumpy == true));

      //
      // Figure out what to do.
      //

      bool    moveToContainer = false;
      bool    moveToSingleton = false;

      if ((frg->contained == 0) && (bestcont == NULL)) {
        // CASE 1:  Not contained.  Leave the fragment here.
        //fprintf(logFile, "case1 frag %d fragsLen %d\n", thisFrgID, fragsLen);

      } else if (isMated == false) {
        // CASE 2: Contained but not mated.  Move to be with the
        // container (if the container isn't here).
        //fprintf(logFile, "case2 frag %d contID %d fragsLen %d\n", thisFrgID, contUtgID, fragsLen);

        // NOTE(review): this branch is also reached when frg->contained
        // is non-zero but bestcont is NULL; the move-to-container code
        // below dereferences bestcont -- confirm that cannot happen.
        if (thisUtgID != contUtgID)
          moveToContainer = true;

      } else if ((isGrumpy == true) && (thisUtgID == mateUtgID)) {
        // CASE 3: Not happy, and the frag and mate are together.
        // Kick out to a singleton.

        //fprintf(logFile, "case3 frag %d utg %d mate %d utg %d cont %d utg %d fragsLen %d\n",
        //        thisFrgID, thisUtgID, mateFrgID, mateUtgID, contFrgID, contUtgID, fragsLen);

        // This test repeats the branch condition, so it is always true here.
        if (thisUtgID == mateUtgID)
          moveToSingleton = true;

      } else {

        // This makes for some ugly code (we break the nice if else
        // if else structure we had going on) but the next two cases
        // need to know if there is an overlap to the rest of the
        // unitig.

        // Being in the same unitig as the container counts as an overlap.
        bool  hasOverlap   = (thisUtgID == contUtgID);
        bool  allContained = false;

        if (hasOverlap == false) {
          if (fragsLen == 0) {
            // The first fragment.  Check fragments after to see if
            // there is an overlap (note only frags with an overlap
            // in the layout are tested).  In rare cases, we ejected
            // the container, and left a containee with no overlap to
            // fragments remaining.
            //
            // Note that this checks if there is an overlap to the
            // very first non-contained (aka dovetail) fragment ONLY.
            // If there isn't an overlap to the first non-contained
            // fragment, then that fragment will likely NOT align
            // correctly.

            uint32 ft = fi + 1;

            // The original author flags the two position comparisons
            // below as buggy; they are left untouched here.
#warning 2x BUGS IN COMPARISON HERE

            // Skip all the contains.
            while ((ft < thisUnitig->ufpath.size()) &&
                   (OG->isContained(thisUnitig->ufpath[ft].ident) == true) &&
                   (MAX(frg->position.bgn, frg->position.end) < MIN(thisUnitig->ufpath[ft].position.bgn, thisUnitig->ufpath[ft].position.end)))
              ft++;

            // If the frag is not contained (we could be the
            // container), and overlaps in the layout, see if there
            // is a real overlap.
            if ((ft < thisUnitig->ufpath.size()) &&
                (OG->isContained(thisUnitig->ufpath[ft].ident) == false) &&
                (MAX(frg->position.bgn, frg->position.end) < MIN(thisUnitig->ufpath[ft].position.bgn, thisUnitig->ufpath[ft].position.end)))
              hasOverlap = OG->containHaveEdgeTo(thisFrgID, thisUnitig->ufpath[ft].ident);

          } else {
            // Not the first fragment, search for an overlap to an
            // already placed frag.
            //
            // NOTE(review): earlier path entries that were moved or
            // ejected have had their ident overwritten with 999999999
            // (see "Nuke the fragment" below), and this backward scan
            // can pass such an ident to containHaveEdgeTo() -- confirm
            // that is safe.

            uint32  ft = fi;

            do {
              ft--;

              // OK to overlap to a contained frag; he could be our
              // container.

              hasOverlap = OG->containHaveEdgeTo(thisFrgID, thisUnitig->ufpath[ft].ident);

              // Stop if we found an overlap, or we just checked the
              // first frag in the unitig, or we no longer overlap in
              // the layout.
            } while ((hasOverlap == false) &&
                     (ft > 0) &&
                     (MIN(frg->position.bgn, frg->position.end) < MAX(thisUnitig->ufpath[ft].position.bgn, thisUnitig->ufpath[ft].position.end)));
          }
        }  // end of hasOverlap

        // An unbelievable special case.  When the unitig is just a
        // single container fragment (and any contained frags under
        // it) rule 4 breaks.  The first fragment has no overlap (all
        // later reads are contained) and so we want to eject it to a
        // new unitig.  Since there are multiple fragments in this
        // unitig, the ejection occurs.  Later, all the contains get
        // moved to the new unitig.  And we repeat.  To prevent, we
        // abort the ejection if the unitig is all contained in one
        // fragment.
        //
        if (fragsLen == 0) {
          allContained = true;

          // Stays true only if every later fragment is marked contained.
          for (uint32 ft = fi + 1; ((allContained == true) && (ft < thisUnitig->ufpath.size())); ft++)
            allContained = OG->isContained(thisUnitig->ufpath[ft].ident);
        }

        if (isGrumpy == true) {
          // CASE 4: Not happy and not with the mate.  This one is a
          // bit of a decision.
          //
          // If an overlap exists to the rest of the unitig, we'll
          // leave it here.  We'll also leave it here if the
          // rest of the unitig is all contained in this fragment.
          //
          // If no overlap, and the mate and container are in the
          // same unitig, we'll just eject.  That also implies the
          // other unitig is somewhat large, at least as big as the
          // insert size.
          //
          // Otherwise, we'll move to the container and cross our
          // fingers we place it correctly.  The alternative is to
          // eject, and hope that we didn't also eject the mate to a
          // singleton.

          //fprintf(logFile, "case4 frag %d utg %d mate %d utg %d cont %d utg %d fragsLen %d\n",
          //        thisFrgID, thisUtgID, mateFrgID, mateUtgID, contFrgID, contUtgID, fragsLen);

          // The 'else' below pairs with the inner 'if' (mate and
          // container in the same unitig), not with the outer one.
          if ((hasOverlap == false) && (allContained == false))
            if (mateUtgID == contUtgID)
              moveToSingleton = true;
            else
              moveToContainer = true;

        } else {
          // CASE 5: Happy!  If with container, or an overlap exists to
          // some earlier fragment, leave it here.  Otherwise, eject it
          // to a singleton.  The fragment is ejected instead of moved
          // to be with its container since we don't know which is
          // correct - the mate or the overlap.
          //
          // If not happy, we've already made sure that the mate is not
          // here (that was case 3).

          //fprintf(logFile, "case5 frag %d utg %d mate %d utg %d cont %d utg %d fragsLen %d\n",
          //        thisFrgID, thisUtgID, mateFrgID, mateUtgID, contFrgID, contUtgID, fragsLen);

          // If no overlap (so not with container or no overlap to
          // other frags) eject.
          if ((hasOverlap == false) && (allContained == false))
            moveToSingleton = true;
        }
      }  // End of cases

      //
      // Do it.
      //

      if (moveToContainer == true) {
        // Move the fragment to be with its container.

        Unitig  *thatUnitig = unitigs[contUtgID];
        // This copy is not used again in this branch.
        ufNode   containee  = *frg;

        assert(thatUnitig->id() == contUtgID);

        // Nuke the fragment in the current list
        frg->ident        = 999999999;
        frg->contained    = 999999999;
        frg->position.bgn = 0;
        frg->position.end = 0;

        assert(thatUnitig->id() == contUtgID);

        if (logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS))
          fprintf(logFile, "Moving contained fragment %d from unitig %d to be with its container %d in unitig %d\n",
                  thisFrgID, thisUtgID, contFrgID, contUtgID);

        assert(bestcont->container == contFrgID);

        thatUnitig->addContainedFrag(thisFrgID, bestcont, logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS));
        assert(thatUnitig->id() == Unitig::fragIn(thisFrgID));

      } else if ((moveToSingleton == true) && (thisUnitig->getNumFrags() != 1)) {
        // Eject the fragment to a singleton (unless we ARE the singleton)

        Unitig  *singUnitig = new Unitig(logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS));
        // Copy taken before the path entry is nuked; this copy is what
        // goes into the new unitig.
        ufNode   containee  = *frg;

        // Nuke the fragment in the current list
        frg->ident        = 999999999;
        frg->contained    = 999999999;
        frg->position.bgn = 0;
        frg->position.end = 0;

        if (logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS))
          fprintf(logFile, "Ejecting unhappy contained fragment %d from unitig %d into new unitig %d\n",
                  thisFrgID, thisUtgID, singUnitig->id());

        containee.contained = 0;

        // Shift so the lower coordinate of the ejected fragment is zero.
        singUnitig->addFrag(containee, -MIN(containee.position.bgn, containee.position.end), logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS));

        unitigs.push_back(singUnitig);
        thisUnitig = unitigs[ti];  // Reset the pointer; unitigs might be reallocated

      } else {
        // Leave fragment here.  Copy the fragment to the list -- if
        // we need to rebuild the unitig (because fragments were
        // removed), the list is used, otherwise, we have already
        // made the changes needed.
        //
        // Also, very important, update our containment mark.  If our
        // container was moved, but we stayed put because of a happy
        // mate, we're still marked as being contained.  Rather than
        // put this check in all the places where we stay put in the
        // above if-else-else-else, it's here.

        if ((frg->contained) && (thisUtgID != contUtgID))
          frg->contained = 0;

        frags[fragsLen] = *frg;
        fragsLen++;
      }

    }  // over all frags

    // Now, rebuild this unitig if we made changes.

    if (fragsLen != thisUnitig->ufpath.size()) {
      if (logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS))
        fprintf(logFile, "Rebuild unitig %d after removing contained fragments.\n", thisUnitig->id());

      thisUnitig->ufpath.clear();

      // Occasionally, we move all fragments out of the original unitig.  Might be worth checking
      // if that makes sense!!
      //
#warning EMPTIED OUT A UNITIG
      if (fragsLen > 0) {
        // No need to resort.  Offsets only need adjustment if the first fragment is thrown out.
        // If not, splitOffset will be zero.
        //
        int splitOffset = -MIN(frags[0].position.bgn, frags[0].position.end);

        // This is where we clean up from the splitting not dealing with contained fragments -- we
        // force the first frag to be uncontained.
        //
        frags[0].contained = 0;

        for (uint32 i=0; i<fragsLen; i++)
          thisUnitig->addFrag(frags[i], splitOffset, logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS));
      }
    }

    delete [] frags;
    frags = NULL;

  }  // Over all unitigs
}