//  Search for unitigs that look like mate-supported bubbles.
//
//  For each small unitig, the unitigs holding the mates of its fragments are collected.  If more
//  than three external mates land in one single other unitig (mates in singleton unitigs are not
//  counted), the small unitig is reported as a potential bubble to pop into that other unitig.
//
//  At present, this is exploratory only: candidates are written to the log and no unitig is
//  modified.
//
//  unitigs -- the unitigs to search; entries may be NULL.
//
void
popMateBubbles(UnitigVector &unitigs) {

  writeLog("==> SEARCHING FOR MATE BUBBLES\n");

  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig  *tig = unitigs[ti];

    if ((tig == NULL) ||
        (tig->ufpath.size() == 0))
      //  No tig here.
      continue;

    if ((tig->getLength() > 1000) ||
        (tig->ufpath.size() >= 3000))
      //  Tig too big.
      continue;

    //  A lower bound on size was tried and is disabled:
    //
    //if ((tig->getLength() < 150) ||
    //    (tig->ufpath.size() < 5))
    //  //  Tig too small.
    //  continue;

    //  lkg holds one unitig id per usable external mate; at most one entry per fragment.
    uint32  *lkg    = new uint32 [tig->ufpath.size()];
    uint32   lkgLen = 0;
    uint32   lkgExt = 0;   //  All external mates, including those we cannot use.

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      int32  frgID = tig->ufpath[fi].ident;
      int32  matID = FI->mateIID(frgID);

      if (matID == 0)
        //  No mate.
        continue;

      uint32  mtigID = tig->fragIn(matID);

      if (mtigID == tig->id())
        //  Mate is not external.
        continue;

      lkgExt++;

      Unitig  *mtig = unitigs[mtigID];

      //  The slot for the mate's unitig can be empty (other code in this file checks for
      //  exactly that after a fragIn() lookup), so test before looking at its path.
      if ((mtig == NULL) ||
          (mtig->ufpath.size() < 2))
        //  Mate is not in a unitig, or is in a singleton.
        continue;

      lkg[lkgLen++] = mtigID;
    }

    //  With no usable external mates the scan below is simply skipped; the array is released
    //  on every path (it used to leak when nothing was collected).

    if (lkgLen > 0) {
      std::sort(lkg, lkg + lkgLen);

      //  Walk the runs of equal unitig ids; each run is reported once, when it ends.
      uint32  runBgn = 0;

      for (uint32 i=1; i<=lkgLen; i++) {
        if ((i < lkgLen) && (lkg[i] == lkg[runBgn]))
          continue;

        uint32  last = lkg[runBgn];
        uint32  lcnt = i - runBgn;

        if (lcnt > 3)
          writeLog("popMateBubble()-- tig %d len %d might pop bubble in tig %u (%u mates in there out of %d external mates)\n",
                   tig->id(), tig->getLength(), last, lcnt, lkgExt);

        runBgn = i;
      }
    }

    delete [] lkg;
  }
}
intersectionList::intersectionList(UnitigVector &unitigs) { for (uint32 ti=0; ti<unitigs.size(); ti++) { Unitig *tig = unitigs[ti]; if (tig == NULL) continue; intersectionEvidence *evidence = new intersectionEvidence [tig->ufpath.size()]; for (uint32 fi=0; fi<tig->ufpath.size(); fi++) { ufNode *frg = &tig->ufpath[fi]; if (OG->isContained(frg->ident)) continue; // For my best overlap, the ID of the unitig that the overlapping fragment is in. evidence[fi].edge5 = *OG->getBestEdgeOverlap(frg->ident, false); evidence[fi].edge3 = *OG->getBestEdgeOverlap(frg->ident, true); evidence[fi].frag5tig = tig->fragIn(evidence[fi].edge5.fragId()); evidence[fi].frag3tig = tig->fragIn(evidence[fi].edge3.fragId()); // Do NOT initialize these! An earlier fragment could have already confirmed an end. // Properly, only the 5' end of a forward fragment (or 3' end of a reverse fragment) can be // confirmed already (otherwise the tig is nonsense), but we don't yet check that. // //evidence[fi].frag5confirmed = false; //evidence[fi].frag3confirmed = false; // But, because the path could be promiscuous, not every overlap to a different tig is bad. // // If my best overlap is to a different tig, but there is an overlapping fragment (in the // unitig placement) with a best edge to me, I'm still good. The BOG build this unitig using // the edge from the other fragment to me. // // If the fragments do not overlap in the layout (yet the best edge still exists) that is a // self-intersection. // // The two blocks are identical, except for 'edge3' and 'edge5'. if (evidence[fi].frag5tig == tig->id()) { uint32 ti = tig->pathPosition(evidence[fi].edge5.fragId()); ufNode *trg = &tig->ufpath[ti]; uint32 minf = (frg->position.bgn < frg->position.end) ? frg->position.bgn : frg->position.end; uint32 maxf = (frg->position.bgn < frg->position.end) ? frg->position.end : frg->position.bgn; uint32 mint = (trg->position.bgn < trg->position.end) ? 
trg->position.bgn : trg->position.end; uint32 maxt = (trg->position.bgn < trg->position.end) ? trg->position.end : trg->position.bgn; // If they overlap, mark as confirmed, else remember an intersection. if (((minf < mint) && (mint < maxf)) || // t begins inside f ((mint < minf) && (minf < maxt))) { // f begins inside t if (evidence[fi].edge5.frag3p()) evidence[ti].frag3confirmed = true; else evidence[ti].frag5confirmed = true; } else { evidence[fi].frag5self = true; // Not the correct place to report this. Some of these get confirmed by later fragments. //writeLog("BUG1 F: %d,%d T %d,%d\n", minf, maxf, mint, maxt); //writeLog("INTERSECT from unitig %d frag %d end %d TO unitig %d frag %d end %d (SELF)\n", // tig->id(), frg->ident, 5, evidence[fi].frag5tig, evidence[fi].edge5.fragId(), evidence[fi].edge5.frag3p() ? 3 : 5); } } if (evidence[fi].frag3tig == tig->id()) { uint32 ti = tig->pathPosition(evidence[fi].edge3.fragId()); ufNode *trg = &tig->ufpath[ti]; uint32 minf = (frg->position.bgn < frg->position.end) ? frg->position.bgn : frg->position.end; uint32 maxf = (frg->position.bgn < frg->position.end) ? frg->position.end : frg->position.bgn; uint32 mint = (trg->position.bgn < trg->position.end) ? trg->position.bgn : trg->position.end; uint32 maxt = (trg->position.bgn < trg->position.end) ? trg->position.end : trg->position.bgn; if (((minf < mint) && (mint < maxf)) || // t begins inside f ((mint < minf) && (minf < maxt))) { // f begins inside t if (evidence[fi].edge3.frag3p()) evidence[ti].frag3confirmed = true; else evidence[ti].frag5confirmed = true; } else { evidence[fi].frag3self = true; // Not the correct place to report this. Some of these get confirmed by later fragments. //writeLog("BUG2 F: %d,%d T %d,%d\n", minf, maxf, mint, maxt); //writeLog("INTERSECT from unitig %d frag %d end %d TO unitig %d frag %d end %d (SELF)\n", // tig->id(), frg->ident, 3, evidence[fi].frag3tig, evidence[fi].edge3.fragId(), evidence[fi].edge3.frag3p() ? 
3 : 5); } } } // // Build the list. // for (uint32 fi=0; fi<tig->ufpath.size(); fi++) { ufNode *frg = &tig->ufpath[fi]; if ((evidence[fi].frag5tig != 0) && (evidence[fi].frag5tig != tig->id()) && (evidence[fi].frag5confirmed == false)) isects.push_back(intersectionPoint(evidence[fi].edge5, frg->ident, false, false)); if ((evidence[fi].frag5tig == tig->id()) && (evidence[fi].frag5self == true) && (evidence[fi].frag5confirmed == false)) isects.push_back(intersectionPoint(evidence[fi].edge5, frg->ident, false, true)); if ((evidence[fi].frag3tig != 0) && (evidence[fi].frag3tig != tig->id()) && (evidence[fi].frag3confirmed == false)) isects.push_back(intersectionPoint(evidence[fi].edge3, frg->ident, true, false)); if ((evidence[fi].frag3tig == tig->id()) && (evidence[fi].frag3self == true) && (evidence[fi].frag3confirmed == false)) isects.push_back(intersectionPoint(evidence[fi].edge3, frg->ident, true, true)); } delete [] evidence; } // Sort the intersections by the ID of the intersected fragment, then build an index into the array. std::sort(isects.begin(), isects.end()); // Terminate the intersection list with a sentinal intersection. This is CRITICAL // to the way we iterate over intersections. isects.push_back(intersectionPoint(BestEdgeOverlap(), 0, true, true)); // Build a map from fragment id to the first intersection in the list. for (uint32 i=0; i<isects.size(); i++) { isectsNum[isects[i].isectFrg]++; if (isectsMap.find(isects[i].isectFrg) == isectsMap.end()) isectsMap[isects[i].isectFrg] = i; } }
// For every unitig, report the best overlaps contained in the // unitig, and all overlaps contained in the unitig. // // Wow, this is ancient. // void writeOverlapsUsed(UnitigVector &unitigs, char *fileprefix) { char filename[FILENAME_MAX] = {0}; #if 0 GenericMesg pmesg; OverlapMesg omesg; #endif sprintf(filename, "%s.unused.ovl", fileprefix); FILE *file = fopen(filename, "w"); assert(file != NULL); #if 0 for (uint32 ti=0; ti<unitigs.size(); ti++) { Unitig *utg = unitigs[ti]; if (utg == NULL) continue; for (uint32 fi=0; fi<utg->ufpath.size(); fi++) { ufNode *frg = &utg->ufpath[fi]; // Where is our best overlap? Contained or dovetail? BestEdgeOverlap *bestedge5 = OG->getBestEdgeOverlap(frg->ident, false); BestEdgeOverlap *bestedge3 = OG->getBestEdgeOverlap(frg->ident, true); int bestident5 = 0; int bestident3 = 0; if (bestedge5) { bestident5 = bestedge5->fragId(); if ((bestident5 > 0) && (utg->fragIn(bestident5) != utg->id())) { omesg.aifrag = frg->ident; omesg.bifrag = bestident5; omesg.ahg = bestedge5->ahang(); omesg.bhg = bestedge5->bhang(); omesg.orientation.setIsUnknown(); omesg.overlap_type = AS_DOVETAIL; omesg.quality = 0.0; omesg.min_offset = 0; omesg.max_offset = 0; omesg.polymorph_ct = 0; omesg.alignment_trace = NULL; #ifdef AS_MSG_USE_OVL_DELTA omesg.alignment_delta = NULL; #endif // This overlap is off of the 5' end of this fragment. 
if (bestedge5->frag3p() == false) omesg.orientation.setIsOuttie(); if (bestedge5->frag3p() == true) omesg.orientation.setIsAnti(); pmesg.t = MESG_OVL; pmesg.m = &omesg; WriteProtoMesg_AS(file, &pmesg); } } if (bestedge3) { bestident3 = bestedge3->fragId(); if ((bestident3 > 0) && (utg->fragIn(bestident3) != utg->id())) { omesg.aifrag = frg->ident; omesg.bifrag = bestident3; omesg.ahg = bestedge3->ahang(); omesg.bhg = bestedge3->bhang(); omesg.orientation.setIsUnknown(); omesg.overlap_type = AS_DOVETAIL; omesg.quality = 0.0; omesg.min_offset = 0; omesg.max_offset = 0; omesg.polymorph_ct = 0; omesg.alignment_trace = NULL; #ifdef AS_MSG_USE_OVL_DELTA omesg.alignment_delta = NULL; #endif // This overlap is off of the 3' end of this fragment. if (bestedge3->frag3p() == false) omesg.orientation.setIsNormal(); if (bestedge3->frag3p() == true) omesg.orientation.setIsInnie(); pmesg.t = MESG_OVL; pmesg.m = &omesg; WriteProtoMesg_AS(file, &pmesg); } } } } #endif fclose(file); }
// After splitting and ejecting some contains, check for discontinuous unitigs. // void splitDiscontinuousUnitigs(UnitigVector &unitigs, uint32 minOverlap) { writeLog("==> SPLIT DISCONTINUOUS\n"); uint32 numTested = 0; uint32 numSplit = 0; uint32 numCreated = 0; uint32 splitFragsLen = 0; uint32 splitFragsMax = 0; ufNode *splitFrags = NULL; for (uint32 ti=0; ti<unitigs.size(); ti++) { Unitig *tig = unitigs[ti]; if ((tig == NULL) || (tig->ufpath.size() < 2)) continue; // Unitig must be sorted. Someone upstream os screwing this up. tig->sort(); // We'll want to build an array of new fragments to split out. This can be up // to the size of the largest unitig. splitFragsMax = MAX(splitFragsMax, tig->ufpath.size()); // Check that the unitig starts at position zero. Not critical for the next loop, but // needs to be dome sometime. int32 minPos = MIN(tig->ufpath[0].position.bgn, tig->ufpath[0].position.end); if (minPos == 0) continue; writeLog("splitDiscontinuous()-- tig "F_U32" offset messed up; reset by "F_S32".\n", tig->id(), minPos); for (uint32 fi=0; fi<tig->ufpath.size(); fi++) { ufNode *frg = &tig->ufpath[fi]; frg->position.bgn -= minPos; frg->position.end -= minPos; } } splitFrags = new ufNode [splitFragsMax]; // Now, finally, we can check for gaps in unitigs. for (uint32 ti=0; ti<unitigs.size(); ti++) { Unitig *tig = unitigs[ti]; if ((tig == NULL) || (tig->ufpath.size() < 2)) continue; // We don't expect many unitigs to be broken, so we'll do a first quick pass to just // test if it is. int32 maxEnd = MAX(tig->ufpath[0].position.bgn, tig->ufpath[0].position.end); bool isBroken = false; for (uint32 fi=0; fi<tig->ufpath.size(); fi++) { ufNode *frg = &tig->ufpath[fi]; int32 bgn = MIN(frg->position.bgn, frg->position.end); int32 end = MAX(frg->position.bgn, frg->position.end); if (bgn > maxEnd - minOverlap) { isBroken = true; break; } maxEnd = MAX(maxEnd, end); } numTested++; if (isBroken == false) continue; numSplit++; // Dang, busted unitig. Fix it up. 
splitFragsLen = 0; maxEnd = 0; if (logFileFlagSet(LOG_MATE_SPLIT_DISCONTINUOUS)) writeLog("splitDiscontinuous()-- discontinuous tig "F_U32" with "F_SIZE_T" fragments broken into:\n", tig->id(), tig->ufpath.size()); for (uint32 fi=0; fi<tig->ufpath.size(); fi++) { ufNode *frg = &tig->ufpath[fi]; int32 bgn = MIN(frg->position.bgn, frg->position.end); int32 end = MAX(frg->position.bgn, frg->position.end); // Good thick overlap exists to this fragment, save it. if (bgn <= maxEnd - minOverlap) { assert(splitFragsLen < splitFragsMax); splitFrags[splitFragsLen++] = *frg; maxEnd = MAX(maxEnd, end); continue; } // No thick overlap found. We need to break right here before the current fragment. // If there is exactly one fragment, and it's contained, and it's not mated, move it to the // container. (This has a small positive benefit over just making every read a singleton). // if ((splitFragsLen == 1) && (FI->mateIID(splitFrags[0].ident) == 0) && (splitFrags[0].contained != 0)) { Unitig *dangler = unitigs[tig->fragIn(splitFrags[0].contained)]; // If the parent isn't in a unitig, we must have shattered the repeat unitig it was in. // Do the same here. 
if (dangler == NULL) { if (logFileFlagSet(LOG_MATE_SPLIT_DISCONTINUOUS)) writeLog("splitDiscontinuous()-- singleton frag "F_U32" shattered.\n", splitFrags[0].ident); Unitig::removeFrag(splitFrags[0].ident); } else { assert(dangler->id() == tig->fragIn(splitFrags[0].contained)); if (logFileFlagSet(LOG_MATE_SPLIT_DISCONTINUOUS)) writeLog("splitDiscontinuous()-- old tig "F_U32" with "F_SIZE_T" fragments (contained frag "F_U32" moved here).\n", dangler->id(), dangler->ufpath.size() + 1, splitFrags[0].ident); BestContainment *bestcont = OG->getBestContainer(splitFrags[0].ident); assert(bestcont->isContained == true); dangler->addContainedFrag(splitFrags[0].ident, bestcont, false); dangler->bubbleSortLastFrag(); assert(dangler->id() == Unitig::fragIn(splitFrags[0].ident)); } } // Otherwise, make an entirely new unitig for these fragments. else { numCreated++; makeNewUnitig(unitigs, splitFragsLen, splitFrags); tig = unitigs[ti]; } // Done with the split, save the current fragment. This resets everything. splitFragsLen = 0; splitFrags[splitFragsLen++] = *frg; maxEnd = end; } // If we did any splitting, then the length of the frags in splitFrags will be less than the length // of the path in the current unitig. Make a final new unitig for the remaining fragments. // if (splitFragsLen != tig->ufpath.size()) { numCreated++; makeNewUnitig(unitigs, splitFragsLen, splitFrags); delete unitigs[ti]; unitigs[ti] = NULL; } } writeLog("splitDiscontinuous()-- Tested "F_U32" unitigs, split "F_U32" into "F_U32" new unitigs.\n", numTested, numSplit, numCreated); delete [] splitFrags; }
void UnitigGraph::setParentAndHang(void) { for (uint32 ti=0; ti<unitigs.size(); ti++) { Unitig *utg = unitigs[ti]; if (utg == NULL) continue; if (utg->ufpath.size() == 0) continue; // Reset parent and hangs for everything. for (uint32 fi=1; fi<utg->ufpath.size(); fi++) { ufNode *frg = &utg->ufpath[fi]; frg->parent = 0; frg->ahang = 0; frg->bhang = 0; } // For each fragment, set parent/hangs using the edges. for (uint32 fi=0; fi<utg->ufpath.size(); fi++) { ufNode *frg = &utg->ufpath[fi]; // If we're contained, gee, I sure hope the container is here! BestContainment *bestcont = OG->getBestContainer(frg->ident); if ((bestcont) && (utg->fragIn(bestcont->container) == utg->id())) { int32 pi = utg->pathPosition(bestcont->container); ufNode *par = &utg->ufpath[pi]; frg->parent = bestcont->container; // The hangs assume the container is forward; adjust if not so. if (par->position.bgn < par->position.end) { frg->ahang = bestcont->a_hang; frg->bhang = bestcont->b_hang; } else { frg->ahang = -bestcont->b_hang; frg->bhang = -bestcont->a_hang; } continue; } // Nope, not contained. If we don't have a parent set, see if one of our best overlaps // can set it. BestEdgeOverlap *bestedge5 = OG->getBestEdgeOverlap(frg->ident, false); BestEdgeOverlap *bestedge3 = OG->getBestEdgeOverlap(frg->ident, true); if ((bestedge5->fragId()) && (utg->fragIn(bestedge5->fragId()) == utg->id())) { int32 pi5 = utg->pathPosition(bestedge5->fragId()); ufNode *oth = &utg->ufpath[pi5]; // Consensus is expected parent/hangs to be relative to the parent fragment. This is used // ONLY to place the fragment, not to orient the fragment. Orientation comes from the // absolute positioning coordinates. // // Interestingly, all four overlap transformations are used here. 
// // The inner if tests (on fragment orientation) should be asserts, but due to imprecise // layouts, they are sometimes violated: // A fragment from 271-547 had a 5'overlap to something after it; // the frag after was at 543-272, close enough to a tie to screw up placements // if (pi5 < fi) { // We have an edge off our 5' end to something before us --> fragment MUST be forward. // Flip the overlap so it is relative to the other fragment. if (frg->position.bgn < frg->position.end) { frg->parent = bestedge5->fragId(); frg->ahang = -bestedge5->ahang(); frg->bhang = -bestedge5->bhang(); assert(frg->ahang >= 0); } } else { // We have an edge off our 5' end to something after us --> fragment MUST be reverse. // Because our fragment is now reverse, we must reverse the overlap too. if (frg->position.end < frg->position.bgn) { oth->parent = frg->ident; oth->ahang = -bestedge5->bhang(); oth->bhang = -bestedge5->ahang(); assert(oth->ahang >= 0); } } } if ((bestedge3->fragId()) && (utg->fragIn(bestedge3->fragId()) == utg->id())) { int32 pi3 = utg->pathPosition(bestedge3->fragId()); ufNode *oth = &utg->ufpath[pi3]; if (pi3 < fi) { // We have an edge off our 3' end to something before us --> fragment MUST be reverse. // Flip the overlap so it is relative to the other fragment. // Because our fragment is now reverse, we must reverse the overlap too. if (frg->position.end < frg->position.bgn) { frg->parent = bestedge3->fragId(); frg->ahang = bestedge3->bhang(); frg->bhang = bestedge3->ahang(); assert(frg->ahang >= 0); } } else { // We have an edge off our 3' end to something after us --> fragment MUST be forward. // This is the simplest case, the overlap is already correct. if (frg->position.bgn < frg->position.end) { oth->parent = frg->ident; oth->ahang = bestedge3->ahang(); oth->bhang = bestedge3->bhang(); assert(oth->ahang >= 0); } } } } } }
// Make sure that contained fragments are in the same unitig // as their container. Due to sorting, contained fragments // can come much later in the unitig: // // ------------1 // -------------2 // --------------3 // ----4 (contained in 1, too much error keeps it out of 2 and 3) // // So, our first pass is to move contained fragments around. // void UnitigGraph::moveContains(void) { for (uint32 ti=0; ti<unitigs.size(); ti++) { Unitig *thisUnitig = unitigs[ti]; if ((thisUnitig == NULL) || (thisUnitig->ufpath.size() < 2)) continue; MateLocation positions(thisUnitig); ufNode *frags = new ufNode [thisUnitig->ufpath.size()]; uint32 fragsLen = 0; if (logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS)) fprintf(logFile, "moveContain unitig %d\n", thisUnitig->id()); for (uint32 fi=0; fi<thisUnitig->ufpath.size(); fi++) { ufNode *frg = &thisUnitig->ufpath[fi]; BestContainment *bestcont = OG->getBestContainer(frg->ident); MateLocationEntry mloc = positions.getById(frg->ident); uint32 thisFrgID = frg->ident; uint32 contFrgID = (bestcont) ? bestcont->container : 0; uint32 mateFrgID = FI->mateIID(frg->ident); uint32 thisUtgID = thisUnitig->fragIn(thisFrgID); uint32 contUtgID = thisUnitig->fragIn(contFrgID); uint32 mateUtgID = thisUnitig->fragIn(mateFrgID); // id1 != 0 -> we found the fragment in the mate happiness table // isBad -> and the mate is unhappy. // // What's id1 vs id2 in MateLocationEntry? Dunno. All I // know is that if there is no mate present, one of those // will be 0. (Similar test used above too.) // bool isMated = (mateFrgID > 0); bool isGrumpy = ((isMated) && (mloc.mleFrgID1 != 0) && (mloc.mleFrgID2 != 0) && (mloc.isGrumpy == true)); // // Figure out what to do. // bool moveToContainer = false; bool moveToSingleton = false; if ((frg->contained == 0) && (bestcont == NULL)) { // CASE 1: Not contained. Leave the fragment here. //fprintf(logFile, "case1 frag %d fragsLen %d\n", thisFrgID, fragsLen); } else if (isMated == false) { // CASE 2: Contained but not mated. 
Move to be with the // container (if the container isn't here). //fprintf(logFile, "case2 frag %d contID %d fragsLen %d\n", thisFrgID, contUtgID, fragsLen); if (thisUtgID != contUtgID) moveToContainer = true; } else if ((isGrumpy == true) && (thisUtgID == mateUtgID)) { // CASE 3: Not happy, and the frag and mate are together. // Kick out to a singleton. //fprintf(logFile, "case3 frag %d utg %d mate %d utg %d cont %d utg %d fragsLen %d\n", // thisFrgID, thisUtgID, mateFrgID, mateUtgID, contFrgID, contUtgID, fragsLen); if (thisUtgID == mateUtgID) moveToSingleton = true; } else { // This makes for some ugly code (we break the nice if else // if else structure we had going on) but the next two cases // need to know if there is an overlap to the rest of the // unitig. bool hasOverlap = (thisUtgID == contUtgID); bool allContained = false; if (hasOverlap == false) { if (fragsLen == 0) { // The first fragment. Check fragments after to see if // there is an overlap (note only frags with an overlap // in the layout are tested). In rare cases, we ejected // the container, and left a containee with no overlap to // fragments remaining. // // Note that this checks if there is an overlap to the // very first non-contained (aka dovetail) fragment ONLY. // If there isn't an overlap to the first non-contained // fragment, then that fragment will likely NOT align // correctly. uint32 ft = fi + 1; #warning 2x BUGS IN COMPARISON HERE // Skip all the contains. while ((ft < thisUnitig->ufpath.size()) && (OG->isContained(thisUnitig->ufpath[ft].ident) == true) && (MAX(frg->position.bgn, frg->position.end) < MIN(thisUnitig->ufpath[ft].position.bgn, thisUnitig->ufpath[ft].position.end))) ft++; // If the frag is not contained (we could be the // container), and overlaps in the layout, see if there // is a real overlap. 
if ((ft < thisUnitig->ufpath.size()) && (OG->isContained(thisUnitig->ufpath[ft].ident) == false) && (MAX(frg->position.bgn, frg->position.end) < MIN(thisUnitig->ufpath[ft].position.bgn, thisUnitig->ufpath[ft].position.end))) hasOverlap = OG->containHaveEdgeTo(thisFrgID, thisUnitig->ufpath[ft].ident); } else { // Not the first fragment, search for an overlap to an // already placed frag. uint32 ft = fi; do { ft--; // OK to overlap to a contained frag; he could be our // container. hasOverlap = OG->containHaveEdgeTo(thisFrgID, thisUnitig->ufpath[ft].ident); // Stop if we found an overlap, or we just checked the // first frag in the unitig, or we no longer overlap in // the layout. } while ((hasOverlap == false) && (ft > 0) && (MIN(frg->position.bgn, frg->position.end) < MAX(thisUnitig->ufpath[ft].position.bgn, thisUnitig->ufpath[ft].position.end))); } } // end of hasOverlap // An unbelievabe special case. When the unitig is just a // single container fragment (and any contained frags under // it) rule 4 breaks. The first fragment has no overlap (all // later reads are contained) and so we want to eject it to a // new unitig. Since there are multiple fragments in this // unitig, the ejection occurs. Later, all the contains get // moved to the new unitig. And we repeat. To prevent, we // abort the ejection if the unitig is all contained in one // fragment. // if (fragsLen == 0) { allContained = true; for (uint32 ft = fi + 1; ((allContained == true) && (ft < thisUnitig->ufpath.size())); ft++) allContained = OG->isContained(thisUnitig->ufpath[ft].ident); } if (isGrumpy == true) { // CASE 4: Not happy and not with the mate. This one is a // bit of a decision. // // If an overlap exists to the rest of the unitig, we'll // leave it here. We'll also leave it here if it is the // rest of the unitig is all contained in this fragment. // // If no overlap, and the mate and container are in the // same unitig, we'll just eject. 
That also implies the // other unitig is somewhat large, at least as big as the // insert size. // // Otherwise, we'll move to the container and cross our // fingers we place it correctly. The alternative is to // eject, and hope that we didn't also eject the mate to a // singleton. //fprintf(logFile, "case4 frag %d utg %d mate %d utg %d cont %d utg %d fragsLen %d\n", // thisFrgID, thisUtgID, mateFrgID, mateUtgID, contFrgID, contUtgID, fragsLen); if ((hasOverlap == false) && (allContained == false)) if (mateUtgID == contUtgID) moveToSingleton = true; else moveToContainer = true; } else { // CASE 5: Happy! If with container, or an overlap exists to // some earlier fragment, leave it here. Otherwise, eject it // to a singleton. The fragment is ejected instead of moved // to be with its container since we don't know which is // correct - the mate or the overlap. // // If not happy, we've already made sure that the mate is not // here (that was case 3). //fprintf(logFile, "case5 frag %d utg %d mate %d utg %d cont %d utg %d fragsLen %d\n", // thisFrgID, thisUtgID, mateFrgID, mateUtgID, contFrgID, contUtgID, fragsLen); // If no overlap (so not with container or no overlap to // other frags) eject. if ((hasOverlap == false) && (allContained == false)) moveToSingleton = true; } } // End of cases // // Do it. // if (moveToContainer == true) { // Move the fragment to be with its container. 
Unitig *thatUnitig = unitigs[contUtgID]; ufNode containee = *frg; assert(thatUnitig->id() == contUtgID); // Nuke the fragment in the current list frg->ident = 999999999; frg->contained = 999999999; frg->position.bgn = 0; frg->position.end = 0; assert(thatUnitig->id() == contUtgID); if (logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS)) fprintf(logFile, "Moving contained fragment %d from unitig %d to be with its container %d in unitig %d\n", thisFrgID, thisUtgID, contFrgID, contUtgID); assert(bestcont->container == contFrgID); thatUnitig->addContainedFrag(thisFrgID, bestcont, logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS)); assert(thatUnitig->id() == Unitig::fragIn(thisFrgID)); } else if ((moveToSingleton == true) && (thisUnitig->getNumFrags() != 1)) { // Eject the fragment to a singleton (unless we ARE the singleton) Unitig *singUnitig = new Unitig(logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS)); ufNode containee = *frg; // Nuke the fragment in the current list frg->ident = 999999999; frg->contained = 999999999; frg->position.bgn = 0; frg->position.end = 0; if (logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS)) fprintf(logFile, "Ejecting unhappy contained fragment %d from unitig %d into new unitig %d\n", thisFrgID, thisUtgID, singUnitig->id()); containee.contained = 0; singUnitig->addFrag(containee, -MIN(containee.position.bgn, containee.position.end), logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS)); unitigs.push_back(singUnitig); thisUnitig = unitigs[ti]; // Reset the pointer; unitigs might be reallocated } else { // Leave fragment here. Copy the fragment to the list -- if // we need to rebuild the unitig (because fragments were // removed), the list is used, otherwise, we have already // made the changes needed. // // Also, very important, update our containment mark. If our // container was moved, but we stayed put because of a happy // mate, we're still marked as being contained. 
Rather than // put this check in all the places where we stay put in the // above if-else-else-else, it's here. if ((frg->contained) && (thisUtgID != contUtgID)) frg->contained = 0; frags[fragsLen] = *frg; fragsLen++; } } // over all frags // Now, rebuild this unitig if we made changes. if (fragsLen != thisUnitig->ufpath.size()) { if (logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS)) fprintf(logFile, "Rebuild unitig %d after removing contained fragments.\n", thisUnitig->id()); thisUnitig->ufpath.clear(); // Occasionally, we move all fragments out of the original unitig. Might be worth checking // if that makes sense!! // #warning EMPTIED OUT A UNITIG if (fragsLen > 0) { // No need to resort. Offsets only need adjustment if the first fragment is thrown out. // If not, splitOffset will be zero. // int splitOffset = -MIN(frags[0].position.bgn, frags[0].position.end); // This is where we clean up from the splitting not dealing with contained fragments -- we // force the first frag to be uncontained. // frags[0].contained = 0; for (uint32 i=0; i<fragsLen; i++) thisUnitig->addFrag(frags[i], splitOffset, logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS)); } } delete [] frags; frags = NULL; } // Over all unitigs }