//  Place every contained fragment into the unitig that holds its best container.
//
//  A containee can only be added once its container is in a unitig, and a container may itself be
//  a containee, so the full fragment list is swept repeatedly until a sweep places nothing.
//  Fragments whose container never gets placed are reported as "zombies" and left unplaced.
//  Finally, every non-NULL unitig from index 1 up is re-sorted.
//
//  Side effect: LOG_PLACE_FRAG is cleared from the global logFileFlags.
//
void
placeContainsUsingBestOverlaps(UnitigVector &unitigs) {
  uint32   fragsPlaced  = 1;
  uint32   fragsPending = 0;

  logFileFlags &= ~LOG_PLACE_FRAG;

  while (fragsPlaced > 0) {
    fragsPlaced  = 0;
    fragsPending = 0;

    writeLog("==> PLACING CONTAINED FRAGMENTS\n");

    for (uint32 fid=1; fid<FI->numFragments()+1; fid++) {
      BestContainment  *bestcont = OG->getBestContainer(fid);

      if (bestcont->isContained == false)
        //  Not a contained fragment.
        continue;

      if (Unitig::fragIn(fid) != 0)
        //  Containee already placed.
        continue;

      if (Unitig::fragIn(bestcont->container) == 0) {
        //  Container not placed (yet).
        fragsPending++;
        continue;
      }

      Unitig  *utg = unitigs[Unitig::fragIn(bestcont->container)];

      utg->addContainedFrag(fid, bestcont, logFileFlagSet(LOG_INITIAL_CONTAINED_PLACEMENT));

      //  Report the unitig we tried to add to; the message names a unitig, not the container
      //  fragment.
      if (utg->id() != Unitig::fragIn(fid))
        writeLog("placeContainsUsingBestOverlaps()-- FAILED to add frag %d to unitig %d.\n",
                 fid, utg->id());
      assert(utg->id() == Unitig::fragIn(fid));

      fragsPlaced++;
    }

    writeLog("==> PLACING CONTAINED FRAGMENTS - placed %d fragments; still need to place %d\n",
             fragsPlaced, fragsPending);

    //  Nothing placed but some still pending: their containers will never appear.  The loop
    //  condition (fragsPlaced == 0) terminates the sweep.
    if ((fragsPlaced == 0) && (fragsPending > 0))
      writeLog("Stopping contained fragment placement due to zombies.\n");
  }

  //  Adding contained fragments appends to the unitig paths; restore sorted order.
  for (uint32 ti=1; ti<unitigs.size(); ti++) {
    Unitig  *utg = unitigs[ti];

    if (utg)
      utg->sort();
  }
}
//  Build the chunk graph: for every read that is neither contained nor suspicious, record the
//  sum of the path lengths reachable from its 5' and 3' ends, then sort the per-read table.
//
//  Logging is redirected to the 'ChunkGraph' log for 'output_prefix'.  The _pathLen scratch
//  array only lives for the duration of the constructor.
//
ChunkGraph::ChunkGraph(const char *output_prefix) {

  setLogFile(output_prefix, "ChunkGraph");

  _maxFragment     = FI->numFragments();
  _restrict        = NULL;
  _chunkLengthIter = 0;

  //  Two path-length slots per read (one per end), plus two extra; one table entry per read.
  _pathLen     = new uint32      [_maxFragment * 2 + 2];
  _chunkLength = new ChunkLength [_maxFragment];

  memset(_pathLen,     0, sizeof(uint32)      * (_maxFragment * 2 + 2));
  memset(_chunkLength, 0, sizeof(ChunkLength) * (_maxFragment));

  for (uint32 readID = 1; readID <= _maxFragment; ++readID) {
    if (OG->isContained(readID)) {
      if (logFileFlagSet(LOG_CHUNK_GRAPH))
        writeLog("read %u contained\n", readID);

    } else if (OG->isSuspicious(readID)) {
      if (logFileFlagSet(LOG_CHUNK_GRAPH))
        writeLog("read %u suspicious\n", readID);

    } else {
      //  The 5' end is walked before the 3' end; countFullWidth() updates _pathLen as it goes.
      const uint32  width5 = countFullWidth(FragmentEnd(readID, false));
      const uint32  width3 = countFullWidth(FragmentEnd(readID, true));

      ChunkLength  &slot = _chunkLength[readID - 1];

      slot.fragId = readID;
      slot.cnt    = width5 + width3;
    }
  }

  //  The path lengths were only needed to fill in the table.
  delete [] _pathLen;
  _pathLen = NULL;

  std::sort(_chunkLength, _chunkLength + _maxFragment);
}
//  Create a new unitig from the first 'splitFragsLen' entries of 'splitFrags'.
//
//  Every fragment is added with an offset that moves the lower coordinate of the first fragment
//  to zero.  The first fragment's 'contained' field is cleared in the caller's array.
//
//  splitFrags must hold at least one fragment; the first entry is read unconditionally.
//
static
void
makeNewUnitig(UnitigVector &unitigs,
              uint32        splitFragsLen,
              ufNode       *splitFrags) {

  assert(splitFrags    != NULL);
  assert(splitFragsLen  > 0);

  Unitig *dangler = unitigs.newUnitig(false);

  //  Whitespace between the literals and the format macros is required from C++11 on; without
  //  it the macro name is lexed as a literal suffix.
  if (logFileFlagSet(LOG_MATE_SPLIT_DISCONTINUOUS))
    writeLog("splitDiscontinuous()-- new tig " F_U32 " with " F_U32 " fragments (starting at frag " F_U32 ").\n",
             dangler->id(), splitFragsLen, splitFrags[0].ident);

  int splitOffset = -MIN(splitFrags[0].position.bgn, splitFrags[0].position.end);

  //  This should already be true, but we force it still
  splitFrags[0].contained = 0;

  for (uint32 i=0; i<splitFragsLen; i++)
    dangler->addFrag(splitFrags[i], splitOffset, false);  //logFileFlagSet(LOG_MATE_SPLIT_DISCONTINUOUS));
}
//  Write the current unitigs to an intermediate tigStore named
//  'prefix.<logFileOrder>.name.tigStore'.
//
//  Does nothing unless LOG_INTERMEDIATE_UNITIGS is set.  The number of fragments per partition
//  is the total fragment count divided by 7, 63 or 127, chosen by the summed length of unitigs
//  with more than two fragments.
//
void
reportUnitigs(UnitigVector &unitigs, const char *prefix, const char *name) {

  if (logFileFlagSet(LOG_INTERMEDIATE_UNITIGS) == 0)
    return;

  uint32  numFragsT  = 0;
  uint32  numFragsP  = 0;
  uint64  utgLen     = 0;

  //  Compute average frags per partition.
  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig  *utg = unitigs[ti];

    if (utg == NULL)
      continue;

    numFragsT += utg->ufpath.size();

    if (utg->ufpath.size() > 2)
      utgLen += utg->getLength();
  }

  if      (utgLen < 16 * 1024 * 1024)
    numFragsP = numFragsT / 7;
  else if (utgLen < 64 * 1024 * 1024)
    numFragsP = numFragsT / 63;
  else
    numFragsP = numFragsT / 127;

  //  Bounded formatting: a long prefix or name must not overrun the path buffer, and a truncated
  //  path must not be used as a store name.
  char  tigStorePath[FILENAME_MAX];
  int   pathLen = snprintf(tigStorePath, sizeof(tigStorePath), "%s.%03u.%s.tigStore", prefix, logFileOrder, name);

  if ((pathLen < 0) || ((size_t)pathLen >= sizeof(tigStorePath))) {
    fprintf(stderr, "reportUnitigs()-- tigStore path for prefix '%s' name '%s' is too long; store not written.\n",
            prefix, name);
    return;
  }

  //  Failing to do this results in consensus running about 40 times slower.  Three hours instead of
  //  five minutes.
  setParentAndHang(unitigs);

  writeUnitigsToStore(unitigs, tigStorePath, tigStorePath, numFragsP, false);
}
// Closes the current logFile, opens a new one called 'prefix.logFileOrder.label'. If 'label' is // NULL, the logFile is reset to stderr. void setLogFile(char const *prefix, char const *label) { assert(prefix != NULL); // Allocate space. if (logFileThread == NULL) logFileThread = new logFileInstance [omp_get_max_threads()]; // If writing to stderr, that's all we needed to do. if (logFileFlagSet(LOG_STDERR)) return; // Close out the old. logFileMain.close(); for (int32 tn=0; tn<omp_get_max_threads(); tn++) logFileThread[tn].close(); // Move to the next iteration. logFileOrder++; // Set up for that iteration. logFileMain.set(prefix, logFileOrder, label, 0); for (int32 tn=0; tn<omp_get_max_threads(); tn++) logFileThread[tn].set(prefix, logFileOrder, label, tn+1); // File open is delayed until it is used. if (label != NULL) fprintf(stderr, "setLogFile()-- Now logging to '%s.%03d.%s'\n", prefix, logFileOrder, label); }
void breakUnitigs(UnitigVector &unitigs, char *output_prefix, bool enableIntersectionBreaking) { writeLog("==> BREAKING UNITIGS.\n"); intersectionList *ilist = new intersectionList(unitigs); // Stop when we've seen all current unitigs. Replace tiMax // in the for loop below with unitigs.size() to recursively // split unitigs. uint32 tiMax = unitigs.size(); for (uint32 ti=0; ti<tiMax; ti++) { Unitig *tig = unitigs[ti]; if (tig == NULL) continue; vector<breakPoint> breaks; for (uint32 fi=0; fi<tig->ufpath.size(); fi++) { ufNode *frg = &tig->ufpath[fi]; intersectionPoint *isect = ilist->getIntersection(frg->ident, 0); if (isect == NULL) continue; for (; isect->isectFrg == frg->ident; isect++) { assert(tig->id() == Unitig::fragIn(isect->isectFrg)); // Grab the invading unitig Unitig *inv = unitigs[Unitig::fragIn(isect->invadFrg)]; assert(inv->id() == Unitig::fragIn(isect->invadFrg)); // Grab the best edges off the invading fragment. BestEdgeOverlap *best5 = OG->getBestEdgeOverlap(isect->invadFrg, false); BestEdgeOverlap *best3 = OG->getBestEdgeOverlap(isect->invadFrg, true); // Check if the incoming tig is a spur, and we should just ignore it immediately if ((inv->ufpath.size() == 1) && ((best5->fragId() == 0) || (best3->fragId() == 0))) { if (logFileFlagSet(LOG_INTERSECTION_BREAKING)) writeLog("unitig %d frag %d end %c' into unitig %d frag %d end %c' -- IS A SPUR, skip it\n", inv->id(), isect->invadFrg, isect->invad3p ? '3' : '5', tig->id(), isect->isectFrg, isect->isect3p ? '3' : '5'); continue; } // Keep only significant intersections if ((inv->getLength() > MIN_BREAK_LENGTH) && (inv->ufpath.size() > MIN_BREAK_FRAGS)) { if (logFileFlagSet(LOG_INTERSECTION_BREAKING)) writeLog("unitig %d frag %d end %c' into unitig %d frag %d end %c'\n", inv->id(), isect->invadFrg, isect->invad3p ? '3' : '5', tig->id(), isect->isectFrg, isect->isect3p ? 
'3' : '5'); breaks.push_back(breakPoint(isect->isectFrg, isect->isect3p, true, false)); } } // Over all incoming fragments // If this is the last fragment, terminate the break point list with a 'fakeEnd' (in AS_BAT_Breaking.cc) break point // at the end of the unitig. if ((fi+1 == tig->ufpath.size()) && (breaks.size() > 0)) { breaks.push_back(breakPoint(frg->ident, (frg->position.bgn < frg->position.end), true, false)); } } // Over all fragments in the unitig if (breaks.size() == 0) continue; // Report where breaks occur. 'breaks' is a list, not a vector. #if 0 // We've lost the fields in breaks[i] -- but the reports above aren't updated yet. if (logFileFlagSet(LOG_INTERSECTION_BREAKING) || logFileFlagSet(LOG_MATE_SPLIT_COVERAGE_PLOT)) for (uint32 i=0; i<breaks.size(); i++) writeLog("BREAK unitig %d at position %d,%d from inSize %d inFrags %d.\n", tig->id(), breaks[i].fragPos.bgn, breaks[i].fragPos.end, breaks[i].inSize, breaks[i].inFrags); #endif // Actually do the breaking. if (enableIntersectionBreaking) breakUnitigAt(unitigs, tig, breaks, true); breaks.clear(); } // Over all unitigs }
// After splitting and ejecting some contains, check for discontinuous unitigs. // void splitDiscontinuousUnitigs(UnitigVector &unitigs, uint32 minOverlap) { writeLog("==> SPLIT DISCONTINUOUS\n"); uint32 numTested = 0; uint32 numSplit = 0; uint32 numCreated = 0; uint32 splitFragsLen = 0; uint32 splitFragsMax = 0; ufNode *splitFrags = NULL; for (uint32 ti=0; ti<unitigs.size(); ti++) { Unitig *tig = unitigs[ti]; if ((tig == NULL) || (tig->ufpath.size() < 2)) continue; // Unitig must be sorted. Someone upstream os screwing this up. tig->sort(); // We'll want to build an array of new fragments to split out. This can be up // to the size of the largest unitig. splitFragsMax = MAX(splitFragsMax, tig->ufpath.size()); // Check that the unitig starts at position zero. Not critical for the next loop, but // needs to be dome sometime. int32 minPos = MIN(tig->ufpath[0].position.bgn, tig->ufpath[0].position.end); if (minPos == 0) continue; writeLog("splitDiscontinuous()-- tig "F_U32" offset messed up; reset by "F_S32".\n", tig->id(), minPos); for (uint32 fi=0; fi<tig->ufpath.size(); fi++) { ufNode *frg = &tig->ufpath[fi]; frg->position.bgn -= minPos; frg->position.end -= minPos; } } splitFrags = new ufNode [splitFragsMax]; // Now, finally, we can check for gaps in unitigs. for (uint32 ti=0; ti<unitigs.size(); ti++) { Unitig *tig = unitigs[ti]; if ((tig == NULL) || (tig->ufpath.size() < 2)) continue; // We don't expect many unitigs to be broken, so we'll do a first quick pass to just // test if it is. int32 maxEnd = MAX(tig->ufpath[0].position.bgn, tig->ufpath[0].position.end); bool isBroken = false; for (uint32 fi=0; fi<tig->ufpath.size(); fi++) { ufNode *frg = &tig->ufpath[fi]; int32 bgn = MIN(frg->position.bgn, frg->position.end); int32 end = MAX(frg->position.bgn, frg->position.end); if (bgn > maxEnd - minOverlap) { isBroken = true; break; } maxEnd = MAX(maxEnd, end); } numTested++; if (isBroken == false) continue; numSplit++; // Dang, busted unitig. Fix it up. 
splitFragsLen = 0; maxEnd = 0; if (logFileFlagSet(LOG_MATE_SPLIT_DISCONTINUOUS)) writeLog("splitDiscontinuous()-- discontinuous tig "F_U32" with "F_SIZE_T" fragments broken into:\n", tig->id(), tig->ufpath.size()); for (uint32 fi=0; fi<tig->ufpath.size(); fi++) { ufNode *frg = &tig->ufpath[fi]; int32 bgn = MIN(frg->position.bgn, frg->position.end); int32 end = MAX(frg->position.bgn, frg->position.end); // Good thick overlap exists to this fragment, save it. if (bgn <= maxEnd - minOverlap) { assert(splitFragsLen < splitFragsMax); splitFrags[splitFragsLen++] = *frg; maxEnd = MAX(maxEnd, end); continue; } // No thick overlap found. We need to break right here before the current fragment. // If there is exactly one fragment, and it's contained, and it's not mated, move it to the // container. (This has a small positive benefit over just making every read a singleton). // if ((splitFragsLen == 1) && (FI->mateIID(splitFrags[0].ident) == 0) && (splitFrags[0].contained != 0)) { Unitig *dangler = unitigs[tig->fragIn(splitFrags[0].contained)]; // If the parent isn't in a unitig, we must have shattered the repeat unitig it was in. // Do the same here. 
if (dangler == NULL) { if (logFileFlagSet(LOG_MATE_SPLIT_DISCONTINUOUS)) writeLog("splitDiscontinuous()-- singleton frag "F_U32" shattered.\n", splitFrags[0].ident); Unitig::removeFrag(splitFrags[0].ident); } else { assert(dangler->id() == tig->fragIn(splitFrags[0].contained)); if (logFileFlagSet(LOG_MATE_SPLIT_DISCONTINUOUS)) writeLog("splitDiscontinuous()-- old tig "F_U32" with "F_SIZE_T" fragments (contained frag "F_U32" moved here).\n", dangler->id(), dangler->ufpath.size() + 1, splitFrags[0].ident); BestContainment *bestcont = OG->getBestContainer(splitFrags[0].ident); assert(bestcont->isContained == true); dangler->addContainedFrag(splitFrags[0].ident, bestcont, false); dangler->bubbleSortLastFrag(); assert(dangler->id() == Unitig::fragIn(splitFrags[0].ident)); } } // Otherwise, make an entirely new unitig for these fragments. else { numCreated++; makeNewUnitig(unitigs, splitFragsLen, splitFrags); tig = unitigs[ti]; } // Done with the split, save the current fragment. This resets everything. splitFragsLen = 0; splitFrags[splitFragsLen++] = *frg; maxEnd = end; } // If we did any splitting, then the length of the frags in splitFrags will be less than the length // of the path in the current unitig. Make a final new unitig for the remaining fragments. // if (splitFragsLen != tig->ufpath.size()) { numCreated++; makeNewUnitig(unitigs, splitFragsLen, splitFrags); delete unitigs[ti]; unitigs[ti] = NULL; } } writeLog("splitDiscontinuous()-- Tested "F_U32" unitigs, split "F_U32" into "F_U32" new unitigs.\n", numTested, numSplit, numCreated); delete [] splitFrags; }
//  Classify every mate pair in _table and record the result in the per-category range graphs
//  and counters.
//
//  For each entry after the zeroth, the pair ends up in exactly one category: external (mate in
//  another unitig; bad only if the mate should have fit in this one), normal, anti, outtie,
//  compressed, stretched, good, or good-because-circular.  Each category updates its range graph
//  via incrRange() and its counter, which is indexed by the number of mates (0, 1 or 2) that
//  are flagged contained.  loc.isGrumpy is left true for bad pairs and reset to false for good
//  ones.  All bad categories jump to 'markBad', which additionally marks badFwd / badRev for
//  whichever fragments are in this unitig.
//
//  Entries are skipped, with isGrumpy untouched, when the library is zero, the library has no
//  valid insert size statistics, or either fragment is at least as long as the upper size limit.
//
void MateLocation::buildHappinessGraphs(UnitigVector &unitigs) {

  //  First entry is always zero.  Needed for the getById() accessor.
  assert(_table[0].mleFrgID1 == 0);
  assert(_table[0].mleFrgID2 == 0);

  assert(IS != NULL);

  for (uint32 mleidx=1; mleidx<_table.size(); mleidx++) {
    MateLocationEntry &loc = _table[mleidx];

    //  We MUST have mleFrgID1 defined.  If mleFrgID2 is not defined, then the mate is external.
    assert(loc.mleFrgID1 != 0);

    //  Well, this bit of ugly is here to fill out the location of the mate (when it is in a
    //  different unitig, the location for this fragment is not set)....EXCEPT that the mate might
    //  not even be placed in a unitig yet (if it is a contain, and we're called before contains are
    //  placed).
    //
    //  This is currently only used for logging -- AND statistics on mate happiness.  Later we
    //  should use it to determine if the mate is buried in the other unitig, which would make the
    //  fragment in this unitig bad.
    //
    //  NOTE(review): fragIn() and pathPosition() are called through _tig but the position is read
    //  from the mate's unitig 'mt'; presumably both are global (static) lookups -- confirm.
    //
    if (loc.mleFrgID2 == 0) {
      assert(loc.mleUtgID2 == 0);
      loc.mleFrgID2 = FI->mateIID(loc.mleFrgID1);
      loc.mleUtgID2 = _tig->fragIn(loc.mleFrgID2);

      if (loc.mleUtgID2 != 0) {
        Unitig  *mt = unitigs[loc.mleUtgID2];
        uint32   fi = _tig->pathPosition(loc.mleFrgID2);
        loc.mlePos2 = mt->ufpath[fi].position;
      }
    }

    uint32 lib =  FI->libraryIID(loc.mleFrgID1);

    if (lib == 0)
      //  Shouldn't occur, but just in case, ignore fragments in the legacy library.
      continue;

    if (IS->valid(lib) == false)
      //  Don't check libs that we didn't generate good stats for
      continue;

    //  Acceptable insert size limits: mean +/- a number of standard deviations.
    int32 badMaxInter = static_cast<int32>(IS->mean(lib) + BADMATE_INTER_STDDEV * IS->stddev(lib));
    int32 badMinInter = static_cast<int32>(IS->mean(lib) - BADMATE_INTER_STDDEV * IS->stddev(lib));

    int32 badMaxIntra = static_cast<int32>(IS->mean(lib) + BADMATE_INTRA_STDDEV * IS->stddev(lib));
    int32 badMinIntra = static_cast<int32>(IS->mean(lib) - BADMATE_INTRA_STDDEV * IS->stddev(lib));

    //  To keep the results the same as the previous version (1.89), the intra limits computed
    //  above are overridden by the inter limits.
    badMaxIntra = badMaxInter;
    badMinIntra = badMinInter;

    //  These are declared (and initialized) before the first 'goto markBad' so that no jump
    //  crosses an initialization.
    int32 dist = 0;
    int32 bgn  = 0;
    int32 end  = 0;

    //  Bgn and End MUST be signed.
    int32 matBgn = loc.mlePos2.bgn;
    int32 matEnd = loc.mlePos2.end;
    int32 matLen = (matBgn < matEnd) ? (matEnd - matBgn) : (matBgn - matEnd);

    int32 frgBgn = loc.mlePos1.bgn;
    int32 frgEnd = loc.mlePos1.end;
    int32 frgLen = (frgBgn < frgEnd) ? (frgEnd - frgBgn) : (frgBgn - frgEnd);

    //  Number of fragments in this pair that are flagged contained; selects the counter slot.
    int32 nContained  = 0;

    if (OG->getBestContainer(loc.mleFrgID1)->isContained == true)
      nContained++;
    if (OG->getBestContainer(loc.mleFrgID2)->isContained == true)
      nContained++;

    if ((matLen >= MIN(badMaxInter, badMaxIntra)) ||
        (frgLen >= MIN(badMaxInter, badMaxIntra)))
      //  Yikes, fragment longer than insert size!
      continue;

    //  Until reset, assume this is a bad mate pair.
    loc.isGrumpy = true;

    //  Unitig lengths are only reported in the log lines, and are currently always logged as zero.
    int32 ULEN1 = 0;  //unitigs[loc.mleUtgID1]->getLength()
    int32 ULEN2 = 0;  //unitigs[loc.mleUtgID2]->getLength()

    //  If the mate is in another unitig, mark bad only if there is enough space to fit the mate in
    //  this unitig.
    //
    //  TODO: OR if there isn't enough space on the end of the OTHER unitig
    //
    if (loc.mleUtgID1 != loc.mleUtgID2) {
      //  Reverse fragment: the mate belongs to the left; bad if the unitig extends at least
      //  badMaxInter in that direction.  Note incrRange() is called twice on badExternalRev.
      if ((isReverse(loc.mlePos1) == true)  && (badMaxInter < frgBgn)) {
        incrRange(badExternalRev, -1, frgBgn - badMaxInter, frgEnd);
        incrRange(badExternalRev, -1, frgBgn, frgEnd);
        nbadExternalRev[nContained]++;
        if (logFileFlagSet(LOG_HAPPINESS))
          writeLog("buildHappinessGraph()-- unitig %d (len %d) frag %d pos %d,%d (len %d) and unitig %d (len %d) frag %d pos %d,%d (len %d) -- bad external reverse\n",
                   loc.mleUtgID1, ULEN1, loc.mleFrgID1, frgBgn, frgEnd, frgLen,
                   loc.mleUtgID2, ULEN2, loc.mleFrgID2, matBgn, matEnd, matLen);
        goto markBad;
      }

      //  Forward fragment: the mate belongs to the right; bad if the unitig extends at least
      //  badMaxInter past the fragment start.  Note incrRange() is called twice on badExternalFwd.
      if ((isReverse(loc.mlePos1) == false) && (badMaxInter < _tigLen - frgBgn)) {
        incrRange(badExternalFwd, -1, frgEnd, frgBgn + badMaxInter);
        incrRange(badExternalFwd, -1, frgEnd, frgBgn);
        nbadExternalFwd[nContained]++;
        if (logFileFlagSet(LOG_HAPPINESS))
          writeLog("buildHappinessGraph()-- unitig %d (len %d) frag %d pos %d,%d (len %d) and unitig %d (len %d) frag %d pos %d,%d (len %d) -- bad external forward\n",
                   loc.mleUtgID1, ULEN1, loc.mleFrgID1, frgBgn, frgEnd, frgLen,
                   loc.mleUtgID2, ULEN2, loc.mleFrgID2, matBgn, matEnd, matLen);
        goto markBad;
      }

      //  Not enough space.  Not a grumpy mate pair.
      loc.isGrumpy = false;

      if (isReverse(loc.mlePos1) == true)
        ngoodExternalRev[nContained]++;
      else
        ngoodExternalFwd[nContained]++;

      if (logFileFlagSet(LOG_HAPPINESS))
        writeLog("buildHappinessGraph()-- unitig %d (len %d) frag %d pos %d,%d (len %d) and unitig %d (len %d) frag %d pos %d,%d (len %d) -- not bad, not enough space\n",
                 loc.mleUtgID1, ULEN1, loc.mleFrgID1, frgBgn, frgEnd, frgLen,
                 loc.mleUtgID2, ULEN2, loc.mleFrgID2, matBgn, matEnd, matLen);
      continue;
    }

    //  Both mates are in this unitig.

    //  Same orientation?  Both forward is 'normal', both reverse is 'anti'.

    if ((isReverse(loc.mlePos1) == false) &&
        (isReverse(loc.mlePos2) == false)) {
      incrRange(badNormal, -1, MIN(frgBgn, matBgn), MAX(frgEnd, matEnd));
      nbadNormal[nContained]++;
      if (logFileFlagSet(LOG_HAPPINESS))
        writeLog("buildHappinessGraph()-- unitig %d (len %d) frag %d pos %d,%d (len %d) and unitig %d (len %d) frag %d pos %d,%d (len %d) -- bad normal\n",
                 loc.mleUtgID1, ULEN1, loc.mleFrgID1, frgBgn, frgEnd, frgLen,
                 loc.mleUtgID2, ULEN2, loc.mleFrgID2, matBgn, matEnd, matLen);
      goto markBad;
    }

    if ((isReverse(loc.mlePos1) == true) &&
        (isReverse(loc.mlePos2) == true)) {
      incrRange(badAnti, -1, MIN(frgEnd, matEnd), MAX(frgBgn, matBgn));
      nbadAnti[nContained]++;
      if (logFileFlagSet(LOG_HAPPINESS))
        writeLog("buildHappinessGraph()-- unitig %d (len %d) frag %d pos %d,%d (len %d) and unitig %d (len %d) frag %d pos %d,%d (len %d) -- bad anti\n",
                 loc.mleUtgID1, ULEN1, loc.mleFrgID1, frgBgn, frgEnd, frgLen,
                 loc.mleUtgID2, ULEN2, loc.mleFrgID2, matBgn, matEnd, matLen);
      goto markBad;
    }

    //  Check a special case for a circular unitig, outtie mates, but close enough to the end to
    //  plausibly be linking the ends together.
    //
    //   <---              --->
    //  ========unitig==========
    //
    if ((isReverse(loc.mlePos1) == true) &&
        (badMinIntra             <= frgBgn + _tigLen - matBgn) &&
        (frgBgn + _tigLen - matBgn <= badMaxIntra)) {
      loc.isGrumpy = false;  //  IT'S GOOD, kind of.
      ngood[nContained]++;
      if (logFileFlagSet(LOG_HAPPINESS))
        writeLog("buildHappinessGraph()-- unitig %d (len %d) frag %d pos %d,%d (len %d) and unitig %d (len %d) frag %d pos %d,%d (len %d) -- good because circular\n",
                 loc.mleUtgID1, ULEN1, loc.mleFrgID1, frgBgn, frgEnd, frgLen,
                 loc.mleUtgID2, ULEN2, loc.mleFrgID2, matBgn, matEnd, matLen);
      continue;
    }

    //  Outties?  True if pos1.end < pos2.bgn.  (For the second case, swap pos1 and pos2)
    //
    //  (pos1.end) <------   (pos1.bgn)
    //  (pos2.bgn)  -------> (pos2.end)
    //
    if ((isReverse(loc.mlePos1) == true)  && (loc.mlePos1.end < loc.mlePos2.bgn)) {
      incrRange(badOuttie, -1, MIN(frgBgn, frgEnd), MAX(matBgn, matEnd));
      nbadOuttie[nContained]++;
      if (logFileFlagSet(LOG_HAPPINESS))
        writeLog("buildHappinessGraph()-- unitig %d (len %d) frag %d pos %d,%d (len %d) and unitig %d (len %d) frag %d pos %d,%d (len %d) -- bad outtie (case 1)\n",
                 loc.mleUtgID1, ULEN1, loc.mleFrgID1, frgBgn, frgEnd, frgLen,
                 loc.mleUtgID2, ULEN2, loc.mleFrgID2, matBgn, matEnd, matLen);
      goto markBad;
    }
    if ((isReverse(loc.mlePos1) == false) && (loc.mlePos2.end < loc.mlePos1.bgn)) {
      incrRange(badOuttie, -1, MIN(frgBgn, frgEnd), MAX(matBgn, matEnd));
      nbadOuttie[nContained]++;
      if (logFileFlagSet(LOG_HAPPINESS))
        writeLog("buildHappinessGraph()-- unitig %d (len %d) frag %d pos %d,%d (len %d) and unitig %d (len %d) frag %d pos %d,%d (len %d) -- bad outtie (case 2)\n",
                 loc.mleUtgID1, ULEN1, loc.mleFrgID1, frgBgn, frgEnd, frgLen,
                 loc.mleUtgID2, ULEN2, loc.mleFrgID2, matBgn, matEnd, matLen);
      goto markBad;
    }

    //  So, now not NORMAL or ANTI or OUTTIE.  We must be left with innies.  The insert size is
    //  the distance between the two begin coordinates.

    if (isReverse(loc.mlePos1) == false)
      //  First fragment is on the left, second is on the right.
      dist = loc.mlePos2.bgn - loc.mlePos1.bgn;
    else
      //  First fragment is on the right, second is on the left.
      dist = loc.mlePos1.bgn - loc.mlePos2.bgn;

    assert(dist >= 0);

    if (dist < badMinIntra) {
      incrRange(badCompressed, -1, MIN(frgBgn, matBgn), MAX(frgBgn, matBgn));
      nbadCompressed[nContained]++;
      if (logFileFlagSet(LOG_HAPPINESS))
        writeLog("buildHappinessGraph()-- unitig %d (len %d) frag %d pos %d,%d (len %d) and unitig %d (len %d) frag %d pos %d,%d (len %d) -- bad compressed\n",
                 loc.mleUtgID1, ULEN1, loc.mleFrgID1, frgBgn, frgEnd, frgLen,
                 loc.mleUtgID2, ULEN2, loc.mleFrgID2, matBgn, matEnd, matLen);
      goto markBad;
    }

    if (badMaxIntra < dist) {
      incrRange(badStretched, -1, MIN(frgBgn, matBgn), MAX(frgBgn, matBgn));
      nbadStretched[nContained]++;
      if (logFileFlagSet(LOG_HAPPINESS))
        writeLog("buildHappinessGraph()-- unitig %d (len %d) frag %d pos %d,%d (len %d) and unitig %d (len %d) frag %d pos %d,%d (len %d) -- bad stretched\n",
                 loc.mleUtgID1, ULEN1, loc.mleFrgID1, frgBgn, frgEnd, frgLen,
                 loc.mleUtgID2, ULEN2, loc.mleFrgID2, matBgn, matEnd, matLen);
      goto markBad;
    }

    //  Innie pair at an acceptable distance.
    assert(badMinIntra <= dist);
    assert(dist        <= badMaxIntra);

    incrRange(good, 1, MIN(frgBgn, matBgn), MAX(frgBgn, matBgn));
    loc.isGrumpy = false;  //  IT'S GOOD!
    ngood[nContained]++;
    if (logFileFlagSet(LOG_HAPPINESS))
      writeLog("buildHappinessGraph()-- unitig %d (len %d) frag %d pos %d,%d (len %d) and unitig %d (len %d) frag %d pos %d,%d (len %d) -- GOOD!\n",
               loc.mleUtgID1, ULEN1, loc.mleFrgID1, frgBgn, frgEnd, frgLen,
               loc.mleUtgID2, ULEN2, loc.mleFrgID2, matBgn, matEnd, matLen);

    continue;

  markBad:

    //  Mark bad from the 3' end of the fragment till the upper limit where the mate should go.
    //  Only fragments that are in this unitig are marked.

    if (loc.mleUtgID1 == _tig->id()) {
      assert(loc.mleFrgID1 != 0);
      if (isReverse(loc.mlePos1) == false) {
        //  Mark bad for forward fragment 1
        assert(frgBgn < frgEnd);
        bgn = frgEnd;
        end = frgBgn + badMaxIntra;
        incrRange(badFwd, -1, bgn, end);
      } else {
        //  Mark bad for reverse fragment 1
        assert(frgEnd < frgBgn);
        bgn = frgBgn - badMaxIntra;
        end = frgEnd;
        incrRange(badRev, -1, bgn, end);
      }
    }

    if (loc.mleUtgID2 == _tig->id()) {
      assert(loc.mleFrgID2 != 0);
      if (isReverse(loc.mlePos2) == false) {
        //  Mark bad for forward fragment 2
        assert(matBgn < matEnd);
        bgn = matEnd;
        end = matBgn + badMaxIntra;
        incrRange(badFwd, -1, bgn, end);
      } else {
        //  Mark bad for reverse fragment 2
        assert(matEnd < matBgn);
        bgn = matBgn - badMaxIntra;
        end = matEnd;
        incrRange(badRev, -1, bgn, end);
      }
    }
  }  //  Over all MateLocationEntries in the table
}  //  buildHappinessGraph()
//  Return the length of the best-edge path that starts at 'firstEnd', filling in _pathLen for
//  every fragment end visited along the way.
//
//  The path is followed with OG->followOverlap() until it runs out of edges, reaches an end
//  whose length is already known, or loops back onto itself.  Ends on a cycle all get the length
//  of the cycle; every other end gets the number of ends from it to the end of the path.
//
uint32
ChunkGraph::countFullWidth(FragmentEnd firstEnd) {
  const uint64  startIdx = getIndex(firstEnd);

  assert(startIdx < _maxFragment * 2 + 2);

  //  Already computed on an earlier walk.
  if (_pathLen[startIdx] > 0)
    return _pathLen[startIdx];

  std::set<FragmentEnd>  visited;
  uint32                 steps   = 0;
  FragmentEnd            stopEnd = firstEnd;
  uint64                 stopIdx = startIdx;

  //  Forward walk.  Number the ends 1, 2, 3... FROM THE START, until we fall off the chain or
  //  reach an end that already has a length.

  for (; (stopIdx != 0) && (_pathLen[stopIdx] == 0); stopIdx = getIndex(stopEnd)) {
    visited.insert(stopEnd);

    steps += 1;
    _pathLen[stopIdx] = steps;

    stopEnd = OG->followOverlap(stopEnd);
  }

  //  Why did the walk stop?
  //
  //    - no more best edges:  stopEnd has fragment id zero; nothing to adjust.
  //    - a cycle:             stopEnd was numbered by THIS walk (it is in 'visited').  Every end
  //                           on the cycle gets the cycle length.
  //    - a known path:        stopEnd was numbered by an earlier walk; our path continues into
  //                           it, so add its length.
  //
  //  In the last two cases 'stopEnd' / 'stopIdx' is the first end we had seen before.

  if (stopEnd.fragId() != 0) {
    if (visited.count(stopEnd) > 0) {
      const uint32  cycleLen = steps - _pathLen[stopIdx] + 1;

      FragmentEnd   loopEnd = stopEnd;
      uint64        loopIdx = stopIdx;

      do {
        _pathLen[loopIdx] = cycleLen;

        loopEnd = OG->followOverlap(loopEnd);
        loopIdx = getIndex(loopEnd);
      } while (stopEnd != loopEnd);

    } else {
      steps += _pathLen[stopIdx];
    }
  }

  //  This is the value we expect to return.
  const uint32  expected = steps;

  //  Backward numbering.  Walk from the start again, turning "distance from the start" into
  //  "distance from the end".  The walk ends at 'stopEnd', so cycle members and previously
  //  known paths are left as they are.

  FragmentEnd  walkEnd = firstEnd;
  uint64       walkIdx = startIdx;

  for (; walkEnd != stopEnd; walkIdx = getIndex(walkEnd)) {
    _pathLen[walkIdx] = steps;
    steps -= 1;

    walkEnd = OG->followOverlap(walkEnd);
  }

  //  Optionally dump the whole path, flagging where the walk stopped and any cycle.

  if (logFileFlagSet(LOG_CHUNK_GRAPH)) {
    visited.clear();

    walkEnd = firstEnd;
    walkIdx = startIdx;

    writeLog("path from %d,%d length %d:",
             firstEnd.fragId(),
             (firstEnd.frag3p()) ? 3 : 5,
             _pathLen[startIdx]);

    while ((walkEnd.fragId() != 0) && (visited.count(walkEnd) == 0)) {
      visited.insert(walkEnd);

      if (walkEnd == stopEnd)
        writeLog(" LAST");

      writeLog(" %d,%d(%d)",
               walkEnd.fragId(),
               (walkEnd.frag3p()) ? 3 : 5,
               _pathLen[walkIdx]);

      walkEnd = OG->followOverlap(walkEnd);
      walkIdx = getIndex(walkEnd);
    }

    if (visited.count(walkEnd) > 0)
      writeLog(" CYCLE %d,%d(%d)",
               walkEnd.fragId(),
               (walkEnd.frag3p()) ? 3 : 5,
               _pathLen[walkIdx]);

    writeLog("\n");
  }

  //  The start must have ended up with the total length.
  if (expected != _pathLen[startIdx])
    writeLog("ERROR: lengthMax %d _pathLen[] %d\n",
             expected, _pathLen[startIdx]);
  assert(expected == _pathLen[startIdx]);

  return(_pathLen[startIdx]);
}
//  Place every read that is not yet in a unitig, using all of its overlaps.
//
//  In parallel, each unplaced read is given to placeFragUsingOverlaps().  Placements covering
//  less than 99% of the read, placements in single-read unitigs, and placements that
//  overlapConsistentWithTig() scores below 0.5 are rejected; of the rest, the one with the lowest
//  errors/aligned ratio wins.  Afterwards, serially, each read with a winning placement is added
//  to its unitig; reads without one are left unplaced and only counted.  Finally every non-NULL
//  unitig from index 1 up is sorted.
//
//  'prefix' is not used here.
//
void
placeUnplacedUsingAllOverlaps(UnitigVector &unitigs,
                              const char   *prefix) {
  uint32  fiLimit    = FI->numFragments();
  uint32  numThreads = omp_get_max_threads();
  uint32  blockSize  = (fiLimit < 100 * numThreads) ? numThreads : fiLimit / 99;

  //  Results of the parallel search, indexed by read id.  A placedTig of zero means 'not placed'.
  uint32       *placedTig = new uint32      [fiLimit + 1];
  SeqInterval  *placedPos = new SeqInterval [fiLimit + 1];

  memset(placedTig, 0, sizeof(uint32)      * (fiLimit + 1));
  memset(placedPos, 0, sizeof(SeqInterval) * (fiLimit + 1));

  //  Just some logging.  Count the number of reads we try to place.

  uint32   nToPlaceContained = 0;
  uint32   nToPlace          = 0;
  uint32   nPlacedContained  = 0;
  uint32   nPlaced           = 0;
  uint32   nFailedContained  = 0;
  uint32   nFailed           = 0;

  for (uint32 fid=1; fid<fiLimit+1; fid++) {
    if (Unitig::fragIn(fid) == 0) {
      if (OG->isContained(fid))
        nToPlaceContained++;
      else
        nToPlace++;
    }
  }

  writeLog("placeContains()-- placing %u contained and %u unplaced reads, with %d threads.\n",
           nToPlaceContained, nToPlace, numThreads);

  //  Do the placing!

#pragma omp parallel for schedule(dynamic, blockSize)
  for (uint32 fid=1; fid<fiLimit+1; fid++) {
    bool  enableLog = true;

    if (Unitig::fragIn(fid) > 0)
      continue;

    //  Place the read.

    vector<overlapPlacement>   placements;

    placeFragUsingOverlaps(unitigs, AS_MAX_ERATE, NULL, fid, placements);

    //  Search the placements for the highest expected identity placement using all overlaps in the unitig.

    uint32   b = UINT32_MAX;

    for (uint32 i=0; i<placements.size(); i++) {
      Unitig *tig = unitigs[placements[i].tigID];

      if (placements[i].fCoverage < 0.99)
        //  Ignore partially placed reads.
        continue;

      if (tig->ufpath.size() == 1)
        //  Ignore placements in singletons.
        continue;

      uint32  bgn   = (placements[i].position.bgn < placements[i].position.end) ? placements[i].position.bgn : placements[i].position.end;
      uint32  end   = (placements[i].position.bgn < placements[i].position.end) ? placements[i].position.end : placements[i].position.bgn;

      double  erate = placements[i].errors / placements[i].aligned;

      //  ufpath.size() is a size_t; it is narrowed explicitly in the log calls below so the
      //  argument matches the %6u conversion.

      if (tig->overlapConsistentWithTig(5.0, bgn, end, erate) < 0.5) {
        if ((enableLog == true) && (logFileFlagSet(LOG_PLACE_UNPLACED)))
          writeLog("frag %8u tested tig %6u (%6u reads) at %8u-%8u (cov %7.5f erate %6.4f) - HIGH ERROR\n",
                   fid, placements[i].tigID, (uint32)tig->ufpath.size(), placements[i].position.bgn, placements[i].position.end, placements[i].fCoverage, erate);
        continue;
      }

      if ((enableLog == true) && (logFileFlagSet(LOG_PLACE_UNPLACED)))
        writeLog("frag %8u tested tig %6u (%6u reads) at %8u-%8u (cov %7.5f erate %6.4f)\n",
                 fid, placements[i].tigID, (uint32)tig->ufpath.size(), placements[i].position.bgn, placements[i].position.end, placements[i].fCoverage, erate);

      if ((b == UINT32_MAX) ||
          (erate < placements[b].errors / placements[b].aligned))
        b = i;
    }

    //  If we didn't find a best, b will be invalid; set positions for adding to a new tig.
    //  If we did, save both the position it was placed at, and the tigID it was placed in.

    if (b == UINT32_MAX) {
      if ((enableLog == true) && (logFileFlagSet(LOG_PLACE_UNPLACED)))
        writeLog("frag %8u remains unplaced\n", fid);

      placedPos[fid].bgn = 0;
      placedPos[fid].end = FI->fragmentLength(fid);
    }

    else {
      if ((enableLog == true) && (logFileFlagSet(LOG_PLACE_UNPLACED)))
        writeLog("frag %8u placed tig %6u (%6u reads) at %8u-%8u (cov %7.5f erate %6.4f)\n",
                 fid, placements[b].tigID, (uint32)unitigs[placements[b].tigID]->ufpath.size(),
                 placements[b].position.bgn, placements[b].position.end,
                 placements[b].fCoverage,
                 placements[b].errors / placements[b].aligned);

      placedTig[fid] = placements[b].tigID;
      placedPos[fid] = placements[b].position;
    }
  }

  //  All reads placed, now just dump them in their correct tigs.

  for (uint32 fid=1; fid<fiLimit+1; fid++) {
    Unitig  *tig = NULL;
    ufNode   frg;

    if (Unitig::fragIn(fid) > 0)
      continue;

    //  If not placed, dump it in a new unitig.  Well, not anymore.  These reads were not placed in
    //  any tig initially, were not allowed to seed a tig, and now, could find no place to go.
    //  They're garbage.  Plus, it screws up the logging above because we don't know the new tig ID
    //  until now.

    if (placedTig[fid] == 0) {
      if (OG->isContained(fid))
        nFailedContained++;
      else
        nFailed++;

      //tig = unitigs.newUnitig(false);
    }

    //  Otherwise, it was placed somewhere, grab the tig.

    else {
      if (OG->isContained(fid))
        nPlacedContained++;
      else
        nPlaced++;

      tig = unitigs[placedTig[fid]];
    }

    //  Add it to the tig, if there is one.  Logging for this is above.

    if (tig) {
      frg.ident             = fid;
      frg.contained         = 0;
      frg.parent            = 0;
      frg.ahang             = 0;
      frg.bhang             = 0;
      frg.position          = placedPos[fid];

      tig->addFrag(frg, 0, false);
    }
  }

  //  Cleanup.

  delete [] placedPos;
  delete [] placedTig;

  writeLog("placeContains()-- Placed %u contained reads and %u unplaced reads.\n", nPlacedContained, nPlaced);
  writeLog("placeContains()-- Failed to place %u contained reads (too high error suspected) and %u unplaced reads (lack of overlaps suspected).\n", nFailedContained, nFailed);

  //  But wait!  All the tigs need to be sorted.  Well, not really _all_, but the hard ones to sort
  //  are big, and those quite likely had reads added to them, so it's really not worth the effort
  //  of tracking which ones need sorting, since the ones that don't need it are trivial to sort.

  for (uint32 ti=1; ti<unitigs.size(); ti++) {
    Unitig  *utg = unitigs[ti];

    if (utg)
      utg->sort();
  }
}
void reportUnitigs(UnitigVector &unitigs, const char *prefix, const char *name, uint64 genomeSize) { // Generate n50. Assumes unitigs have been 'classified' already. vector<uint32> unassembledLength; vector<uint32> bubbleLength; vector<uint32> repeatLength; vector<uint32> circularLength; vector<uint32> contigLength; for (uint32 ti=0; ti<unitigs.size(); ti++) { Unitig *utg = unitigs[ti]; if (utg == NULL) continue; if (utg->_isUnassembled) { unassembledLength.push_back(utg->getLength()); } else if (utg->_isBubble) { bubbleLength.push_back(utg->getLength()); } else if (utg->_isRepeat) { repeatLength.push_back(utg->getLength()); } else if (utg->_isCircular) { circularLength.push_back(utg->getLength()); } else { contigLength.push_back(utg->getLength()); } } char N[FILENAME_MAX]; sprintf(N, "%s.sizes", getLogFilePrefix()); errno = 0; FILE *F = fopen(N, "w"); if (errno == 0) { reportN50(F, unassembledLength, "UNASSEMBLED", genomeSize); reportN50(F, bubbleLength, "BUBBLE", genomeSize); reportN50(F, repeatLength, "REPEAT", genomeSize); reportN50(F, circularLength, "CIRCULAR", genomeSize); reportN50(F, contigLength, "CONTIGS", genomeSize); fclose(F); } if (logFileFlagSet(LOG_INTERMEDIATE_UNITIGS) == 0) return; // Dump to an intermediate store. char tigStorePath[FILENAME_MAX]; sprintf(tigStorePath, "%s.tigStore", getLogFilePrefix()); fprintf(stderr, "Creating intermediate tigStore '%s'\n", tigStorePath); uint32 numFragsT = 0; uint32 numFragsP = 0; uint64 utgLen = 0; // Compute average frags per partition. for (uint32 ti=0; ti<unitigs.size(); ti++) { Unitig *utg = unitigs[ti]; if (utg == NULL) continue; numFragsT += utg->ufpath.size(); if (utg->ufpath.size() > 2) utgLen += utg->getLength(); } if (utgLen < 16 * 1024 * 1024) numFragsP = numFragsT / 7; else if (utgLen < 64 * 1024 * 1024) numFragsP = numFragsT / 63; else numFragsP = numFragsT / 127; // Dump the unitigs to an intermediate store. 
setParentAndHang(unitigs); writeUnitigsToStore(unitigs, tigStorePath, tigStorePath, numFragsP, false); }
//  Classify every mate pair in the table and record the verdict in the happiness graphs.
//
//  Each entry is compared against the insert size limits of its library (mean +-
//  BADMATE_INTER_STDDEV standard deviations).  Pairs that are acceptable are counted
//  in goodGraph (or simply left alone), and loc.isGrumpy is cleared.  Pairs that are
//  not are counted in the graph for their failure mode (badExternalRev,
//  badExternalFwd, badNormal, badAnti, badOuttie, badCompressed, badStretched), keep
//  isGrumpy set, and additionally mark badFwdGraph / badRevGraph for whichever of the
//  two reads is in 'utg'.
//
//  Entries from library 0, from libraries without valid statistics, or with a read
//  at least as long as the upper insert limit are skipped untouched.
//
void
MateLocation::buildHappinessGraphs(Unitig *utg) {

  //  Entry zero is reserved (all zero) for the getById() accessor; real entries
  //  start at index one.

  assert(_table[0].mleFrgID1 == 0);
  assert(_table[0].mleFrgID2 == 0);

  for (uint32 idx=1; idx<_table.size(); idx++) {
    MateLocationEntry &loc = _table[idx];

    //  mleFrgID1 is always set.  A zero mleFrgID2 means the mate is external.

    assert(loc.mleFrgID1 != 0);

    uint32  libID = FI->libraryIID(loc.mleFrgID1);

    //  The legacy library (zero) shouldn't show up, but skip it if it does.  Also
    //  skip any library we have no usable statistics for.

    if (libID == 0)
      continue;

    if (IS->valid(libID) == false)
      continue;

    int32  badMaxInter = static_cast<int32>(IS->mean(libID) + BADMATE_INTER_STDDEV * IS->stddev(libID));
    int32  badMinInter = static_cast<int32>(IS->mean(libID) - BADMATE_INTER_STDDEV * IS->stddev(libID));

    int32  badMaxIntra = static_cast<int32>(IS->mean(libID) + BADMATE_INTRA_STDDEV * IS->stddev(libID));
    int32  badMinIntra = static_cast<int32>(IS->mean(libID) - BADMATE_INTRA_STDDEV * IS->stddev(libID));

    //  Version 1.89 used the INTER limits everywhere; do the same so results don't change.

    badMaxIntra = badMaxInter;
    badMinIntra = badMinInter;

    //  Everything below is declared before the first goto, and is signed on purpose.

    int32  dist   = 0;

    int32  matBgn = loc.mlePos2.bgn;
    int32  matEnd = loc.mlePos2.end;
    int32  matLen = MAX(matBgn, matEnd) - MIN(matBgn, matEnd);

    int32  frgBgn = loc.mlePos1.bgn;
    int32  frgEnd = loc.mlePos1.end;
    int32  frgLen = MAX(frgBgn, frgEnd) - MIN(frgBgn, frgEnd);

    //  A read longer than the insert size cannot be judged.

    if ((matLen >= MIN(badMaxInter, badMaxIntra)) ||
        (frgLen >= MIN(badMaxInter, badMaxIntra)))
      continue;

    //  Guilty until proven innocent.

    loc.isGrumpy = true;

    const bool  frgIsRev = isReverse(loc.mlePos1);

    //  Mate in a different unitig.  That is only bad if this unitig extends far
    //  enough past the read that the mate should have been found here.

    if (loc.mleUtgID1 != loc.mleUtgID2) {
      if (frgIsRev && (badMaxInter < frgBgn)) {
        incrRange(badExternalRev, -1, frgBgn - badMaxInter, frgEnd);
        incrRange(badExternalRev, -1, frgBgn, frgEnd);

        if (logFileFlagSet(LOG_HAPPINESS))
          fprintf(logFile, "buildHappinessGraph()-- unitig %d (len %d) frag %d pos %d,%d (len %d) and unitig %d (len %d) frag %d pos %d,%d (len %d) -- bad external reverse\n",
                  loc.mleUtgID1, 0, loc.mleFrgID1, frgBgn, frgEnd, frgLen,
                  loc.mleUtgID2, 0, loc.mleFrgID2, matBgn, matEnd, matLen);
        goto markBad;
      }

      if (!frgIsRev && (badMaxInter < _tigLen - frgBgn)) {
        incrRange(badExternalFwd, -1, frgEnd, frgBgn + badMaxInter);
        incrRange(badExternalFwd, -1, frgEnd, frgBgn);

        if (logFileFlagSet(LOG_HAPPINESS))
          fprintf(logFile, "buildHappinessGraph()-- unitig %d (len %d) frag %d pos %d,%d (len %d) and unitig %d (len %d) frag %d pos %d,%d (len %d) -- bad external forward\n",
                  loc.mleUtgID1, 0, loc.mleFrgID1, frgBgn, frgEnd, frgLen,
                  loc.mleUtgID2, 0, loc.mleFrgID2, matBgn, matEnd, matLen);
        goto markBad;
      }

      //  The mate would fall off the end of this unitig, so its absence says nothing.

      loc.isGrumpy = false;

      if (logFileFlagSet(LOG_HAPPINESS))
        fprintf(logFile, "buildHappinessGraph()-- unitig %d (len %d) frag %d pos %d,%d (len %d) and unitig %d (len %d) frag %d pos %d,%d (len %d) -- not bad, not enough space\n",
                loc.mleUtgID1, 0, loc.mleFrgID1, frgBgn, frgEnd, frgLen,
                loc.mleUtgID2, 0, loc.mleFrgID2, matBgn, matEnd, matLen);
      continue;
    }

    //  From here on, both reads are in this unitig.

    //  Both forward: wrong relative orientation.

    if (!frgIsRev && (isReverse(loc.mlePos2) == false)) {
      incrRange(badNormal, -1, MIN(frgBgn, matBgn), MAX(frgEnd, matEnd));

      if (logFileFlagSet(LOG_HAPPINESS))
        fprintf(logFile, "buildHappinessGraph()-- unitig %d (len %d) frag %d pos %d,%d (len %d) and unitig %d (len %d) frag %d pos %d,%d (len %d) -- bad normal\n",
                loc.mleUtgID1, 0, loc.mleFrgID1, frgBgn, frgEnd, frgLen,
                loc.mleUtgID2, 0, loc.mleFrgID2, matBgn, matEnd, matLen);
      goto markBad;
    }

    //  Both reverse: wrong relative orientation.

    if (frgIsRev && (isReverse(loc.mlePos2) == true)) {
      incrRange(badAnti, -1, MIN(frgEnd, matEnd), MAX(frgBgn, matBgn));

      if (logFileFlagSet(LOG_HAPPINESS))
        fprintf(logFile, "buildHappinessGraph()-- unitig %d (len %d) frag %d pos %d,%d (len %d) and unitig %d (len %d) frag %d pos %d,%d (len %d) -- bad anti\n",
                loc.mleUtgID1, 0, loc.mleFrgID1, frgBgn, frgEnd, frgLen,
                loc.mleUtgID2, 0, loc.mleFrgID2, matBgn, matEnd, matLen);
      goto markBad;
    }

    //  Circular unitig special case: the pair points outward, but wrapping around
    //  the ends of the unitig gives an acceptable insert size.
    //
    //      <---              --->
    //     ========unitig==========

    if ((frgIsRev) &&
        (badMinIntra <= frgBgn + _tigLen - matBgn) &&
        (frgBgn + _tigLen - matBgn <= badMaxIntra)) {
      loc.isGrumpy = false;

      if (logFileFlagSet(LOG_HAPPINESS))
        fprintf(logFile, "buildHappinessGraph()-- unitig %d (len %d) frag %d pos %d,%d (len %d) and unitig %d (len %d) frag %d pos %d,%d (len %d) -- good because circular\n",
                loc.mleUtgID1, 0, loc.mleFrgID1, frgBgn, frgEnd, frgLen,
                loc.mleUtgID2, 0, loc.mleFrgID2, matBgn, matEnd, matLen);
      continue;
    }

    //  Outward facing pair.  The reverse read lies entirely to the left of the
    //  forward read:
    //
    //     (pos1.end) <------   (pos1.bgn)
    //                          (pos2.bgn)  -------> (pos2.end)
    //
    //  The second test is the same picture with pos1 and pos2 exchanged.

    if (frgIsRev && (loc.mlePos1.end < loc.mlePos2.bgn)) {
      incrRange(badOuttie, -1, MIN(frgBgn, frgEnd), MAX(matBgn, matEnd));

      if (logFileFlagSet(LOG_HAPPINESS))
        fprintf(logFile, "buildHappinessGraph()-- unitig %d (len %d) frag %d pos %d,%d (len %d) and unitig %d (len %d) frag %d pos %d,%d (len %d) -- bad outtie (case 1)\n",
                loc.mleUtgID1, 0, loc.mleFrgID1, frgBgn, frgEnd, frgLen,
                loc.mleUtgID2, 0, loc.mleFrgID2, matBgn, matEnd, matLen);
      goto markBad;
    }

    if (!frgIsRev && (loc.mlePos2.end < loc.mlePos1.bgn)) {
      incrRange(badOuttie, -1, MIN(frgBgn, frgEnd), MAX(matBgn, matEnd));

      if (logFileFlagSet(LOG_HAPPINESS))
        fprintf(logFile, "buildHappinessGraph()-- unitig %d (len %d) frag %d pos %d,%d (len %d) and unitig %d (len %d) frag %d pos %d,%d (len %d) -- bad outtie (case 2)\n",
                loc.mleUtgID1, 0, loc.mleFrgID1, frgBgn, frgEnd, frgLen,
                loc.mleUtgID2, 0, loc.mleFrgID2, matBgn, matEnd, matLen);
      goto markBad;
    }

    //  Normal, anti and outtie are ruled out, so the pair faces inward.  Measure
    //  the insert from the left read's begin to the right read's begin.

    if (!frgIsRev)
      dist = loc.mlePos2.bgn - loc.mlePos1.bgn;   //  Read 1 on the left.
    else
      dist = loc.mlePos1.bgn - loc.mlePos2.bgn;   //  Read 1 on the right.

    assert(dist >= 0);

    if (dist < badMinIntra) {
      incrRange(badCompressed, -1, MIN(frgBgn, matBgn), MAX(frgBgn, matBgn));

      if (logFileFlagSet(LOG_HAPPINESS))
        fprintf(logFile, "buildHappinessGraph()-- unitig %d (len %d) frag %d pos %d,%d (len %d) and unitig %d (len %d) frag %d pos %d,%d (len %d) -- bad compressed\n",
                loc.mleUtgID1, 0, loc.mleFrgID1, frgBgn, frgEnd, frgLen,
                loc.mleUtgID2, 0, loc.mleFrgID2, matBgn, matEnd, matLen);
      goto markBad;
    }

    if (badMaxIntra < dist) {
      incrRange(badStretched, -1, MIN(frgBgn, matBgn), MAX(frgBgn, matBgn));

      if (logFileFlagSet(LOG_HAPPINESS))
        fprintf(logFile, "buildHappinessGraph()-- unitig %d (len %d) frag %d pos %d,%d (len %d) and unitig %d (len %d) frag %d pos %d,%d (len %d) -- bad stretched\n",
                loc.mleUtgID1, 0, loc.mleFrgID1, frgBgn, frgEnd, frgLen,
                loc.mleUtgID2, 0, loc.mleFrgID2, matBgn, matEnd, matLen);
      goto markBad;
    }

    //  Inward facing and the right size: a happy pair.

    assert(badMinIntra <= dist);
    assert(dist <= badMaxIntra);

    incrRange(goodGraph, 1, MIN(frgBgn, matBgn), MAX(frgBgn, matBgn));

    loc.isGrumpy = false;

    if (logFileFlagSet(LOG_HAPPINESS))
      fprintf(logFile, "buildHappinessGraph()-- unitig %d (len %d) frag %d pos %d,%d (len %d) and unitig %d (len %d) frag %d pos %d,%d (len %d) -- GOOD!\n",
              loc.mleUtgID1, 0, loc.mleFrgID1, frgBgn, frgEnd, frgLen,
              loc.mleUtgID2, 0, loc.mleFrgID2, matBgn, matEnd, matLen);
    continue;

  markBad:

    //  For each read of the pair that is in 'utg', mark the region from the read's
    //  3' end out to the farthest place its mate should have been.

    if (loc.mleUtgID1 == utg->id()) {
      assert(loc.mleFrgID1 != 0);

      if (frgIsRev == false) {
        assert(frgBgn < frgEnd);
        incrRange(badFwdGraph, -1, frgEnd, frgBgn + badMaxIntra);
      } else {
        assert(frgEnd < frgBgn);
        incrRange(badRevGraph, -1, frgBgn - badMaxIntra, frgEnd);
      }
    }

    if (loc.mleUtgID2 == utg->id()) {
      assert(loc.mleFrgID2 != 0);

      if (isReverse(loc.mlePos2) == false) {
        assert(matBgn < matEnd);
        incrRange(badFwdGraph, -1, matEnd, matBgn + badMaxIntra);
      } else {
        assert(matEnd < matBgn);
        incrRange(badRevGraph, -1, matBgn - badMaxIntra, matEnd);
      }
    }
  }  //  Over all MateLocationEntries in the table
}  //  buildHappinessGraphs()
// Make sure that contained fragments are in the same unitig // as their container. Due to sorting, contained fragments // can come much later in the unitig: // // ------------1 // -------------2 // --------------3 // ----4 (contained in 1, too much error keeps it out of 2 and 3) // // So, our first pass is to move contained fragments around. // void UnitigGraph::moveContains(void) { for (uint32 ti=0; ti<unitigs.size(); ti++) { Unitig *thisUnitig = unitigs[ti]; if ((thisUnitig == NULL) || (thisUnitig->ufpath.size() < 2)) continue; MateLocation positions(thisUnitig); ufNode *frags = new ufNode [thisUnitig->ufpath.size()]; uint32 fragsLen = 0; if (logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS)) fprintf(logFile, "moveContain unitig %d\n", thisUnitig->id()); for (uint32 fi=0; fi<thisUnitig->ufpath.size(); fi++) { ufNode *frg = &thisUnitig->ufpath[fi]; BestContainment *bestcont = OG->getBestContainer(frg->ident); MateLocationEntry mloc = positions.getById(frg->ident); uint32 thisFrgID = frg->ident; uint32 contFrgID = (bestcont) ? bestcont->container : 0; uint32 mateFrgID = FI->mateIID(frg->ident); uint32 thisUtgID = thisUnitig->fragIn(thisFrgID); uint32 contUtgID = thisUnitig->fragIn(contFrgID); uint32 mateUtgID = thisUnitig->fragIn(mateFrgID); // id1 != 0 -> we found the fragment in the mate happiness table // isBad -> and the mate is unhappy. // // What's id1 vs id2 in MateLocationEntry? Dunno. All I // know is that if there is no mate present, one of those // will be 0. (Similar test used above too.) // bool isMated = (mateFrgID > 0); bool isGrumpy = ((isMated) && (mloc.mleFrgID1 != 0) && (mloc.mleFrgID2 != 0) && (mloc.isGrumpy == true)); // // Figure out what to do. // bool moveToContainer = false; bool moveToSingleton = false; if ((frg->contained == 0) && (bestcont == NULL)) { // CASE 1: Not contained. Leave the fragment here. //fprintf(logFile, "case1 frag %d fragsLen %d\n", thisFrgID, fragsLen); } else if (isMated == false) { // CASE 2: Contained but not mated. 
Move to be with the // container (if the container isn't here). //fprintf(logFile, "case2 frag %d contID %d fragsLen %d\n", thisFrgID, contUtgID, fragsLen); if (thisUtgID != contUtgID) moveToContainer = true; } else if ((isGrumpy == true) && (thisUtgID == mateUtgID)) { // CASE 3: Not happy, and the frag and mate are together. // Kick out to a singleton. //fprintf(logFile, "case3 frag %d utg %d mate %d utg %d cont %d utg %d fragsLen %d\n", // thisFrgID, thisUtgID, mateFrgID, mateUtgID, contFrgID, contUtgID, fragsLen); if (thisUtgID == mateUtgID) moveToSingleton = true; } else { // This makes for some ugly code (we break the nice if else // if else structure we had going on) but the next two cases // need to know if there is an overlap to the rest of the // unitig. bool hasOverlap = (thisUtgID == contUtgID); bool allContained = false; if (hasOverlap == false) { if (fragsLen == 0) { // The first fragment. Check fragments after to see if // there is an overlap (note only frags with an overlap // in the layout are tested). In rare cases, we ejected // the container, and left a containee with no overlap to // fragments remaining. // // Note that this checks if there is an overlap to the // very first non-contained (aka dovetail) fragment ONLY. // If there isn't an overlap to the first non-contained // fragment, then that fragment will likely NOT align // correctly. uint32 ft = fi + 1; #warning 2x BUGS IN COMPARISON HERE // Skip all the contains. while ((ft < thisUnitig->ufpath.size()) && (OG->isContained(thisUnitig->ufpath[ft].ident) == true) && (MAX(frg->position.bgn, frg->position.end) < MIN(thisUnitig->ufpath[ft].position.bgn, thisUnitig->ufpath[ft].position.end))) ft++; // If the frag is not contained (we could be the // container), and overlaps in the layout, see if there // is a real overlap. 
if ((ft < thisUnitig->ufpath.size()) && (OG->isContained(thisUnitig->ufpath[ft].ident) == false) && (MAX(frg->position.bgn, frg->position.end) < MIN(thisUnitig->ufpath[ft].position.bgn, thisUnitig->ufpath[ft].position.end))) hasOverlap = OG->containHaveEdgeTo(thisFrgID, thisUnitig->ufpath[ft].ident); } else { // Not the first fragment, search for an overlap to an // already placed frag. uint32 ft = fi; do { ft--; // OK to overlap to a contained frag; he could be our // container. hasOverlap = OG->containHaveEdgeTo(thisFrgID, thisUnitig->ufpath[ft].ident); // Stop if we found an overlap, or we just checked the // first frag in the unitig, or we no longer overlap in // the layout. } while ((hasOverlap == false) && (ft > 0) && (MIN(frg->position.bgn, frg->position.end) < MAX(thisUnitig->ufpath[ft].position.bgn, thisUnitig->ufpath[ft].position.end))); } } // end of hasOverlap // An unbelievabe special case. When the unitig is just a // single container fragment (and any contained frags under // it) rule 4 breaks. The first fragment has no overlap (all // later reads are contained) and so we want to eject it to a // new unitig. Since there are multiple fragments in this // unitig, the ejection occurs. Later, all the contains get // moved to the new unitig. And we repeat. To prevent, we // abort the ejection if the unitig is all contained in one // fragment. // if (fragsLen == 0) { allContained = true; for (uint32 ft = fi + 1; ((allContained == true) && (ft < thisUnitig->ufpath.size())); ft++) allContained = OG->isContained(thisUnitig->ufpath[ft].ident); } if (isGrumpy == true) { // CASE 4: Not happy and not with the mate. This one is a // bit of a decision. // // If an overlap exists to the rest of the unitig, we'll // leave it here. We'll also leave it here if it is the // rest of the unitig is all contained in this fragment. // // If no overlap, and the mate and container are in the // same unitig, we'll just eject. 
That also implies the // other unitig is somewhat large, at least as big as the // insert size. // // Otherwise, we'll move to the container and cross our // fingers we place it correctly. The alternative is to // eject, and hope that we didn't also eject the mate to a // singleton. //fprintf(logFile, "case4 frag %d utg %d mate %d utg %d cont %d utg %d fragsLen %d\n", // thisFrgID, thisUtgID, mateFrgID, mateUtgID, contFrgID, contUtgID, fragsLen); if ((hasOverlap == false) && (allContained == false)) if (mateUtgID == contUtgID) moveToSingleton = true; else moveToContainer = true; } else { // CASE 5: Happy! If with container, or an overlap exists to // some earlier fragment, leave it here. Otherwise, eject it // to a singleton. The fragment is ejected instead of moved // to be with its container since we don't know which is // correct - the mate or the overlap. // // If not happy, we've already made sure that the mate is not // here (that was case 3). //fprintf(logFile, "case5 frag %d utg %d mate %d utg %d cont %d utg %d fragsLen %d\n", // thisFrgID, thisUtgID, mateFrgID, mateUtgID, contFrgID, contUtgID, fragsLen); // If no overlap (so not with container or no overlap to // other frags) eject. if ((hasOverlap == false) && (allContained == false)) moveToSingleton = true; } } // End of cases // // Do it. // if (moveToContainer == true) { // Move the fragment to be with its container. 
Unitig *thatUnitig = unitigs[contUtgID]; ufNode containee = *frg; assert(thatUnitig->id() == contUtgID); // Nuke the fragment in the current list frg->ident = 999999999; frg->contained = 999999999; frg->position.bgn = 0; frg->position.end = 0; assert(thatUnitig->id() == contUtgID); if (logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS)) fprintf(logFile, "Moving contained fragment %d from unitig %d to be with its container %d in unitig %d\n", thisFrgID, thisUtgID, contFrgID, contUtgID); assert(bestcont->container == contFrgID); thatUnitig->addContainedFrag(thisFrgID, bestcont, logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS)); assert(thatUnitig->id() == Unitig::fragIn(thisFrgID)); } else if ((moveToSingleton == true) && (thisUnitig->getNumFrags() != 1)) { // Eject the fragment to a singleton (unless we ARE the singleton) Unitig *singUnitig = new Unitig(logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS)); ufNode containee = *frg; // Nuke the fragment in the current list frg->ident = 999999999; frg->contained = 999999999; frg->position.bgn = 0; frg->position.end = 0; if (logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS)) fprintf(logFile, "Ejecting unhappy contained fragment %d from unitig %d into new unitig %d\n", thisFrgID, thisUtgID, singUnitig->id()); containee.contained = 0; singUnitig->addFrag(containee, -MIN(containee.position.bgn, containee.position.end), logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS)); unitigs.push_back(singUnitig); thisUnitig = unitigs[ti]; // Reset the pointer; unitigs might be reallocated } else { // Leave fragment here. Copy the fragment to the list -- if // we need to rebuild the unitig (because fragments were // removed), the list is used, otherwise, we have already // made the changes needed. // // Also, very important, update our containment mark. If our // container was moved, but we stayed put because of a happy // mate, we're still marked as being contained. 
Rather than // put this check in all the places where we stay put in the // above if-else-else-else, it's here. if ((frg->contained) && (thisUtgID != contUtgID)) frg->contained = 0; frags[fragsLen] = *frg; fragsLen++; } } // over all frags // Now, rebuild this unitig if we made changes. if (fragsLen != thisUnitig->ufpath.size()) { if (logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS)) fprintf(logFile, "Rebuild unitig %d after removing contained fragments.\n", thisUnitig->id()); thisUnitig->ufpath.clear(); // Occasionally, we move all fragments out of the original unitig. Might be worth checking // if that makes sense!! // #warning EMPTIED OUT A UNITIG if (fragsLen > 0) { // No need to resort. Offsets only need adjustment if the first fragment is thrown out. // If not, splitOffset will be zero. // int splitOffset = -MIN(frags[0].position.bgn, frags[0].position.end); // This is where we clean up from the splitting not dealing with contained fragments -- we // force the first frag to be uncontained. // frags[0].contained = 0; for (uint32 i=0; i<fragsLen; i++) thisUnitig->addFrag(frags[i], splitOffset, logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS)); } } delete [] frags; frags = NULL; } // Over all unitigs }
// For every unitig, report the best overlaps contained in the // unitig, and all overlaps contained in the unitig. void reportOverlapsUsed(UnitigVector &unitigs, const char *prefix, const char *name) { if (logFileFlagSet(LOG_OVERLAPS_USED) == 0) return; char ovlPath[FILENAME_MAX]; sprintf(ovlPath, "%s.%03u.%s.overlaps", prefix, logFileOrder, name); FILE *F = fopen(ovlPath, "w"); if (F == NULL) return; for (uint32 ti=0; ti<unitigs.size(); ti++) { Unitig *utg = unitigs[ti]; if (utg == NULL) continue; for (uint32 fi=0; fi<utg->ufpath.size(); fi++) { ufNode *frg = &utg->ufpath[fi]; // Where is our best overlap? Contained or dovetail? BestEdgeOverlap *bestedge5 = OG->getBestEdgeOverlap(frg->ident, false); BestEdgeOverlap *bestedge3 = OG->getBestEdgeOverlap(frg->ident, true); uint32 bestident5 = 0; uint32 bestident3 = 0; if (bestedge5) bestident5 = bestedge5->fragId(); if (bestedge3) bestident3 = bestedge3->fragId(); // Now search ahead, reporting any overlap to any fragment. // for (uint32 oi=fi+1; oi<utg->ufpath.size(); oi++) { ufNode *ooo = &utg->ufpath[oi]; int frgbgn = MIN(frg->position.bgn, frg->position.end); int frgend = MAX(frg->position.bgn, frg->position.end); int ooobgn = MIN(ooo->position.bgn, ooo->position.end); int oooend = MAX(ooo->position.bgn, ooo->position.end); if ((frgbgn <= ooobgn) && (ooobgn + 40 < frgend)) { BestContainment *bestcont = OG->getBestContainer(ooo->ident); uint32 bestident = 0; if (bestcont->isContained) bestident = bestcont->container; bool isBest = ((frg->ident == bestident) || (ooo->ident == bestident5) || (ooo->ident == bestident3)); fprintf(F, "%d\t%d%s\n", frg->ident, ooo->ident, (isBest) ? ((bestident) ? "\tbc" : "\tbe") : ""); } if (frgend < ooobgn) break; } } } fclose(F); }