//  Allocate a new (empty) Unitig, assign it the next sequential ID, and
//  register it in the block-allocated lookup table.
//
//  Thread-safe: the ID counter and block table are shared, so the whole
//  update is serialized with an OpenMP critical section.
//
//  Returns the new unitig; ownership stays with this UnitigVector.
//
Unitig *
UnitigVector::newUnitig(bool verbose) {
  Unitig *u = new Unitig();

#pragma omp critical
  {
    u->_id = _totalUnitigs++;

    if (verbose)
      writeLog("Creating Unitig %d\n", u->_id);

    //  Start a new block of pointers if the current one is full.
    if (_blockNext >= _blockSize) {
      assert(_numBlocks < _maxBlocks);

      _blocks[_numBlocks] = new Unitig * [_blockSize];

      //  Elements are 'Unitig *'; the original cleared sizeof(Unitig **)
      //  bytes per slot, which happens to be the same size but is the
      //  wrong type expression.
      memset(_blocks[_numBlocks], 0, sizeof(Unitig *) * _blockSize);

      _numBlocks++;
      _blockNext = 0;
    }

    _blocks[_numBlocks-1][_blockNext++] = u;

    //  The rest are just sanity checks: the ID must map back to the slot
    //  we just filled, and operator[] must find the same object.

    assert((u->id() / _blockSize) == (_numBlocks - 1));
    assert((u->id() % _blockSize) == (_blockNext - 1));

    assert(operator[](u->id()) == u);
  }

  return(u);
};
//  Place contained fragments into the unitig that holds their best
//  container.  Iterates until a fixed point: placing one containee can
//  make its own containees placeable on the next pass.
//
//  Fragments whose container chain never lands in a unitig ("zombies")
//  are abandoned with a log message.
//
void
placeContainsUsingBestOverlaps(UnitigVector &unitigs) {
  uint32   fragsPlaced  = 1;
  uint32   fragsPending = 0;

  logFileFlags &= ~LOG_PLACE_FRAG;

  while (fragsPlaced > 0) {
    fragsPlaced  = 0;
    fragsPending = 0;

    writeLog("==> PLACING CONTAINED FRAGMENTS\n");

    for (uint32 fid=1; fid<FI->numFragments()+1; fid++) {
      BestContainment *bestcont = OG->getBestContainer(fid);
      Unitig          *utg;

      //  Guard the pointer before use; original dereferenced without a
      //  NULL check.
      if ((bestcont == NULL) || (bestcont->isContained == false))
        //  Not a contained fragment.
        continue;

      if (Unitig::fragIn(fid) != 0)
        //  Containee already placed.
        continue;

      if (Unitig::fragIn(bestcont->container) == 0) {
        //  Container not placed (yet); try again next pass.
        fragsPending++;
        continue;
      }

      utg = unitigs[Unitig::fragIn(bestcont->container)];
      utg->addContainedFrag(fid, bestcont, logFileFlagSet(LOG_INITIAL_CONTAINED_PLACEMENT));

      if (utg->id() != Unitig::fragIn(fid))
        //  Bug fix: report the unitig we tried to add to; the original
        //  passed bestcont->container, a fragment ID, as "unitig %d".
        writeLog("placeContainsUsingBestOverlaps()-- FAILED to add frag %d to unitig %d.\n", fid, utg->id());
      assert(utg->id() == Unitig::fragIn(fid));

      fragsPlaced++;
    }

    writeLog("==> PLACING CONTAINED FRAGMENTS - placed %d fragments; still need to place %d\n",
             fragsPlaced, fragsPending);

    //  No progress but work remaining: the pending containers can never
    //  be placed.  Give up (the while condition exits on fragsPlaced==0).
    if ((fragsPlaced == 0) && (fragsPending > 0)) {
      writeLog("Stopping contained fragment placement due to zombies.\n");
      fragsPlaced  = 0;
      fragsPending = 0;
    }
  }

  //  Containees were appended out of coordinate order; restore sorted
  //  order in every unitig.
  for (uint32 ti=1; ti<unitigs.size(); ti++) {
    Unitig *utg = unitigs[ti];

    if (utg)
      utg->sort();
  }
}
//  Sanity check: every non-deleted read must be placed in exactly one
//  unitig, and every placement must reference a valid read.  Aborts via
//  assert on any violation.
//
void
checkUnitigMembership(UnitigVector &unitigs) {
  uint32 *inUnitig = new uint32 [FI->numFragments()+1];
  uint32  noUnitig = 0xffffffff;   //  Sentinel: "not placed anywhere".

  //  All reads start off not placed in a unitig.
  for (uint32 i=0; i<FI->numFragments()+1; i++)
    inUnitig[i] = noUnitig;

  //  Over all unitigs, remember where each read is.
  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig *tig = unitigs[ti];

    if (tig == NULL)
      continue;

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode *frg = &tig->ufpath[fi];

      //  Fixed format specifiers: fi is uint32, use %u (was %d).
      if (frg->ident > FI->numFragments())
        fprintf(stderr, "tig %u ufpath[%u] ident %u more than number of reads %u\n",
                tig->id(), fi, frg->ident, FI->numFragments());

      if (inUnitig[frg->ident] != noUnitig)
        fprintf(stderr, "tig %u ufpath[%u] ident %u placed multiple times\n",
                tig->id(), fi, frg->ident);

      assert(frg->ident <= FI->numFragments());   //  Can't be out of range.
      assert(inUnitig[frg->ident] == noUnitig);   //  Read must be not placed yet.

      inUnitig[frg->ident] = ti;
    }
  }

  //  Find any read not placed in a unitig.
  for (uint32 i=0; i<FI->numFragments()+1; i++) {
    if (FI->fragmentLength(i) == 0)
      //  Deleted read; allowed to be unplaced.
      continue;

    assert(inUnitig[i] != 0);         //  There shouldn't be a unitig 0.
    assert(inUnitig[i] != noUnitig);  //  The read should be in a unitig.
  }

  delete [] inUnitig;
}
//  Create a fresh unitig containing the first splitFragsLen entries of
//  splitFrags, with positions shifted so the layout begins at zero.
//  The first fragment is forced to be uncontained.
//
static
void
makeNewUnitig(UnitigVector &unitigs,
              uint32        splitFragsLen,
              ufNode       *splitFrags) {
  Unitig *newtig = unitigs.newUnitig(false);

  if (logFileFlagSet(LOG_MATE_SPLIT_DISCONTINUOUS))
    writeLog("splitDiscontinuous()-- new tig "F_U32" with "F_U32" fragments (starting at frag "F_U32").\n",
             newtig->id(), splitFragsLen, splitFrags[0].ident);

  //  Shift so the leftmost coordinate of the first fragment is zero.
  int offset = -MIN(splitFrags[0].position.bgn, splitFrags[0].position.end);

  //  This should already be true, but we force it still.
  splitFrags[0].contained = 0;

  for (uint32 fi=0; fi<splitFragsLen; fi++)
    newtig->addFrag(splitFrags[fi], offset, false);  //logFileFlagSet(LOG_MATE_SPLIT_DISCONTINUOUS));
}
//  For every unitig, report the best overlaps contained in the
//  unitig, and all overlaps contained in the unitig.
//
//  Wow, this is ancient.
//
//  NOTE(review): everything except opening/closing the output file is
//  compiled out with '#if 0'.  As written, this only creates an empty
//  '<fileprefix>.unused.ovl' file; 'unitigs' is unused.
//
void
writeOverlapsUsed(UnitigVector &unitigs,
                  char         *fileprefix) {
  char         filename[FILENAME_MAX] = {0};

#if 0
  GenericMesg  pmesg;
  OverlapMesg  omesg;
#endif

  sprintf(filename, "%s.unused.ovl", fileprefix);

  FILE *file = fopen(filename, "w");
  assert(file != NULL);

#if 0
  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig  *utg = unitigs[ti];

    if (utg == NULL)
      continue;

    for (uint32 fi=0; fi<utg->ufpath.size(); fi++) {
      ufNode  *frg = &utg->ufpath[fi];

      //  Where is our best overlap?  Contained or dovetail?

      BestEdgeOverlap *bestedge5 = OG->getBestEdgeOverlap(frg->ident, false);
      BestEdgeOverlap *bestedge3 = OG->getBestEdgeOverlap(frg->ident, true);

      int  bestident5 = 0;
      int  bestident3 = 0;

      if (bestedge5) {
        bestident5 = bestedge5->fragId();

        //  Only report edges that leave this unitig.
        if ((bestident5 > 0) && (utg->fragIn(bestident5) != utg->id())) {
          omesg.aifrag          = frg->ident;
          omesg.bifrag          = bestident5;
          omesg.ahg             = bestedge5->ahang();
          omesg.bhg             = bestedge5->bhang();
          omesg.orientation.setIsUnknown();
          omesg.overlap_type    = AS_DOVETAIL;
          omesg.quality         = 0.0;
          omesg.min_offset      = 0;
          omesg.max_offset      = 0;
          omesg.polymorph_ct    = 0;
          omesg.alignment_trace = NULL;
#ifdef AS_MSG_USE_OVL_DELTA
          omesg.alignment_delta = NULL;
#endif

          //  This overlap is off of the 5' end of this fragment.
          if (bestedge5->frag3p() == false)
            omesg.orientation.setIsOuttie();
          if (bestedge5->frag3p() == true)
            omesg.orientation.setIsAnti();

          pmesg.t = MESG_OVL;
          pmesg.m = &omesg;

          WriteProtoMesg_AS(file, &pmesg);
        }
      }

      if (bestedge3) {
        bestident3 = bestedge3->fragId();

        if ((bestident3 > 0) && (utg->fragIn(bestident3) != utg->id())) {
          omesg.aifrag          = frg->ident;
          omesg.bifrag          = bestident3;
          omesg.ahg             = bestedge3->ahang();
          omesg.bhg             = bestedge3->bhang();
          omesg.orientation.setIsUnknown();
          omesg.overlap_type    = AS_DOVETAIL;
          omesg.quality         = 0.0;
          omesg.min_offset      = 0;
          omesg.max_offset      = 0;
          omesg.polymorph_ct    = 0;
          omesg.alignment_trace = NULL;
#ifdef AS_MSG_USE_OVL_DELTA
          omesg.alignment_delta = NULL;
#endif

          //  This overlap is off of the 3' end of this fragment.
          if (bestedge3->frag3p() == false)
            omesg.orientation.setIsNormal();
          if (bestedge3->frag3p() == true)
            omesg.orientation.setIsInnie();

          pmesg.t = MESG_OVL;
          pmesg.m = &omesg;

          WriteProtoMesg_AS(file, &pmesg);
        }
      }
    }
  }
#endif

  fclose(file);
}
//  After splitting and ejecting some contains, check for discontinuous unitigs.
//
//  A unitig is "discontinuous" if consecutive fragments in its layout fail to
//  overlap by at least minOverlap bases.  Each maximal contiguous run of
//  fragments becomes its own unitig; a lone unmated contained fragment is
//  instead moved back to its container's unitig.
//
void
splitDiscontinuousUnitigs(UnitigVector &unitigs, uint32 minOverlap) {

  writeLog("==> SPLIT DISCONTINUOUS\n");

  uint32                numTested  = 0;
  uint32                numSplit   = 0;
  uint32                numCreated = 0;

  uint32                splitFragsLen = 0;
  uint32                splitFragsMax = 0;
  ufNode               *splitFrags    = NULL;

  //  First pass: normalize coordinates and find the largest unitig.
  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig  *tig = unitigs[ti];

    if ((tig == NULL) || (tig->ufpath.size() < 2))
      continue;

    //  Unitig must be sorted.  Someone upstream is screwing this up.
    tig->sort();

    //  We'll want to build an array of new fragments to split out.  This can be up
    //  to the size of the largest unitig.
    splitFragsMax = MAX(splitFragsMax, tig->ufpath.size());

    //  Check that the unitig starts at position zero.  Not critical for the next loop, but
    //  needs to be done sometime.
    int32   minPos = MIN(tig->ufpath[0].position.bgn, tig->ufpath[0].position.end);

    if (minPos == 0)
      continue;

    writeLog("splitDiscontinuous()-- tig "F_U32" offset messed up; reset by "F_S32".\n", tig->id(), minPos);

    //  Shift every fragment so the layout begins at zero.
    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode  *frg = &tig->ufpath[fi];

      frg->position.bgn -= minPos;
      frg->position.end -= minPos;
    }
  }

  //  Scratch array for the fragments of the piece currently being built.
  splitFrags = new ufNode [splitFragsMax];

  //  Now, finally, we can check for gaps in unitigs.

  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig  *tig = unitigs[ti];

    if ((tig == NULL) || (tig->ufpath.size() < 2))
      continue;

    //  We don't expect many unitigs to be broken, so we'll do a first quick pass to just
    //  test if it is.

    int32  maxEnd   = MAX(tig->ufpath[0].position.bgn, tig->ufpath[0].position.end);
    bool   isBroken = false;

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode  *frg = &tig->ufpath[fi];

      int32    bgn = MIN(frg->position.bgn, frg->position.end);
      int32    end = MAX(frg->position.bgn, frg->position.end);

      //  Gap: the fragment starts past the thick-overlap window.
      if (bgn > maxEnd - minOverlap) {
        isBroken = true;
        break;
      }

      maxEnd = MAX(maxEnd, end);
    }

    numTested++;

    if (isBroken == false)
      continue;

    numSplit++;

    //  Dang, busted unitig.  Fix it up.

    splitFragsLen = 0;
    maxEnd        = 0;

    if (logFileFlagSet(LOG_MATE_SPLIT_DISCONTINUOUS))
      writeLog("splitDiscontinuous()-- discontinuous tig "F_U32" with "F_SIZE_T" fragments broken into:\n",
               tig->id(), tig->ufpath.size());

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode  *frg = &tig->ufpath[fi];

      int32    bgn = MIN(frg->position.bgn, frg->position.end);
      int32    end = MAX(frg->position.bgn, frg->position.end);

      //  Good thick overlap exists to this fragment, save it.

      if (bgn <= maxEnd - minOverlap) {
        assert(splitFragsLen < splitFragsMax);
        splitFrags[splitFragsLen++] = *frg;
        maxEnd = MAX(maxEnd, end);
        continue;
      }

      //  No thick overlap found.  We need to break right here before the current fragment.

      //  If there is exactly one fragment, and it's contained, and it's not mated, move it to the
      //  container.  (This has a small positive benefit over just making every read a singleton).
      //
      if ((splitFragsLen == 1) &&
          (FI->mateIID(splitFrags[0].ident) == 0) &&
          (splitFrags[0].contained != 0)) {
        Unitig  *dangler  = unitigs[tig->fragIn(splitFrags[0].contained)];

        //  If the parent isn't in a unitig, we must have shattered the repeat unitig it was in.
        //  Do the same here.

        if (dangler == NULL) {
          if (logFileFlagSet(LOG_MATE_SPLIT_DISCONTINUOUS))
            writeLog("splitDiscontinuous()-- singleton frag "F_U32" shattered.\n",
                     splitFrags[0].ident);
          Unitig::removeFrag(splitFrags[0].ident);

        } else {
          assert(dangler->id() == tig->fragIn(splitFrags[0].contained));

          if (logFileFlagSet(LOG_MATE_SPLIT_DISCONTINUOUS))
            writeLog("splitDiscontinuous()-- old tig "F_U32" with "F_SIZE_T" fragments (contained frag "F_U32" moved here).\n",
                     dangler->id(), dangler->ufpath.size() + 1, splitFrags[0].ident);

          BestContainment  *bestcont = OG->getBestContainer(splitFrags[0].ident);

          assert(bestcont->isContained == true);

          dangler->addContainedFrag(splitFrags[0].ident, bestcont, false);
          dangler->bubbleSortLastFrag();

          assert(dangler->id() == Unitig::fragIn(splitFrags[0].ident));
        }
      }

      //  Otherwise, make an entirely new unitig for these fragments.
      else {
        numCreated++;
        makeNewUnitig(unitigs, splitFragsLen, splitFrags);
        //  Re-read the pointer; newUnitig() may have reallocated the vector.
        tig = unitigs[ti];
      }

      //  Done with the split, save the current fragment.  This resets everything.

      splitFragsLen = 0;
      splitFrags[splitFragsLen++] = *frg;

      maxEnd = end;
    }

    //  If we did any splitting, then the length of the frags in splitFrags will be less than the length
    //  of the path in the current unitig.  Make a final new unitig for the remaining fragments.
    //
    if (splitFragsLen != tig->ufpath.size()) {
      numCreated++;
      makeNewUnitig(unitigs, splitFragsLen, splitFrags);

      //  Original unitig is fully redistributed; destroy it.
      delete unitigs[ti];
      unitigs[ti] = NULL;
    }
  }

  writeLog("splitDiscontinuous()-- Tested "F_U32" unitigs, split "F_U32" into "F_U32" new unitigs.\n",
           numTested, numSplit, numCreated);

  delete [] splitFrags;
}
void writeUnitigsToStore(UnitigVector &unitigs, char *fileprefix, char *tigStorePath, uint32 frg_count_target, bool isFinal) { uint32 utg_count = 0; uint32 frg_count = 0; uint32 prt_count = 1; char filename[FILENAME_MAX] = {0}; uint32 *partmap = new uint32 [unitigs.size()]; // This code closely follows that in AS_CGB_unitigger.c::output_the_chunks() if (isFinal) checkUnitigMembership(unitigs); // Open up the initial output file sprintf(filename, "%s.iidmap", fileprefix); FILE *iidm = fopen(filename, "w"); assert(NULL != iidm); sprintf(filename, "%s.partitioning", fileprefix); FILE *part = fopen(filename, "w"); assert(NULL != part); sprintf(filename, "%s.partitioningInfo", fileprefix); FILE *pari = fopen(filename, "w"); assert(NULL != pari); // Step through all the unitigs once to build the partition mapping and IID mapping. tgStore *tigStore = new tgStore(tigStorePath); tgTig *tig = new tgTig; for (uint32 tigID=0, ti=0; ti<unitigs.size(); ti++) { Unitig *utg = unitigs[ti]; if ((utg == NULL) || (utg->getNumFrags() == 0)) continue; assert(utg->getLength() > 0); // Convert the bogart tig to a tgTig and save to the store. unitigToTig(tig, (isFinal) ? tigID : ti, utg); tigID++; tigStore->insertTig(tig, false); // Increment the partition if the current one is too large. if ((frg_count + utg->getNumFrags() >= frg_count_target) && (frg_count > 0)) { fprintf(pari, "Partition %d has %d unitigs and %d fragments.\n", prt_count, utg_count, frg_count); prt_count++; utg_count = 0; frg_count = 0; } // Note that the tig is included in this partition. utg_count += 1; frg_count += utg->getNumFrags(); // Map the tig to a partition, and log both the tig-to-partition map and the partition-to-read map. 
fprintf(iidm, "bogart "F_U32" -> tig "F_U32" (in partition "F_U32" with "F_U32" frags)\n", utg->id(), utg->tigID(), prt_count, utg->getNumFrags()); for (uint32 fragIdx=0; fragIdx<utg->getNumFrags(); fragIdx++) fprintf(part, "%d\t%d\n", prt_count, utg->ufpath[fragIdx].ident); } fprintf(pari, "Partition %d has %d unitigs and %d fragments.\n", // Don't forget to log the last partition! prt_count, utg_count, frg_count); fclose(pari); fclose(part); fclose(iidm); delete tig; delete tigStore; }
//  Examine the first (few?) fragments of a unitig, evaluate if they indicate a join should be made.
//
//  Looks at fragment 'idx' counted in from one end of unitig 'fr' (the start
//  if frFirstEnd, else the end), follows that read's best edge off the
//  unitig, and decides whether appending the target unitig is worthwhile.
//  On success a joinEntry is appended to 'joins' and true is returned.
//
static
bool
joinUnitigs_examineEnd(UnitigVector      &unitigs,
                       Unitig            *fr,
                       uint32             idx,
                       bool               frFirstEnd,
                       vector<joinEntry> &joins) {
  //  Index from the appropriate end of the path.
  uint32     frgIdx  = (frFirstEnd) ? (idx) : (fr->ufpath.size() - 1 - idx);
  ufNode    *frg     = &fr->ufpath[frgIdx];
  bool       frgRev  = (frg->position.end < frg->position.bgn);

  //  Grab the best edge for this end frag.  The last arg requests the 3' end if true.
  //
  //  If we're looking at the first read, we want to get:
  //    5' - if the frag is forward
  //    3' - if the frag is reverse  (frgRev == true)
  //
  //  If we're looking at the last read, we want to get:
  //    5' - if the frag is reverse
  //    3' - if the frag is forward  (frgRev == false)
  //
  BestEdgeOverlap *bestEdge = OG->getBestEdgeOverlap(frg->ident, (frgRev == frFirstEnd));

  uint32     tgtId  = bestEdge->fragId();
  bool       tgt3p  = bestEdge->frag3p();

  if (tgtId == 0)
    //  No best edge?  Skip it.
    return(false);

  //  Grab the unitig for that best edge.

  uint32     toID = fr->fragIn(tgtId);
  Unitig    *to   = unitigs[toID];
  //  NOTE(review): 'to' is dereferenced without a NULL check; presumably
  //  a best edge always points into a live unitig here -- confirm.

  if (to->ufpath.size() == 1)
    //  Joining to something teeny?  Don't bother checking further.
    return(false);

  if (to->id() == fr->id())
    //  Join to myself?  Nope.
    return(false);

  //  Grab the read we have an edge to, and compute the overlapping length and left over length.

  ufNode    *tgt    = &to->ufpath[to->pathPosition(tgtId)];
  bool       tgtRev = (tgt->position.end < tgt->position.bgn);

  //  If tgt3p (we overlap to the 3' end) is the same as tgtRev (read is reverse) then the unitig is oriented
  //  correctly.  Otherwise, positions need to be reverse-complemented.

  bool       toFlip = false;

  if ((frFirstEnd == true) && (tgt3p == false) && (tgtRev == false))
    //  source read is at the start, overlap to 5' and the read is forward, need to flip the target unitig
    toFlip = true;

  if ((frFirstEnd == true) && (tgt3p == true) && (tgtRev == true))
    //  source read is at the start, overlap to 3' and the read is reverse, need to flip the target unitig
    toFlip = true;

  if ((frFirstEnd == false) && (tgt3p == false) && (tgtRev == true))
    //  source read is at the end, overlap to 5' and the read is reverse, need to flip the target unitig
    toFlip = true;

  if ((frFirstEnd == false) && (tgt3p == true) && (tgtRev == false))
    //  source read is at the end, overlap to 3' and the read is forward, need to flip the target unitig
    toFlip = true;

  uint32     toMin = MIN(tgt->position.bgn, tgt->position.end);
  uint32     toMax = MAX(tgt->position.bgn, tgt->position.end);
  uint32     toLen = to->getLength();
  uint32     frLen = fr->getLength();

  //  Mirror the target coordinates if the target unitig will be flipped.
  if (toFlip) {
    toMin = toLen - MAX(tgt->position.bgn, tgt->position.end);
    toMax = toLen - MIN(tgt->position.bgn, tgt->position.end);
  }

  assert(toMin < toMax);

  //  Our two unitigs are of length frLen and toLen.  We are appending some portion of 'to' onto
  //  'fr', and 'discarding' the rest.  If the 'discarded' piece is larger than the 'fr' unitig, we
  //  don't want to do the join.
  //
  //  We err on the side of the discarded piece.

  uint32     joinLen = 0;
  uint32     discLen = 0;

  if (frFirstEnd == true) {
    joinLen = toMin + frLen;          //  Prepend the start of 'to' onto 'fr'.
    discLen = toLen - toMin;

  } else {
    joinLen = frLen + toLen - toMax;  //  Append the end of 'to' onto 'fr'.
    discLen = toMax;
  }

  //  If the discard is bigger than us, we do damage by joining.

  if (discLen > frLen)
    return(false);

  //  The joined should be much larger and the discarded much smaller.

  uint32     maxLen = MAX(frLen, toLen);
  uint32     minLen = MIN(frLen, toLen);

  double     joinChange = (double)joinLen / maxLen;
  double     discChange = (double)discLen / minLen;

  bool       isBad = false;

  if ((joinChange < 1.10) ||
      (0.75     < discChange))
    //  Bad if we didn't really change sizes.
    isBad = true;

  if ((1.0 < joinChange) &&
      (discChange < 0.5))
    //  But good if discard is tiny.  This occurs if we merge a small with a big.  The join change
    //  is somewhat small (1.05 say) yet most of the smaller unitig is used.
    isBad = false;

  if (isBad) {
    writeLog("joinUnitigs_examineEnd()-- join unitig %6u (%7ubp) frag %6u %s <-> unitig %6u (%7ubp) frag %6u %s <-> length %5.2f %7u and %5.2f %7u BAD\n",
             fr->id(), fr->getLength(),
             frg->ident, (frgRev) ? "rev" : "fwd",
             to->id(), to->getLength(),
             tgt->ident, (tgtRev) ? "rev" : "fwd",
             joinChange, joinLen,
             discChange, discLen);
    return(false);
  }

  //  OK, join.

  writeLog("joinUnitigs_examineEnd()-- join unitig %6u (%7ubp) frag %6u %s <-> unitig %6u (%7ubp) frag %6u %s <-> length %5.2f %7u and %5.2f %7u\n",
           fr->id(), fr->getLength(),
           frg->ident, (frgRev) ? "rev" : "fwd",
           to->id(), to->getLength(),
           tgt->ident, (tgtRev) ? "rev" : "fwd",
           joinChange, joinLen,
           discChange, discLen);

  joins.push_back(joinEntry(frg->ident, frFirstEnd, tgt->ident, toFlip, joinLen));

  return(true);
}
//  Make sure that contained fragments are in the same unitig
//  as their container.  Due to sorting, contained fragments
//  can come much later in the unitig:
//
//  ------------1
//    -------------2
//       --------------3
//         ----4 (contained in 1, too much error keeps it out of 2 and 3)
//
//  So, our first pass is to move contained fragments around.
//
//  Five cases decide the fate of each fragment (leave in place, move to
//  its container's unitig, or eject to a singleton); at the end any
//  unitig that lost fragments is rebuilt from the surviving list.
//
void UnitigGraph::moveContains(void) {

  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig *thisUnitig = unitigs[ti];

    if ((thisUnitig == NULL) || (thisUnitig->ufpath.size() < 2))
      continue;

    //  Mate-happiness evidence for every read in this unitig.
    MateLocation positions(thisUnitig);

    //  Surviving fragments are copied here; used to rebuild the unitig
    //  only if something was removed.
    ufNode *frags    = new ufNode [thisUnitig->ufpath.size()];
    uint32  fragsLen = 0;

    if (logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS))
      fprintf(logFile, "moveContain unitig %d\n", thisUnitig->id());

    for (uint32 fi=0; fi<thisUnitig->ufpath.size(); fi++) {
      ufNode           *frg      = &thisUnitig->ufpath[fi];

      BestContainment  *bestcont = OG->getBestContainer(frg->ident);
      MateLocationEntry mloc     = positions.getById(frg->ident);

      uint32  thisFrgID = frg->ident;
      uint32  contFrgID = (bestcont) ? bestcont->container : 0;
      uint32  mateFrgID = FI->mateIID(frg->ident);

      uint32  thisUtgID = thisUnitig->fragIn(thisFrgID);
      uint32  contUtgID = thisUnitig->fragIn(contFrgID);
      uint32  mateUtgID = thisUnitig->fragIn(mateFrgID);

      //  id1 != 0 -> we found the fragment in the mate happiness table
      //  isBad    -> and the mate is unhappy.
      //
      //  What's id1 vs id2 in MateLocationEntry?  Dunno.  All I
      //  know is that if there is no mate present, one of those
      //  will be 0.  (Similar test used above too.)
      //
      bool  isMated  = (mateFrgID > 0);
      bool  isGrumpy = ((isMated) &&
                        (mloc.mleFrgID1 != 0) && (mloc.mleFrgID2 != 0) &&
                        (mloc.isGrumpy == true));

      //
      //  Figure out what to do.
      //

      bool  moveToContainer = false;
      bool  moveToSingleton = false;

      if        ((frg->contained == 0) && (bestcont == NULL)) {
        //  CASE 1:  Not contained.  Leave the fragment here.
        //fprintf(logFile, "case1 frag %d fragsLen %d\n", thisFrgID, fragsLen);

      } else if (isMated == false) {
        //  CASE 2:  Contained but not mated.  Move to be with the
        //  container (if the container isn't here).
        //fprintf(logFile, "case2 frag %d contID %d fragsLen %d\n", thisFrgID, contUtgID, fragsLen);

        if (thisUtgID != contUtgID)
          moveToContainer = true;

      } else if ((isGrumpy == true) && (thisUtgID == mateUtgID)) {
        //  CASE 3:  Not happy, and the frag and mate are together.
        //  Kick out to a singleton.
        //fprintf(logFile, "case3 frag %d utg %d mate %d utg %d cont %d utg %d fragsLen %d\n",
        //        thisFrgID, thisUtgID, mateFrgID, mateUtgID, contFrgID, contUtgID, fragsLen);

        if (thisUtgID == mateUtgID)
          moveToSingleton = true;

      } else {

        //  This makes for some ugly code (we break the nice if else
        //  if else structure we had going on) but the next two cases
        //  need to know if there is an overlap to the rest of the
        //  unitig.

        bool  hasOverlap   = (thisUtgID == contUtgID);
        bool  allContained = false;

        if (hasOverlap == false) {
          if (fragsLen == 0) {
            //  The first fragment.  Check fragments after to see if
            //  there is an overlap (note only frags with an overlap
            //  in the layout are tested).  In rare cases, we ejected
            //  the container, and left a containee with no overlap to
            //  fragments remaining.
            //
            //  Note that this checks if there is an overlap to the
            //  very first non-contained (aka dovetail) fragment ONLY.
            //  If there isn't an overlap to the first non-contained
            //  fragment, then that fragment will likely NOT align
            //  correctly.

            uint32 ft = fi + 1;

#warning 2x BUGS IN COMPARISON HERE

            //  Skip all the contains.
            while ((ft < thisUnitig->ufpath.size()) &&
                   (OG->isContained(thisUnitig->ufpath[ft].ident) == true) &&
                   (MAX(frg->position.bgn, frg->position.end) < MIN(thisUnitig->ufpath[ft].position.bgn, thisUnitig->ufpath[ft].position.end)))
              ft++;

            //  If the frag is not contained (we could be the
            //  container), and overlaps in the layout, see if there
            //  is a real overlap.
            if ((ft < thisUnitig->ufpath.size()) &&
                (OG->isContained(thisUnitig->ufpath[ft].ident) == false) &&
                (MAX(frg->position.bgn, frg->position.end) < MIN(thisUnitig->ufpath[ft].position.bgn, thisUnitig->ufpath[ft].position.end)))
              hasOverlap = OG->containHaveEdgeTo(thisFrgID, thisUnitig->ufpath[ft].ident);

          } else {
            //  Not the first fragment, search for an overlap to an
            //  already placed frag.

            uint32  ft = fi;

            do {
              ft--;

              //  OK to overlap to a contained frag; he could be our
              //  container.

              hasOverlap = OG->containHaveEdgeTo(thisFrgID, thisUnitig->ufpath[ft].ident);

              //  Stop if we found an overlap, or we just checked the
              //  first frag in the unitig, or we no longer overlap in
              //  the layout.
            } while ((hasOverlap == false) &&
                     (ft > 0) &&
                     (MIN(frg->position.bgn, frg->position.end) < MAX(thisUnitig->ufpath[ft].position.bgn, thisUnitig->ufpath[ft].position.end)));
          }
        }  //  end of hasOverlap

        //  An unbelievable special case.  When the unitig is just a
        //  single container fragment (and any contained frags under
        //  it) rule 4 breaks.  The first fragment has no overlap (all
        //  later reads are contained) and so we want to eject it to a
        //  new unitig.  Since there are multiple fragments in this
        //  unitig, the ejection occurs.  Later, all the contains get
        //  moved to the new unitig.  And we repeat.  To prevent, we
        //  abort the ejection if the unitig is all contained in one
        //  fragment.
        //
        if (fragsLen == 0) {
          allContained = true;

          for (uint32 ft = fi + 1; ((allContained == true) && (ft < thisUnitig->ufpath.size())); ft++)
            allContained = OG->isContained(thisUnitig->ufpath[ft].ident);
        }

        if (isGrumpy == true) {
          //  CASE 4:  Not happy and not with the mate.  This one is a
          //  bit of a decision.
          //
          //  If an overlap exists to the rest of the unitig, we'll
          //  leave it here.  We'll also leave it here if it is the
          //  rest of the unitig is all contained in this fragment.
          //
          //  If no overlap, and the mate and container are in the
          //  same unitig, we'll just eject.  That also implies the
          //  other unitig is somewhat large, at least as big as the
          //  insert size.
          //
          //  Otherwise, we'll move to the container and cross our
          //  fingers we place it correctly.  The alternative is to
          //  eject, and hope that we didn't also eject the mate to a
          //  singleton.
          //fprintf(logFile, "case4 frag %d utg %d mate %d utg %d cont %d utg %d fragsLen %d\n",
          //        thisFrgID, thisUtgID, mateFrgID, mateUtgID, contFrgID, contUtgID, fragsLen);

          if ((hasOverlap == false) && (allContained == false))
            if (mateUtgID == contUtgID)
              moveToSingleton = true;
            else
              moveToContainer = true;

        } else {
          //  CASE 5:  Happy!  If with container, or an overlap exists to
          //  some earlier fragment, leave it here.  Otherwise, eject it
          //  to a singleton.  The fragment is ejected instead of moved
          //  to be with its container since we don't know which is
          //  correct - the mate or the overlap.
          //
          //  If not happy, we've already made sure that the mate is not
          //  here (that was case 3).
          //fprintf(logFile, "case5 frag %d utg %d mate %d utg %d cont %d utg %d fragsLen %d\n",
          //        thisFrgID, thisUtgID, mateFrgID, mateUtgID, contFrgID, contUtgID, fragsLen);

          //  If no overlap (so not with container or no overlap to
          //  other frags) eject.
          if ((hasOverlap == false) && (allContained == false))
            moveToSingleton = true;
        }
      }  //  End of cases

      //
      //  Do it.
      //

      if (moveToContainer == true) {
        //  Move the fragment to be with its container.

        Unitig         *thatUnitig = unitigs[contUtgID];
        ufNode          containee  = *frg;

        assert(thatUnitig->id() == contUtgID);

        //  Nuke the fragment in the current list
        frg->ident        = 999999999;
        frg->contained    = 999999999;
        frg->position.bgn = 0;
        frg->position.end = 0;

        assert(thatUnitig->id() == contUtgID);

        if (logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS))
          fprintf(logFile, "Moving contained fragment %d from unitig %d to be with its container %d in unitig %d\n",
                  thisFrgID, thisUtgID, contFrgID, contUtgID);

        assert(bestcont->container == contFrgID);

        thatUnitig->addContainedFrag(thisFrgID, bestcont, logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS));
        assert(thatUnitig->id() == Unitig::fragIn(thisFrgID));

      } else if ((moveToSingleton == true) && (thisUnitig->getNumFrags() != 1)) {
        //  Eject the fragment to a singleton (unless we ARE the singleton)
        Unitig        *singUnitig = new Unitig(logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS));
        ufNode         containee  = *frg;

        //  Nuke the fragment in the current list
        frg->ident        = 999999999;
        frg->contained    = 999999999;
        frg->position.bgn = 0;
        frg->position.end = 0;

        if (logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS))
          fprintf(logFile, "Ejecting unhappy contained fragment %d from unitig %d into new unitig %d\n",
                  thisFrgID, thisUtgID, singUnitig->id());

        containee.contained = 0;

        singUnitig->addFrag(containee, -MIN(containee.position.bgn, containee.position.end), logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS));

        unitigs.push_back(singUnitig);

        thisUnitig = unitigs[ti];  //  Reset the pointer; unitigs might be reallocated

      } else {
        //  Leave fragment here.  Copy the fragment to the list -- if
        //  we need to rebuild the unitig (because fragments were
        //  removed), the list is used, otherwise, we have already
        //  made the changes needed.
        //
        //  Also, very important, update our containment mark.  If our
        //  container was moved, but we stayed put because of a happy
        //  mate, we're still marked as being contained.  Rather than
        //  put this check in all the places where we stay put in the
        //  above if-else-else-else, it's here.

        if ((frg->contained) && (thisUtgID != contUtgID))
          frg->contained = 0;

        frags[fragsLen] = *frg;
        fragsLen++;
      }
    }  //  over all frags

    //  Now, rebuild this unitig if we made changes.

    if (fragsLen != thisUnitig->ufpath.size()) {
      if (logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS))
        fprintf(logFile, "Rebuild unitig %d after removing contained fragments.\n", thisUnitig->id());

      thisUnitig->ufpath.clear();

      //  Occasionally, we move all fragments out of the original unitig.  Might be worth checking
      //  if that makes sense!!
      //
#warning EMPTIED OUT A UNITIG
      if (fragsLen > 0) {
        //  No need to resort.  Offsets only need adjustment if the first fragment is thrown out.
        //  If not, splitOffset will be zero.
        //
        int splitOffset = -MIN(frags[0].position.bgn, frags[0].position.end);

        //  This is where we clean up from the splitting not dealing with contained fragments -- we
        //  force the first frag to be uncontained.
        //
        frags[0].contained = 0;

        for (uint32 i=0; i<fragsLen; i++)
          thisUnitig->addFrag(frags[i], splitOffset, logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS));
      }
    }

    delete [] frags;
    frags = NULL;

  }  //  Over all unitigs
}
//  Build the list of unitig intersection points: places where a read's best
//  edge leaves its unitig (or points within its unitig but without a layout
//  overlap -- a self-intersection).  Confirmed edges (target fragments that
//  do overlap in the layout) are excluded.  Results are sorted by
//  intersected fragment ID and indexed via isectsNum/isectsMap.
//
intersectionList::intersectionList(UnitigVector &unitigs) {

  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig             *tig = unitigs[ti];

    if (tig == NULL)
      continue;

    //  Per-fragment evidence, indexed by position in the path.
    intersectionEvidence *evidence = new intersectionEvidence [tig->ufpath.size()];

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode  *frg = &tig->ufpath[fi];

      if (OG->isContained(frg->ident))
        continue;

      //  For my best overlap, the ID of the unitig that the overlapping fragment is in.

      evidence[fi].edge5 = *OG->getBestEdgeOverlap(frg->ident, false);
      evidence[fi].edge3 = *OG->getBestEdgeOverlap(frg->ident, true);

      evidence[fi].frag5tig = tig->fragIn(evidence[fi].edge5.fragId());
      evidence[fi].frag3tig = tig->fragIn(evidence[fi].edge3.fragId());

      //  Do NOT initialize these!  An earlier fragment could have already confirmed an end.
      //  Properly, only the 5' end of a forward fragment (or 3' end of a reverse fragment) can be
      //  confirmed already (otherwise the tig is nonsense), but we don't yet check that.
      //
      //evidence[fi].frag5confirmed = false;
      //evidence[fi].frag3confirmed = false;

      //  But, because the path could be promiscuous, not every overlap to a different tig is bad.
      //
      //  If my best overlap is to a different tig, but there is an overlapping fragment (in the
      //  unitig placement) with a best edge to me, I'm still good.  The BOG build this unitig using
      //  the edge from the other fragment to me.
      //
      //  If the fragments do not overlap in the layout (yet the best edge still exists) that is a
      //  self-intersection.
      //
      //  The two blocks are identical, except for 'edge3' and 'edge5'.

      if (evidence[fi].frag5tig == tig->id()) {
        //  NOTE(review): this inner 'ti' intentionally shadows the outer
        //  loop variable -- it is the path position of the edge TARGET,
        //  and the confirmation below is recorded on the target's
        //  evidence entry, not on fragment fi.
        uint32   ti  = tig->pathPosition(evidence[fi].edge5.fragId());
        ufNode  *trg = &tig->ufpath[ti];

        uint32  minf = (frg->position.bgn < frg->position.end) ? frg->position.bgn : frg->position.end;
        uint32  maxf = (frg->position.bgn < frg->position.end) ? frg->position.end : frg->position.bgn;

        uint32  mint = (trg->position.bgn < trg->position.end) ? trg->position.bgn : trg->position.end;
        uint32  maxt = (trg->position.bgn < trg->position.end) ? trg->position.end : trg->position.bgn;

        //  If they overlap, mark as confirmed, else remember an intersection.

        if (((minf < mint) && (mint < maxf)) ||  //  t begins inside f
            ((mint < minf) && (minf < maxt))) {  //  f begins inside t
          if (evidence[fi].edge5.frag3p())
            evidence[ti].frag3confirmed = true;
          else
            evidence[ti].frag5confirmed = true;

        } else {
          evidence[fi].frag5self = true;

          //  Not the correct place to report this.  Some of these get confirmed by later fragments.
          //writeLog("BUG1 F: %d,%d T %d,%d\n", minf, maxf, mint, maxt);
          //writeLog("INTERSECT from unitig %d frag %d end %d TO unitig %d frag %d end %d (SELF)\n",
          //        tig->id(), frg->ident, 5, evidence[fi].frag5tig, evidence[fi].edge5.fragId(), evidence[fi].edge5.frag3p() ? 3 : 5);
        }
      }

      if (evidence[fi].frag3tig == tig->id()) {
        //  Same shadowing as above: 'ti' is the target's path position.
        uint32   ti  = tig->pathPosition(evidence[fi].edge3.fragId());
        ufNode  *trg = &tig->ufpath[ti];

        uint32  minf = (frg->position.bgn < frg->position.end) ? frg->position.bgn : frg->position.end;
        uint32  maxf = (frg->position.bgn < frg->position.end) ? frg->position.end : frg->position.bgn;

        uint32  mint = (trg->position.bgn < trg->position.end) ? trg->position.bgn : trg->position.end;
        uint32  maxt = (trg->position.bgn < trg->position.end) ? trg->position.end : trg->position.bgn;

        if (((minf < mint) && (mint < maxf)) ||  //  t begins inside f
            ((mint < minf) && (minf < maxt))) {  //  f begins inside t
          if (evidence[fi].edge3.frag3p())
            evidence[ti].frag3confirmed = true;
          else
            evidence[ti].frag5confirmed = true;

        } else {
          evidence[fi].frag3self = true;

          //  Not the correct place to report this.  Some of these get confirmed by later fragments.
          //writeLog("BUG2 F: %d,%d T %d,%d\n", minf, maxf, mint, maxt);
          //writeLog("INTERSECT from unitig %d frag %d end %d TO unitig %d frag %d end %d (SELF)\n",
          //        tig->id(), frg->ident, 3, evidence[fi].frag3tig, evidence[fi].edge3.fragId(), evidence[fi].edge3.frag3p() ? 3 : 5);
        }
      }
    }

    //
    //  Build the list.
    //

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode  *frg = &tig->ufpath[fi];

      //  Unconfirmed 5' edge to a different unitig: a real intersection.
      if ((evidence[fi].frag5tig != 0) &&
          (evidence[fi].frag5tig != tig->id()) &&
          (evidence[fi].frag5confirmed == false))
        isects.push_back(intersectionPoint(evidence[fi].edge5, frg->ident, false, false));

      //  Unconfirmed 5' edge within this unitig: a self-intersection.
      if ((evidence[fi].frag5tig == tig->id()) &&
          (evidence[fi].frag5self == true) &&
          (evidence[fi].frag5confirmed == false))
        isects.push_back(intersectionPoint(evidence[fi].edge5, frg->ident, false, true));

      if ((evidence[fi].frag3tig != 0) &&
          (evidence[fi].frag3tig != tig->id()) &&
          (evidence[fi].frag3confirmed == false))
        isects.push_back(intersectionPoint(evidence[fi].edge3, frg->ident, true, false));

      if ((evidence[fi].frag3tig == tig->id()) &&
          (evidence[fi].frag3self == true) &&
          (evidence[fi].frag3confirmed == false))
        isects.push_back(intersectionPoint(evidence[fi].edge3, frg->ident, true, true));
    }

    delete [] evidence;
  }

  //  Sort the intersections by the ID of the intersected fragment, then build an index into the array.

  std::sort(isects.begin(), isects.end());

  //  Terminate the intersection list with a sentinal intersection.  This is CRITICAL
  //  to the way we iterate over intersections.

  isects.push_back(intersectionPoint(BestEdgeOverlap(), 0, true, true));

  //  Build a map from fragment id to the first intersection in the list.

  for (uint32 i=0; i<isects.size(); i++) {
    isectsNum[isects[i].isectFrg]++;

    if (isectsMap.find(isects[i].isectFrg) == isectsMap.end())
      isectsMap[isects[i].isectFrg] = i;
  }
}
//  For every read in every unitig, set the read's 'parent' (the read it should be
//  aligned against by consensus) and the a/b hangs of that overlap.  Parent/hangs
//  are used only to PLACE the read; orientation comes from the absolute layout
//  coordinates.  Parents come from the best container (for contained reads) or
//  from the best 5'/3' edges (for dovetail reads).
//
void
UnitigGraph::setParentAndHang(void) {

  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig  *utg = unitigs[ti];

    if (utg == NULL)
      continue;

    if (utg->ufpath.size() == 0)
      continue;

    //  Reset parent and hangs for everything.
    //  NOTE(review): this reset starts at fi=1, so ufpath[0] keeps whatever
    //  parent/hangs it had -- presumably the first read never has a parent set,
    //  but confirm against the callers.

    for (uint32 fi=1; fi<utg->ufpath.size(); fi++) {
      ufNode *frg = &utg->ufpath[fi];

      frg->parent = 0;
      frg->ahang  = 0;
      frg->bhang  = 0;
    }

    //  For each fragment, set parent/hangs using the edges.

    for (uint32 fi=0; fi<utg->ufpath.size(); fi++) {
      ufNode *frg = &utg->ufpath[fi];

      //  If we're contained, gee, I sure hope the container is here!

      BestContainment *bestcont = OG->getBestContainer(frg->ident);

      if ((bestcont) && (utg->fragIn(bestcont->container) == utg->id())) {
        int32   pi  = utg->pathPosition(bestcont->container);
        ufNode *par = &utg->ufpath[pi];

        frg->parent = bestcont->container;

        //  The hangs assume the container is forward; adjust if not so.

        if (par->position.bgn < par->position.end) {
          frg->ahang = bestcont->a_hang;
          frg->bhang = bestcont->b_hang;
        } else {
          frg->ahang = -bestcont->b_hang;
          frg->bhang = -bestcont->a_hang;
        }

        continue;
      }

      //  Nope, not contained.  If we don't have a parent set, see if one of our best overlaps
      //  can set it.

      BestEdgeOverlap *bestedge5 = OG->getBestEdgeOverlap(frg->ident, false);
      BestEdgeOverlap *bestedge3 = OG->getBestEdgeOverlap(frg->ident, true);

      if ((bestedge5->fragId()) && (utg->fragIn(bestedge5->fragId()) == utg->id())) {
        int32   pi5 = utg->pathPosition(bestedge5->fragId());
        ufNode *oth = &utg->ufpath[pi5];

        //  Consensus is expected parent/hangs to be relative to the parent fragment.  This is used
        //  ONLY to place the fragment, not to orient the fragment.  Orientation comes from the
        //  absolute positioning coordinates.
        //
        //  Interestingly, all four overlap transformations are used here.
        //
        //  The inner if tests (on fragment orientation) should be asserts, but due to imprecise
        //  layouts, they are sometimes violated:
        //    A fragment from 271-547 had a 5'overlap to something after it;
        //    the frag after was at 543-272, close enough to a tie to screw up placements
        //

        if (pi5 < fi) {
          //  We have an edge off our 5' end to something before us --> fragment MUST be forward.
          //  Flip the overlap so it is relative to the other fragment.

          if (frg->position.bgn < frg->position.end) {
            frg->parent = bestedge5->fragId();
            frg->ahang  = -bestedge5->ahang();
            frg->bhang  = -bestedge5->bhang();
            assert(frg->ahang >= 0);
          }

        } else {
          //  We have an edge off our 5' end to something after us --> fragment MUST be reverse.
          //  Because our fragment is now reverse, we must reverse the overlap too.

          if (frg->position.end < frg->position.bgn) {
            oth->parent = frg->ident;
            oth->ahang  = -bestedge5->bhang();
            oth->bhang  = -bestedge5->ahang();
            assert(oth->ahang >= 0);
          }
        }
      }

      if ((bestedge3->fragId()) && (utg->fragIn(bestedge3->fragId()) == utg->id())) {
        int32   pi3 = utg->pathPosition(bestedge3->fragId());
        ufNode *oth = &utg->ufpath[pi3];

        if (pi3 < fi) {
          //  We have an edge off our 3' end to something before us --> fragment MUST be reverse.
          //  Flip the overlap so it is relative to the other fragment.
          //  Because our fragment is now reverse, we must reverse the overlap too.

          if (frg->position.end < frg->position.bgn) {
            frg->parent = bestedge3->fragId();
            frg->ahang  = bestedge3->bhang();
            frg->bhang  = bestedge3->ahang();
            assert(frg->ahang >= 0);
          }

        } else {
          //  We have an edge off our 3' end to something after us --> fragment MUST be forward.
          //  This is the simplest case, the overlap is already correct.

          if (frg->position.bgn < frg->position.end) {
            oth->parent = frg->ident;
            oth->ahang  = bestedge3->ahang();
            oth->bhang  = bestedge3->bhang();
            assert(oth->ahang >= 0);
          }
        }
      }
    }
  }
}
//  Scan every unitig and decide if it could be a bubble sitting inside some larger
//  unitig.  A tig is a potential bubble if (at least) BUBBLE_READ_FRACTION of its
//  non-contained reads have overlaps to the same longer tig.  Candidate target tig
//  IDs are appended to potentialBubbles[ti].
//
void
findPotentialBubbles(UnitigVector       &unitigs,
                     BubTargetList      &potentialBubbles) {
  uint32  tiLimit      = unitigs.size();
  uint32  tiNumThreads = omp_get_max_threads();
  //  NOTE(review): tiBlockSize (and fiBlockSize below) are computed but never used in this
  //  function -- presumably left over from, or intended for, an omp schedule clause; confirm.
  uint32  tiBlockSize  = (tiLimit < 100000 * tiNumThreads) ? tiNumThreads : tiLimit / 99999;

  writeStatus("\n");
  writeStatus("bubbleDetect()-- working on "F_U32" unitigs, with "F_U32" threads.\n", tiLimit, tiNumThreads);

  for (uint32 ti=0; ti<tiLimit; ti++) {
    Unitig  *tig = unitigs[ti];

    if ((tig == NULL) ||             //  Not a tig, ignore it.
        (tig->ufpath.size() == 1))   //  Singleton, handled elsewhere.
      continue;

    uint32  nonContainedReads = 0;
    bool    validBubble       = true;

    map<uint32,uint32>  tigOlapsTo;   //  target tig id -> count of our reads with overlaps to it

    uint32  fiLimit      = tig->ufpath.size();
    uint32  fiNumThreads = omp_get_max_threads();
    uint32  fiBlockSize  = (fiLimit < 100 * fiNumThreads) ? fiNumThreads : fiLimit / 99;

    //  The loop stops early (validBubble goes false) as soon as no candidate target
    //  covers enough of the reads seen so far.

    for (uint32 fi=0; (validBubble == true) && (fi<fiLimit); fi++) {
      uint32      rid = tig->ufpath[fi].ident;

      if (OG->isContained(rid) == true)   //  Don't need to check contained reads.  If their container
        continue;                         //  passes the tests below, the contained read will too.

      nonContainedReads++;

      uint32      ovlLen = 0;
      BAToverlap *ovl    = OC->getOverlaps(rid, AS_MAX_ERATE, ovlLen);

      set<uint32>  readOlapsTo;   //  distinct candidate target tigs for this one read

      for (uint32 oi=0; oi<ovlLen; oi++) {
        uint32  ovlTigID = Unitig::fragIn(ovl[oi].b_iid);
        Unitig *ovlTig   = unitigs[ovlTigID];

        //  Skip this overlap if it is to an unplaced read, to a singleton tig, to ourself,
        //  or to a unitig that is shorter than us.  We can not pop this tig as a bubble
        //  in any of those cases.

        if ((ovlTigID == 0) ||
            (ovlTig == NULL) ||
            (ovlTig->ufpath.size() == 1) ||
            (ovlTig->id() == tig->id()) ||
            (ovlTig->getLength() < tig->getLength()))
          continue;

        //  Otherwise, remember that we had an overlap to ovlTig.

        //writeLog("tig %u read %u overlap to tig %u read %u\n",
        //         tig->id(), rid, ovlTigID, ovl[oi].b_iid);

        readOlapsTo.insert(ovlTigID);
      }

      //writeLog("tig %8u read %8u has %u olaps\n", tig->id(), rid, readOlapsTo.size());

      //  Transfer the per-read counts to the per-unitig counts: add one to the counter for each tig
      //  that we have overlaps to.

      for (set<uint32>::iterator it=readOlapsTo.begin(); it != readOlapsTo.end(); ++it)
        tigOlapsTo[*it]++;

      //  Decide if we're a valid potential bubble.  If tig id (in it->first) has overlaps to every
      //  read we've seen so far (nonContainedReads), we're still a valid bubble.
      //
      //  To _attempt_ to have differences in the bubble, we'll accept it if 3/4 of the reads
      //  have overlaps.

      validBubble = false;

      for (map<uint32,uint32>::iterator it=tigOlapsTo.begin(); it != tigOlapsTo.end(); ++it)
        if (it->second >= BUBBLE_READ_FRACTION * nonContainedReads)
          validBubble = true;

      //  If we've not seen that many reads, pretend it's a valid bubble.  It'll get screened out later.

      if (nonContainedReads < 16)
        validBubble = true;
    }

    //  If not validBubble, report.

#if 0
    if (validBubble == false) {
      writeLog("notValidBubble tig %8d expects %6u reads\n", tig->id(), nonContainedReads);

      for (map<uint32,uint32>::iterator it=tigOlapsTo.begin(); it != tigOlapsTo.end(); ++it)
        writeLog(" to tig %8u overlaps %6u\n", it->first, it->second);
    }
#endif

    //  If validBubble, then there is a tig that every dovetail read has at least one overlap to.
    //  Save those tigs in potentialBubbles.

    uint32  nTigs = 0;

    if (validBubble) {
      for (map<uint32,uint32>::iterator it=tigOlapsTo.begin(); it != tigOlapsTo.end(); ++it)
        if (it->second >= BUBBLE_READ_FRACTION * nonContainedReads)
          nTigs++;
    }

    //  ALWAYS log potential bubbles.

    if (nTigs > 0) {
      writeLog("\n");
      writeLog("potential bubble tig %8u length %9u nReads %7u to %3u tigs:\n",
               tig->id(), tig->getLength(), tig->ufpath.size(), nTigs);

      for (map<uint32,uint32>::iterator it=tigOlapsTo.begin(); it != tigOlapsTo.end(); ++it) {
        if (it->second >= BUBBLE_READ_FRACTION * nonContainedReads) {
          Unitig  *dest = unitigs[it->first];

          writeLog(" tig %8u length %9u nReads %7u\n", dest->id(), dest->getLength(), dest->ufpath.size());

          potentialBubbles[ti].push_back(dest->id());
        }
      }
    }
  }

  flushLog();
}
//  Pop bubbles.  For each tig flagged as a potential bubble (by findPotentialBubbles()),
//  find all the places its reads could go in the candidate target tigs, then classify:
//    - orphan: every read places in one region of one target; the reads are moved there
//      and the bubble tig is deleted;
//    - bubble: both ends of the tig anchor in the same target region; the tig is kept
//      but flagged _isBubble (so repeat detection can skip it);
//    - repeat: more than one orphan/bubble placement; flagged _isRepeat.
//
//  BUGFIX over the original: the candidatePop objects pushed into 'targets' were never
//  deleted on any exit path, leaking once per processed bubble.  They are now released
//  on every path.  The unused local 'nTargets' was also removed.
//
void
popBubbles(UnitigVector &unitigs,
           double        deviationBubble) {
  BubTargetList   potentialBubbles;

  findPotentialBubbles(unitigs, potentialBubbles);

  writeStatus("popBubbles()-- Found "F_SIZE_T" potential bubbles.\n", potentialBubbles.size());

  //if (potentialBubbles.size() == 0)
  //  return;

  writeLog("\n");
  writeLog("Found "F_SIZE_T" potential bubbles.\n", potentialBubbles.size());
  writeLog("\n");

  vector<overlapPlacement>   *placed = findBubbleReadPlacements(unitigs, potentialBubbles, deviationBubble);

  //  We now have, in 'placed', a list of all the places that each read could be placed.  Decide if there is a _single_
  //  place for each bubble to be popped.

  uint32  tiLimit      = unitigs.size();
  //uint32  tiNumThreads = omp_get_max_threads();
  //uint32  tiBlockSize  = (tiLimit < 100000 * tiNumThreads) ? tiNumThreads : tiLimit / 99999;

  //  Clear flags.
  for (uint32 ti=0; ti<tiLimit; ti++) {
    if (unitigs[ti]) {
      unitigs[ti]->_isBubble = false;
      unitigs[ti]->_isRepeat = false;
    }
  }

  //  In parallel, process the placements.

  for (uint32 ti=0; ti<tiLimit; ti++) {
    if (potentialBubbles.count(ti) == 0)   //  Not a potential bubble
      continue;

    //  Scan the bubble, decide if there are _ANY_ read placements.  Log appropriately.

    Unitig  *bubble        = unitigs[ti];
    bool     hasPlacements = false;

    for (uint32 fi=0; fi<bubble->ufpath.size(); fi++) {
      uint32  readID = bubble->ufpath[fi].ident;

      if (placed[readID].size() > 0)
        hasPlacements = true;
    }

    if (hasPlacements == false)
      writeLog("potential bubble %u had no valid placements (all were not contained in target tig)\n", ti);
    else
      writeLog("potential bubble %u\n", ti);

    //  Split the placements into piles for each target and build an interval list for each target.
    //  For each read in the tig, convert the vector of placements into interval lists, one list per target tig.

    map<uint32, intervalList<uint32> *>  targetIntervals;

    for (uint32 fi=0; fi<bubble->ufpath.size(); fi++) {
      uint32  readID = bubble->ufpath[fi].ident;

      for (uint32 pp=0; pp<placed[readID].size(); pp++) {
        uint32  tid = placed[readID][pp].tigID;

        assert(placed[readID][pp].frgID > 0);

        uint32  bgn = (placed[readID][pp].position.bgn < placed[readID][pp].position.end) ? placed[readID][pp].position.bgn : placed[readID][pp].position.end;
        uint32  end = (placed[readID][pp].position.bgn < placed[readID][pp].position.end) ? placed[readID][pp].position.end : placed[readID][pp].position.bgn;

        if (targetIntervals[tid] == NULL)
          targetIntervals[tid] = new intervalList<uint32>;

        //writeLog("read %u -> tig %u intervals %u-%u\n", readID, tid, bgn, end);

        targetIntervals[tid]->add(bgn, end-bgn);
      }
    }

    vector<candidatePop *>    targets;

    //  Squish the intervals.  Create new candidatePops for each interval that isn't too big or
    //  small.  Assign each overlapPlacements to the correct candidatePop.

    for (map<uint32, intervalList<uint32> *>::iterator it=targetIntervals.begin(); it != targetIntervals.end(); ++it) {
      uint32                 targetID = it->first;
      intervalList<uint32>  *IL       = it->second;

      IL->merge();

      //  Discard intervals that are significantly too small or large.  Save the ones that are
      //  nicely sized.  Logging here isn't terribly useful, it's just repeated (out of order) later
      //  when we try to make sense of the read alignments.

      for (uint32 ii=0; ii<IL->numberOfIntervals(); ii++) {
        if ((IL->hi(ii) - IL->lo(ii) < 0.75 * bubble->getLength()) ||   //  Too small!
            (1.25 * bubble->getLength() < IL->hi(ii) - IL->lo(ii))) {   //  Too big!
          writeLog("tig %8u length %9u -> target %8u piece %2u position %9u-%9u length %8u - size mismatch, discarded\n",
                   bubble->id(), bubble->getLength(),
                   targetID, ii, IL->lo(ii), IL->hi(ii), IL->hi(ii) - IL->lo(ii));
          continue;
        }

        writeLog("tig %8u length %9u -> target %8u piece %2u position %9u-%9u length %8u\n",
                 bubble->id(), bubble->getLength(),
                 targetID, ii, IL->lo(ii), IL->hi(ii), IL->hi(ii) - IL->lo(ii));

        targets.push_back(new candidatePop(bubble, unitigs[targetID], IL->lo(ii), IL->hi(ii)));
      }

      delete IL;
    }

    targetIntervals.clear();

    //  If no targets, nothing to do.

    if (targets.size() == 0)
      continue;

    //  Run through the placements again, and assign them to the correct target.
    //
    //  For each read:
    //    For each acceptable placement:
    //      For each target location:
    //        If the placement is for this target, save it.

    for (uint32 fi=0; fi<bubble->ufpath.size(); fi++) {
      uint32  readID = bubble->ufpath[fi].ident;

      for (uint32 pp=0; pp<placed[readID].size(); pp++) {
        uint32  tid = placed[readID][pp].tigID;

        uint32  bgn = (placed[readID][pp].position.bgn < placed[readID][pp].position.end) ? placed[readID][pp].position.bgn : placed[readID][pp].position.end;
        uint32  end = (placed[readID][pp].position.bgn < placed[readID][pp].position.end) ? placed[readID][pp].position.end : placed[readID][pp].position.bgn;

        for (uint32 tt=0; tt<targets.size(); tt++)
          if ((targets[tt]->target->id() == tid) && (targets[tt]->bgn < end) && (bgn < targets[tt]->end))
            targets[tt]->placed.push_back(placed[readID][pp]);
      }
    }

    //  Count the number of targets that have all the reads (later: in the correct order, etc, etc).  Remove those
    //  that don't.

    set<uint32>  tigReads;  //  Reads in the bubble tig.
    set<uint32>  tgtReads;  //  Reads in the bubble that have a placement in the target.

    //  Remove duplicate placements from each target.

    for (uint32 tt=0; tt<targets.size(); tt++) {
      candidatePop *t = targets[tt];

      //  Detect duplicates, keep the one with lower error.  There are a lot of duplicate
      //  placements, logging isn't terribly useful.

      for (uint32 aa=0; aa<t->placed.size(); aa++) {
        for (uint32 bb=0; bb<t->placed.size(); bb++) {
          if ((aa == bb) ||
              (t->placed[aa].frgID != t->placed[bb].frgID) ||
              (t->placed[aa].frgID == 0) ||
              (t->placed[bb].frgID == 0))
            continue;

          if (t->placed[aa].errors / t->placed[aa].aligned < t->placed[bb].errors / t->placed[bb].aligned) {
#ifdef SHOW_MULTIPLE_PLACEMENTS
            writeLog("duplicate read alignment for tig %u read %u - better %u-%u %.4f - worse %u-%u %.4f\n",
                     t->placed[aa].tigID, t->placed[aa].frgID,
                     t->placed[aa].position.bgn, t->placed[aa].position.end, t->placed[aa].errors / t->placed[aa].aligned,
                     t->placed[bb].position.bgn, t->placed[bb].position.end, t->placed[bb].errors / t->placed[bb].aligned);
#endif
            t->placed[bb] = overlapPlacement();
          } else {
#ifdef SHOW_MULTIPLE_PLACEMENTS
            writeLog("duplicate read alignment for tig %u read %u - better %u-%u %.4f - worse %u-%u %.4f\n",
                     t->placed[aa].tigID, t->placed[aa].frgID,
                     t->placed[bb].position.bgn, t->placed[bb].position.end, t->placed[bb].errors / t->placed[bb].aligned,
                     t->placed[aa].position.bgn, t->placed[aa].position.end, t->placed[aa].errors / t->placed[aa].aligned);
#endif
            t->placed[aa] = overlapPlacement();
          }
        }
      }

      //  Get rid of any now-empty entries.

      for (uint32 aa=t->placed.size(); aa--; ) {
        if (t->placed[aa].frgID == 0) {
          t->placed[aa] = t->placed.back();
          t->placed.pop_back();
        }
      }
    }

    //  Make a set of the reads in the bubble.  We'll compare each target against this to decide if all reads are placed.

    for (uint32 fi=0; fi<bubble->ufpath.size(); fi++)
      tigReads.insert(bubble->ufpath[fi].ident);

    uint32   nOrphan      = 0;   //  Full coverage; bubble can be popped.
    uint32   orphanTarget = 0;

    uint32   nBubble      = 0;   //  Partial coverage, bubble cannot be popped.
    uint32   bubbleTarget = 0;

    for (uint32 tt=0; tt<targets.size(); tt++) {
      tgtReads.clear();

      for (uint32 op=0; op<targets[tt]->placed.size(); op++) {
        if (logFileFlagSet(LOG_BUBBLE_DETAIL))
          writeLog("tig %8u length %9u -> target %8u piece %2u position %9u-%9u length %8u - read %7u at %9u-%9u\n",
                   bubble->id(), bubble->getLength(),
                   targets[tt]->target->id(), tt, targets[tt]->bgn, targets[tt]->end, targets[tt]->end - targets[tt]->bgn,
                   targets[tt]->placed[op].frgID,
                   targets[tt]->placed[op].position.bgn, targets[tt]->placed[op].position.end);

        assert(targets[tt]->placed[op].frgID > 0);
        tgtReads.insert(targets[tt]->placed[op].frgID);
      }

      //  Count the number of consecutive reads from the 5' or 3' end of the bubble that are placed
      //  in the target.
      //
      //  Also, count the number of reads in the bubble that are placed in the target.  Likely the
      //  same as n5 + n3.

      uint32  n5 = 0;
      uint32  n3 = 0;
      uint32  nt = 0;

      for (uint32 fi=0; fi<bubble->ufpath.size(); fi++)
        if (tgtReads.count(bubble->ufpath[fi].ident) > 0)
          n5++;
        else
          break;

      for (uint32 fi=bubble->ufpath.size(); fi-->0; )
        if (tgtReads.count(bubble->ufpath[fi].ident) > 0)
          n3++;
        else
          break;

      for (uint32 fi=0; fi<bubble->ufpath.size(); fi++)
        if (tgtReads.count(bubble->ufpath[fi].ident) > 0)
          nt++;

      //  Report now, before we nuke targets[tt] for being not a bubble!

      if ((nt == bubble->ufpath.size()) || ((n5 > 0) && (n3 > 0)))
        writeLog("tig %8u length %9u -> target %8u piece %2u position %9u-%9u length %8u - expected %3"F_SIZE_TP" reads, had %3"F_SIZE_TP" reads. n5=%3u n3=%3u nt=%3u\n",
                 bubble->id(), bubble->getLength(),
                 targets[tt]->target->id(), tt, targets[tt]->bgn, targets[tt]->end, targets[tt]->end - targets[tt]->bgn,
                 tigReads.size(), tgtReads.size(), n5, n3, nt);

      //  Decide if this is a bubble, orphan from construction, or repeat.

      if (nt == bubble->ufpath.size()) {
        nOrphan++;
        orphanTarget = tt;
      }

      else if ((n5 > 0) && (n3 > 0)) {
        nBubble++;
        bubbleTarget = tt;
      }
    }

    //  If no placements, pbbbt.

    if (nOrphan + nBubble == 0) {
      //writeLog("tig %8u length %8u reads %6u had no bubble or orphan placements.\n", bubble->id(), bubble->getLength(), bubble->ufpath.size());
      for (uint32 tt=0; tt<targets.size(); tt++)   //  Release the candidatePops (leaked in the original).
        delete targets[tt];
      continue;
    }

    //  If multiple orphan and/or bubble placements, it's a repeat.

    if (nOrphan + nBubble > 1) {
      writeLog("tig %8u length %8u reads %6u - repeat - %u orphan %u bubble placements.\n",
               bubble->id(), bubble->getLength(), bubble->ufpath.size(), nOrphan, nBubble);
      writeLog("\n");
      bubble->_isRepeat = true;
      for (uint32 tt=0; tt<targets.size(); tt++)   //  Release the candidatePops (leaked in the original).
        delete targets[tt];
      continue;
    }

    //  If a bubble placement, mark it as a bubble so it can be skipped during repeat detection.

    if (nBubble > 0) {
      writeLog("tig %8u length %8u reads %6u - bubble\n",
               bubble->id(), bubble->getLength(), bubble->ufpath.size());
      writeLog("\n");
      bubble->_isBubble = true;
      for (uint32 tt=0; tt<targets.size(); tt++)   //  Release the candidatePops (leaked in the original).
        delete targets[tt];
      continue;
    }

    //  Otherwise, it's an orphan, move the reads to the proper place.

    writeLog("tig %8u length %8u reads %6u - orphan\n",
             bubble->id(), bubble->getLength(), bubble->ufpath.size());

    for (uint32 op=0, tt=orphanTarget; op<targets[tt]->placed.size(); op++) {
      ufNode  frg;

      frg.ident        = targets[tt]->placed[op].frgID;
      frg.contained    = 0;
      frg.parent       = 0;
      frg.ahang        = 0;
      frg.bhang        = 0;
      frg.position.bgn = targets[tt]->placed[op].position.bgn;
      frg.position.end = targets[tt]->placed[op].position.end;

      writeLog("move read %u from tig %u to tig %u %u-%u\n",
               frg.ident, bubble->id(), targets[tt]->target->id(), frg.position.bgn, frg.position.end);

      targets[tt]->target->addFrag(frg, 0, false);
    }

    for (uint32 tt=0; tt<targets.size(); tt++)   //  Release the candidatePops (leaked in the original).
      delete targets[tt];

    writeLog("\n");

    unitigs[bubble->id()] = NULL;
    delete bubble;
  }  //  Over all bubbles

  writeLog("\n");   //  Needed if no bubbles are popped.

  delete [] placed;

  //  Sort reads in all the tigs.  Overkill, but correct.

  for (uint32 ti=0; ti<tiLimit; ti++) {
    Unitig  *tig = unitigs[ti];

    if ((tig == NULL) ||               //  Not a tig, ignore it.
        (tig->ufpath.size() == 1))     //  Singleton, already sorted.
      continue;

    tig->sort();
  }
}
//  For each unitig, collect its reads plus any of their mates that are not yet placed
//  in any unitig, tear the unitig down, and rebuild it with a BestOverlapGraph /
//  ChunkGraph restricted to just that read set.  This can pull the extra mates into
//  the rebuilt unitig(s).  Swaps the global OG/CG during the rebuild and restores
//  them afterward.
//
void
extendByMates(UnitigVector &unitigs,
              double        erateGraph) {

  //logFileFlags |= LOG_CHUNK_GRAPH;
  logFileFlags |= LOG_POPULATE_UNITIG;

  writeLog("==> EXTENDING UNITIGS WITH MATE PAIRS.\n");

  uint32  tiMax = unitigs.size();

  for (uint32 ti=0; ti<tiMax; ti++) {
    Unitig  *target = unitigs[ti];

    if (target == NULL)
      continue;

    if (target->ufpath.size() < 2)
      continue;

    //  Build a list of all the fragments in this unitig, and any mates that are not in a unitig.

    uint32  extraMates = 0;

    for (uint32 fi=0; fi<target->ufpath.size(); fi++) {
      uint32  fid = target->ufpath[fi].ident;
      uint32  mid = FI->mateIID(fid);

      if ((mid != 0) && (Unitig::fragIn(mid) == 0))
        extraMates++;
    }

    writeLog("\n");
    writeLog("unitig "F_U32" of size "F_SIZE_T" with "F_U32" extra fragments via mates\n",
             ti, target->ufpath.size(), extraMates);

    if (extraMates == 0)
      continue;

    //  Build a set of the fragments in this unitig plus their mates, and a set of just the mates.

    set<uint32>  frags;
    set<uint32>  mates;

    for (uint32 fi=0; fi<target->ufpath.size(); fi++) {
      uint32  fid = target->ufpath[fi].ident;
      uint32  mid = FI->mateIID(fid);

      frags.insert(fid);

      if ((mid != 0) && (Unitig::fragIn(mid) == 0)) {
        writeLog(" mate frag "F_U32"\n", mid);
        frags.insert(mid);
        mates.insert(mid);
      }
    }

    //  Now, remove all the unitig fragments from the unitig so we can reconstruct it with the
    //  additional mated fragments.  Note that this loop cannot be combined with the last, since
    //  the test for 'additional mate' is 'not in the same unitig' -- and if we remove the
    //  fragments too early, we can't distinguish 'additional' from 'included'.
    //
    //  NOTE(review): this iterates forward over ufpath while calling removeFrag() on each
    //  entry -- safe only if removeFrag() does not erase entries from ufpath (e.g. if it just
    //  clears the fragment-to-unitig registration); confirm against Unitig::removeFrag().

    for (uint32 fi=0; fi<target->ufpath.size(); fi++)
      target->removeFrag(target->ufpath[fi].ident);

    unitigs[ti] = NULL;
    delete target;

    //  Build a new BOG for just those fragments - in particular, only overlaps within the set are
    //  used for the BOG.

    BestOverlapGraph  *OGsave = OG;
    ChunkGraph        *CGsave = CG;

    OG = new BestOverlapGraph(erateGraph, &frags);
    CG = new ChunkGraph(&frags);

    uint32  numTigs = unitigs.size();

    //  Build new unitigs.  There should only be one new unitig constructed, but that isn't
    //  guaranteed.  No new unitigs are built if they are seeded from the mate fragments.  This
    //  isn't ideal -- we'd like to allow the first unitig (supposedly the longest) to start from
    //  a mate fragment.  However, consider the not-so-rare case where the original unitig is two
    //  backbone fragments and lots of contains.  Those contains contribute mate pairs that all
    //  assemble together, giving a longer path than the original unitig.  We don't want to
    //  assemble the mated fragments yet (we'll wait until we get the rest of the fragments that
    //  could assemble together).

    for (uint32 fi = CG->nextFragByChunkLength(); fi > 0; fi=CG->nextFragByChunkLength()) {
      if ((Unitig::fragIn(fi) != 0) ||
          (mates.count(fi) > 0))
        //  Fragment already in a unitig, or is an additional mate that we don't want
        //  to seed from.
        continue;

      populateUnitig(unitigs, fi);
    }

    //  Report what was constructed

    if (unitigs.size() - numTigs > 1)
      writeLog("WARNING: mate extension split a unitig.\n");

    for (uint32 newTigs=numTigs; newTigs<unitigs.size(); newTigs++) {
      Unitig  *tig = unitigs[newTigs];

      if (tig == NULL)
        continue;

      placeContainsUsingBestOverlaps(tig, &frags);

      writeLog(" new tig "F_U32" with "F_SIZE_T" fragments\n", tig->id(), tig->ufpath.size());
    }

    //  Restore the global graphs.

    delete OG;
    delete CG;

    OG = OGsave;
    CG = CGsave;
  }
}
//  Write all unitigs to a tgStore at tigStorePath, and emit three text files based on
//  fileprefix: '.iidmap' (unitig id -> IUM id -> partition), '.partitioning' (one
//  'partition fragment' pair per read), and '.partitioningInfo' (per-partition summary).
//  Partitions are cut so each holds roughly frg_count_target fragments.  When isFinal,
//  unitig membership is sanity checked first, and tigs are renumbered densely (iumiid);
//  otherwise the original index ti is used.
//
void
writeUnitigsToStore(UnitigVector  &unitigs,
                    char          *fileprefix,
                    char          *tigStorePath,
                    uint32         frg_count_target,
                    bool           isFinal) {
  uint32  utg_count = 0;
  uint32  frg_count = 0;
  uint32  prt_count = 1;
  char    filename[FILENAME_MAX] = {0};

  uint32 *partmap = new uint32 [unitigs.size()];   //  tig id -> partition number (logged to iidmap)

  //  This code closely follows that in AS_CGB_unitigger.c::output_the_chunks()

  if (isFinal)
    checkUnitigMembership(unitigs);

  //  Open up the initial output file

  sprintf(filename, "%s.iidmap", fileprefix);
  FILE *iidm = fopen(filename, "w");
  assert(NULL != iidm);

  sprintf(filename, "%s.partitioning", fileprefix);
  FILE *part = fopen(filename, "w");
  assert(NULL != part);

  sprintf(filename, "%s.partitioningInfo", fileprefix);
  FILE *pari = fopen(filename, "w");
  assert(NULL != pari);

  //  Step through all the unitigs once to build the partition mapping and IID mapping.

  memset(partmap, 0xff, sizeof(uint32) * unitigs.size());

  for (uint32 iumiid=0, ti=0; ti<unitigs.size(); ti++) {
    Unitig  *utg = unitigs[ti];
    uint32   nf  = (utg) ? utg->getNumFrags() : 0;

    if ((utg == NULL) || (nf == 0))
      continue;

    assert(utg->getLength() > 0);
    assert(nf == utg->ufpath.size());

    //  Close out the current partition once it has at least one fragment and
    //  adding this tig would reach the target size.

    if ((frg_count + nf >= frg_count_target) &&
        (frg_count > 0)) {
      fprintf(pari, "Partition %d has %d unitigs and %d fragments.\n",
              prt_count, utg_count, frg_count);

      prt_count++;
      utg_count = 0;
      frg_count = 0;
    }

    uint32 tigid = (isFinal) ? iumiid : ti;

    assert(tigid < unitigs.size());
    partmap[tigid] = prt_count;

    fprintf(iidm, "Unitig "F_U32" == IUM "F_U32" (in partition "F_U32" with "F_U32" frags)\n",
            utg->id(), (tigid), partmap[(tigid)], nf);

    for (uint32 fragIdx=0; fragIdx<nf; fragIdx++) {
      ufNode  *f = &utg->ufpath[fragIdx];

      fprintf(part, "%d\t%d\n", prt_count, f->ident);
    }

    utg_count += 1;
    frg_count += nf;

    iumiid++;
  }

  //  Flush the final (possibly short) partition summary.

  fprintf(pari, "Partition %d has %d unitigs and %d fragments.\n",
          prt_count, utg_count, frg_count);

  fclose(pari);
  fclose(part);
  fclose(iidm);

  //  Step through all the unitigs a second time, converting each to a tgTig and
  //  inserting it into the tigStore.

  tgStore  *tigStore = new tgStore(tigStorePath);
  tgTig    *tig      = new tgTig;

  for (uint32 iumiid=0, ti=0; ti<unitigs.size(); ti++) {
    Unitig  *utg = unitigs[ti];
    uint32   nf  = (utg) ? utg->getNumFrags() : 0;

    if ((utg == NULL) || (nf == 0))
      continue;

    unitigToTig(tig, (isFinal) ? iumiid : ti, utg);

    tigStore->insertTig(tig, false);

    iumiid++;
  }

  delete tig;
  delete tigStore;

  delete [] partmap;
}
void breakUnitigs(UnitigVector &unitigs, char *output_prefix, bool enableIntersectionBreaking) { writeLog("==> BREAKING UNITIGS.\n"); intersectionList *ilist = new intersectionList(unitigs); // Stop when we've seen all current unitigs. Replace tiMax // in the for loop below with unitigs.size() to recursively // split unitigs. uint32 tiMax = unitigs.size(); for (uint32 ti=0; ti<tiMax; ti++) { Unitig *tig = unitigs[ti]; if (tig == NULL) continue; vector<breakPoint> breaks; for (uint32 fi=0; fi<tig->ufpath.size(); fi++) { ufNode *frg = &tig->ufpath[fi]; intersectionPoint *isect = ilist->getIntersection(frg->ident, 0); if (isect == NULL) continue; for (; isect->isectFrg == frg->ident; isect++) { assert(tig->id() == Unitig::fragIn(isect->isectFrg)); // Grab the invading unitig Unitig *inv = unitigs[Unitig::fragIn(isect->invadFrg)]; assert(inv->id() == Unitig::fragIn(isect->invadFrg)); // Grab the best edges off the invading fragment. BestEdgeOverlap *best5 = OG->getBestEdgeOverlap(isect->invadFrg, false); BestEdgeOverlap *best3 = OG->getBestEdgeOverlap(isect->invadFrg, true); // Check if the incoming tig is a spur, and we should just ignore it immediately if ((inv->ufpath.size() == 1) && ((best5->fragId() == 0) || (best3->fragId() == 0))) { if (logFileFlagSet(LOG_INTERSECTION_BREAKING)) writeLog("unitig %d frag %d end %c' into unitig %d frag %d end %c' -- IS A SPUR, skip it\n", inv->id(), isect->invadFrg, isect->invad3p ? '3' : '5', tig->id(), isect->isectFrg, isect->isect3p ? '3' : '5'); continue; } // Keep only significant intersections if ((inv->getLength() > MIN_BREAK_LENGTH) && (inv->ufpath.size() > MIN_BREAK_FRAGS)) { if (logFileFlagSet(LOG_INTERSECTION_BREAKING)) writeLog("unitig %d frag %d end %c' into unitig %d frag %d end %c'\n", inv->id(), isect->invadFrg, isect->invad3p ? '3' : '5', tig->id(), isect->isectFrg, isect->isect3p ? 
'3' : '5'); breaks.push_back(breakPoint(isect->isectFrg, isect->isect3p, true, false)); } } // Over all incoming fragments // If this is the last fragment, terminate the break point list with a 'fakeEnd' (in AS_BAT_Breaking.cc) break point // at the end of the unitig. if ((fi+1 == tig->ufpath.size()) && (breaks.size() > 0)) { breaks.push_back(breakPoint(frg->ident, (frg->position.bgn < frg->position.end), true, false)); } } // Over all fragments in the unitig if (breaks.size() == 0) continue; // Report where breaks occur. 'breaks' is a list, not a vector. #if 0 // We've lost the fields in breaks[i] -- but the reports above aren't updated yet. if (logFileFlagSet(LOG_INTERSECTION_BREAKING) || logFileFlagSet(LOG_MATE_SPLIT_COVERAGE_PLOT)) for (uint32 i=0; i<breaks.size(); i++) writeLog("BREAK unitig %d at position %d,%d from inSize %d inFrags %d.\n", tig->id(), breaks[i].fragPos.bgn, breaks[i].fragPos.end, breaks[i].inSize, breaks[i].inFrags); #endif // Actually do the breaking. if (enableIntersectionBreaking) breakUnitigAt(unitigs, tig, breaks, true); breaks.clear(); } // Over all unitigs }
//  Find 'zombie' fragments -- reads that are alive (not deleted) but ended up in no
//  living unitig -- and resurrect each one as its own new singleton unitig.
//
//  A proper fix would reload overlaps and find a new container for each zombie;
//  that is not implemented, so singletons it is.
//
void
placeZombies(UnitigVector &unitigs, double erate) {

  writeLog("==> SEARCHING FOR ZOMBIES\n");

  uint32  nReads    = FI->numFragments() + 1;
  uint32 *tigOfRead = new uint32 [nReads];
  int     nZombies  = 0;

  //  Presume every read is dead, then mark the ones found in a living unitig.

  for (uint32 fid=0; fid<nReads; fid++)
    tigOfRead[fid] = noUnitig;

  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig *utg = unitigs[ti];

    if (utg == NULL)
      continue;

    for (uint32 fi=0; fi<utg->ufpath.size(); fi++)
      tigOfRead[utg->ufpath[fi].ident] = utg->id();
  }

  //  Any live read still unmarked is a zombie; give it a fresh singleton unitig
  //  spanning the whole read.

  for (uint32 fid=0; fid<nReads; fid++) {
    if (FI->fragmentLength(fid) == 0)   //  Deleted fragment
      continue;

    if (tigOfRead[fid] != noUnitig)     //  Valid fragment in a unitig
      continue;

    Unitig *utg = unitigs.newUnitig(false);
    ufNode  node;

    node.ident             = fid;
    node.contained         = 0;
    node.parent            = 0;
    node.ahang             = 0;
    node.bhang             = 0;
    node.position.bgn      = 0;
    node.position.end      = FI->fragmentLength(fid);
    node.containment_depth = 0;

    utg->addFrag(node, 0, false);

    writeLog("placeZombies()-- unitig %d created from zombie fragment %d\n", utg->id(), fid);

    nZombies++;
  }

  writeLog("RESURRECTED %d ZOMBIE FRAGMENT%s.\n", nZombies, (nZombies != 1) ? "s" : "");

  delete [] tigOfRead;
}
//  Exploratory search for 'mate bubbles': small unitigs whose external mates mostly
//  point at one other unitig.  Only logs candidates; nothing is modified.
//
//  BUGFIXES over the original:
//    - if a read's mate was not placed in any unitig, fragIn() returned 0 and the
//      NULL unitigs[0] was dereferenced; such mates are now skipped.
//    - 'lkg' was leaked on the early 'no external mates' continue.
//    - three never-used tally counters (nBubblePopped, nBubbleTooBig,
//      nBubbleConflict) were removed.
//
void
popMateBubbles(UnitigVector &unitigs) {

  writeLog("==> SEARCHING FOR MATE BUBBLES\n");

  //  For each unitig, if all (or most) of the external mates are to a single other unitig (not
  //  counting singletons), then this is a potential bubble popping unitig.
  //
  //  At present, this is exploratory only.

  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig  *tig = unitigs[ti];

    if ((tig == NULL) ||
        (tig->ufpath.size() == 0))
      //  No tig here.
      continue;

    if ((tig->getLength() > 1000) ||
        (tig->ufpath.size() >= 3000))
      //  Tig too big.
      continue;

    //if ((tig->getLength() < 150) ||
    //    (tig->ufpath.size() < 5))
    //  //  Tig too small.
    //  continue;

    uint32  *lkg    = new uint32 [tig->ufpath.size()];   //  Tig IDs holding our external mates.
    uint32   lkgLen = 0;
    uint32   lkgExt = 0;                                 //  Number of external mates seen.

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode  *frg    = &tig->ufpath[fi];
      int32    frgID  = frg->ident;
      int32    matID  = FI->mateIID(frgID);
      uint32   mtigID = 0;
      Unitig  *mtig   = 0L;

      if (matID == 0)
        //  No mate.
        continue;

      mtigID = tig->fragIn(matID);
      mtig   = unitigs[mtigID];

      if ((mtigID == 0) || (mtig == NULL))
        //  Mate is not placed in any unitig; nothing to vote for.  (The original
        //  dereferenced a NULL mtig below in this case.)
        continue;

      if (mtigID == tig->id())
        //  Mate is not external.
        continue;

      lkgExt++;

      if (mtig->ufpath.size() < 2)
        //  Mate is in singleton.
        continue;

      lkg[lkgLen++] = mtigID;
    }

    if (lkgLen == 0) {
      //  No external mates.
      delete [] lkg;   //  Was leaked here in the original.
      continue;
    }

    //  Sort the target tig IDs, then scan runs of identical IDs; any tig holding
    //  more than three of our external mates is reported as a potential bubble home.

    sort(lkg, lkg+lkgLen);

    uint32  last = lkg[0];
    uint32  lcnt = 1;

    for (uint32 i=1; i<lkgLen; i++) {
      if (last != lkg[i]) {
        if ((lcnt > 3))
          writeLog("popMateBubble()-- tig %d len %d might pop bubble in tig %u (%u mates in there out of %d external mates)\n",
                   tig->id(), tig->getLength(), last, lcnt, lkgExt);
        last = lkg[i];
        lcnt = 0;
      }
      lcnt++;
    }

    //  Report the final run too; the loop above only reports when the ID changes.

    if ((lcnt > 3))
      writeLog("popMateBubble()-- tig %d len %d might pop bubble in tig %u (%u mates in there out of %d external mates)\n",
               tig->id(), tig->getLength(), last, lcnt, lkgExt);

    delete [] lkg;
  }
}
void markRepeatReads(UnitigVector &unitigs, double deviationRepeat, uint32 confusedAbsolute, double confusedPercent) { uint32 tiLimit = unitigs.size(); uint32 numThreads = omp_get_max_threads(); uint32 blockSize = (tiLimit < 100000 * numThreads) ? numThreads : tiLimit / 99999; writeLog("repeatDetect()-- working on "F_U32" unitigs, with "F_U32" threads.\n", tiLimit, numThreads); vector<olapDat> repeatOlaps; // Overlaps to reads promoted to tig coords intervalList<int32> tigMarksR; // Marked repeats based on reads, filtered by spanning reads intervalList<int32> tigMarksU; // Non-repeat invervals, just the inversion of tigMarksR for (uint32 ti=0; ti<tiLimit; ti++) { Unitig *tig = unitigs[ti]; if (tig == NULL) continue; if (tig->ufpath.size() == 1) continue; vector<olapDat> repeats; writeLog("Annotating repeats in reads for tig %u/%u.\n", ti, tiLimit); // Clear out all the existing marks. They're not for this tig. // Analyze overlaps for each read. For each overlap to a read not in this tig, or not // overlapping in this tig, and of acceptable error rate, add the overlap to repeatOlaps. repeatOlaps.clear(); uint32 fiLimit = tig->ufpath.size(); uint32 numThreads = omp_get_max_threads(); uint32 blockSize = (fiLimit < 100 * numThreads) ? numThreads : fiLimit / 99; #pragma omp parallel for if(fiLimit > 100) schedule(dynamic, blockSize) for (uint32 fi=0; fi<fiLimit; fi++) annotateRepeatsOnRead(unitigs, tig, &tig->ufpath[fi], deviationRepeat, repeatOlaps); writeLog("Annotated with %lu overlaps.\n", repeatOlaps.size()); // Merge marks for the same read into the largest possible. 
sort(repeatOlaps.begin(), repeatOlaps.end(), olapDatByEviRid); #ifdef SHOW_ANNOTATE for (uint32 ii=0; ii<repeatOlaps.size(); ii++) if (repeatOlaps[ii].tigbgn < 1000000) writeLog("repeatOlaps[%u] %u-%u from tig %u read %u RAW\n", ii, repeatOlaps[ii].tigbgn, repeatOlaps[ii].tigend, repeatOlaps[ii].eviTid, repeatOlaps[ii].eviRid); flushLog(); #endif for (uint32 dd=0, ss=1; ss<repeatOlaps.size(); ss++) { assert(repeatOlaps[dd].eviRid <= repeatOlaps[ss].eviRid); // If different evidence reads, close the destination olap, set up // for a new destination. if (repeatOlaps[dd].eviRid != repeatOlaps[ss].eviRid) { dd = ss; continue; } // If the destination ends before the source begins, there is no overlap between the // two regions. Close dd, set up for a new dd. if (repeatOlaps[dd].tigend <= repeatOlaps[ss].tigbgn) { dd = ss; continue; } // Otherwise, there must be an overlap. Extend the destination region, erase the source // region. repeatOlaps[dd].tigbgn = min(repeatOlaps[ss].tigbgn, repeatOlaps[dd].tigbgn); repeatOlaps[dd].tigend = max(repeatOlaps[ss].tigend, repeatOlaps[dd].tigend); repeatOlaps[ss].tigbgn = UINT32_MAX; repeatOlaps[ss].tigend = UINT32_MAX; repeatOlaps[ss].eviTid = UINT32_MAX; repeatOlaps[ss].eviRid = UINT32_MAX; } // Sort overlaps again. This pushes all those 'erased' regions to the end of the list, which // we can then just pop off. sort(repeatOlaps.begin(), repeatOlaps.end(), olapDatByEviRid); for (uint32 ii=repeatOlaps.size(); ii--; ) if (repeatOlaps[ii].eviTid == UINT32_MAX) repeatOlaps.pop_back(); // For logging, sort by coordinate sort(repeatOlaps.begin(), repeatOlaps.end()); #ifdef SHOW_ANNOTATE for (uint32 ii=0; ii<repeatOlaps.size(); ii++) writeLog("repeatOlaps[%d] %u-%u from tig %u read %u MERGED\n", ii, repeatOlaps[ii].tigbgn, repeatOlaps[ii].tigend, repeatOlaps[ii].eviTid, repeatOlaps[ii].eviRid); #endif // Make a new set of intervals based on all the detected repeats. 
tigMarksR.clear(); for (uint32 bb=0, ii=0; ii<repeatOlaps.size(); ii++) tigMarksR.add(repeatOlaps[ii].tigbgn, repeatOlaps[ii].tigend - repeatOlaps[ii].tigbgn); // Collapse these markings Collapse all the read markings to intervals on the unitig, merging those that overlap // significantly. writeLog("Merge marks.\n"); tigMarksR.merge(REPEAT_OVERLAP_MIN); // Scan reads, discard any mark that is contained in a read // // We don't need to filterShort() after every one is removed, but it's simpler to do it Right Now than // to track if it is needed. writeLog("Scan reads to discard spanned repeats.\n"); for (uint32 fi=0; fi<tig->ufpath.size(); fi++) { ufNode *frg = &tig->ufpath[fi]; bool frgfwd = (frg->position.bgn < frg->position.end); int32 frglo = (frgfwd) ? frg->position.bgn : frg->position.end; int32 frghi = (frgfwd) ? frg->position.end : frg->position.bgn; bool discarded = false; for (uint32 ri=0; ri<tigMarksR.numberOfIntervals(); ri++) { bool spanLo = false; bool spanHi = false; // The decision of 'spanned by a read' is broken into two pieces: does the read span the // lower (higher) boundary of the region. To be spanned, the boundary needs to be spanned // by at least MIN_ANCHOR_HANG additional bases (to anchor the read to non-repeat // sequence). // // This is a problem at the start/end of the tig, beacuse no read will extend past the // start/end of the tig. Instead, if the repeat is contained within the first (last) read // with no extension at the respective end, it is spanned. 
if ((frglo == 0) && // Read at start of tig, spans off the high end (tigMarksR.hi(ri) + MIN_ANCHOR_HANG <= frghi)) spanLo = spanHi = true; if ((frghi == tig->getLength()) && // Read at end of tig, spans off the low end (frglo + MIN_ANCHOR_HANG <= tigMarksR.lo(ri))) spanLo = spanHi = true; if (frglo + MIN_ANCHOR_HANG <= tigMarksR.lo(ri)) // Read spanned off the low end spanLo = true; if (tigMarksR.hi(ri) + MIN_ANCHOR_HANG <= frghi) // Read spanned off the high end spanHi = true; if (spanLo && spanHi) { writeLog("discard region %8d:%-8d - contained in read %6u %8d-%8d\n", tigMarksR.lo(ri), tigMarksR.hi(ri), frg->ident, frglo, frghi); tigMarksR.lo(ri) = 0; tigMarksR.hi(ri) = 0; discarded = true; } } if (discarded) tigMarksR.filterShort(1); } // Run through again, looking for the thickest overlap(s) to the remaining regions. // This isn't caring about the end effect noted above. #if 1 writeLog("thickest edges to the repeat regions:\n"); for (uint32 ri=0; ri<tigMarksR.numberOfIntervals(); ri++) { uint32 t5 = UINT32_MAX, l5 = 0, t5bgn, t5end; uint32 t3 = UINT32_MAX, l3 = 0, t3bgn, t3end; for (uint32 fi=0; fi<tig->ufpath.size(); fi++) { ufNode *frg = &tig->ufpath[fi]; bool frgfwd = (frg->position.bgn < frg->position.end); int32 frglo = (frgfwd) ? frg->position.bgn : frg->position.end; int32 frghi = (frgfwd) ? frg->position.end : frg->position.bgn; bool discarded = false; // Overlap off the 5' end of the region. if (frglo <= tigMarksR.lo(ri) && (tigMarksR.lo(ri) <= frghi)) { uint32 olap = frghi - tigMarksR.lo(ri); if (l5 < olap) { l5 = olap; t5 = fi; t5bgn = frglo; // Easier than recomputing it later on... t5end = frghi; } } // Overlap off the 3' end of the region. 
if (frglo <= tigMarksR.hi(ri) && (tigMarksR.hi(ri) <= frghi)) { uint32 olap = tigMarksR.hi(ri) - frglo; if (l3 < olap) { l3 = olap; t3 = fi; t3bgn = frglo; t3end = frghi; } } if (frglo <= tigMarksR.lo(ri) && (tigMarksR.hi(ri) <= frghi)) { writeLog("saved region %8d:%-8d - closest read %6u (%+6d) %8d:%-8d (%+6d) (contained)\n", tigMarksR.lo(ri), tigMarksR.hi(ri), frg->ident, tigMarksR.lo(ri) - frglo, frglo, frghi, frghi - tigMarksR.hi(ri)); } } if (t5 != UINT32_MAX) writeLog("saved region %8d:%-8d - closest 5' read %6u (%+6d) %8d:%-8d (%+6d)\n", tigMarksR.lo(ri), tigMarksR.hi(ri), tig->ufpath[t5].ident, tigMarksR.lo(ri) - t5bgn, t5bgn, t5end, t5end - tigMarksR.hi(ri)); if (t3 != UINT32_MAX) writeLog("saved region %8d:%-8d - closest 3' read %6u (%+6d) %8d:%-8d (%+6d)\n", tigMarksR.lo(ri), tigMarksR.hi(ri), tig->ufpath[t3].ident, tigMarksR.lo(ri) - t3bgn, t3bgn, t3end, t3end - tigMarksR.hi(ri)); } #endif // Scan reads. If a read intersects a repeat interval, and the best edge for that read // is entirely in the repeat region, decide if there is a near-best edge to something // not in this tig. // // A region with no such near-best edges is _probably_ correct. writeLog("search for confused edges:\n"); uint32 *isConfused = new uint32 [tigMarksR.numberOfIntervals()]; memset(isConfused, 0, sizeof(uint32) * tigMarksR.numberOfIntervals()); for (uint32 fi=0; fi<tig->ufpath.size(); fi++) { ufNode *rdA = &tig->ufpath[fi]; uint32 rdAid = rdA->ident; bool rdAfwd = (rdA->position.bgn < rdA->position.end); int32 rdAlo = (rdAfwd) ? rdA->position.bgn : rdA->position.end; int32 rdAhi = (rdAfwd) ? 
rdA->position.end : rdA->position.bgn; double sc = (rdAhi - rdAlo) / (double)FI->fragmentLength(rdAid); if ((OG->isContained(rdAid) == true) || (OG->isSuspicious(rdAid) == true)) continue; for (uint32 ri=0; ri<tigMarksR.numberOfIntervals(); ri++) { uint32 rMin = tigMarksR.lo(ri); uint32 rMax = tigMarksR.hi(ri); if ((rdAhi < rMin) || // Read ends before the region (rMax < rdAlo)) // Read starts after the region continue; // -> don't care about this read! // Compute the position (in the tig) of the best overlaps. int32 tig5bgn=0, tig5end=0; int32 tig3bgn=0, tig3end=0; // Instead of using the best edge - which might not be the edge used in the unitig - // we need to scan the layout to return the previous/next dovetail // Put this in a function - what to return if no best overlap? BestEdgeOverlap *b5 = OG->getBestEdgeOverlap(rdAid, false); BestEdgeOverlap *b3 = OG->getBestEdgeOverlap(rdAid, true); // If the best edge is to a read not in this tig, there is nothing to compare against. // Is this confused by default? Possibly. The unitig was constructed somehow, and that // must then be the edge coming into us. We'll pick it up later. bool b5use = true; bool b3use = true; if (b5->fragId() == 0) b5use = false; if (b3->fragId() == 0) b3use = false; if ((b5use) && (Unitig::fragIn(b5->fragId()) != tig->id())) b5use = false; if ((b3use) && (Unitig::fragIn(b3->fragId()) != tig->id())) b3use = false; // The best edge read is in this tig. If they don't overlap, again, nothing to compare // against. if (b5use) { ufNode *rdB = &tig->ufpath[Unitig::pathPosition(b5->fragId())]; uint32 rdBid = rdB->ident; bool rdBfwd = (rdB->position.bgn < rdB->position.end); int32 rdBlo = (rdBfwd) ? rdB->position.bgn : rdB->position.end; int32 rdBhi = (rdBfwd) ? 
rdB->position.end : rdB->position.bgn; if ((rdAhi < rdBlo) || (rdBhi < rdAlo)) b5use = false; } if (b3use) { ufNode *rdB = &tig->ufpath[Unitig::pathPosition(b3->fragId())]; uint32 rdBid = rdB->ident; bool rdBfwd = (rdB->position.bgn < rdB->position.end); int32 rdBlo = (rdBfwd) ? rdB->position.bgn : rdB->position.end; int32 rdBhi = (rdBfwd) ? rdB->position.end : rdB->position.bgn; if ((rdAhi < rdBlo) || (rdBhi < rdAlo)) b3use = false; } // If we can use this edge, compute the placement of the overlap on the unitig. // Call #1; if (b5use) { int32 bgn=0, end=0; olapToReadCoords(rdA, b5->ahang(), b5->bhang(), bgn, end); tig5bgn = (rdAfwd) ? (rdAlo + sc * bgn) : (rdAhi - sc * end); tig5end = (rdAfwd) ? (rdAlo + sc * end) : (rdAhi - sc * bgn); assert(tig5bgn < tig5end); if (tig5bgn < 0) tig5bgn = 0; if (tig5end > tig->getLength()) tig5end = tig->getLength(); } // Call #2 if (b3use) { int32 bgn=0, end=0; olapToReadCoords(rdA, b3->ahang(), b3->bhang(), bgn, end); tig3bgn = (rdAfwd) ? (rdAlo + sc * bgn) : (rdAhi - sc * end); tig3end = (rdAfwd) ? (rdAlo + sc * end) : (rdAhi - sc * bgn); assert(tig3bgn < tig3end); if (tig3bgn < 0) tig3bgn = 0; if (tig3end > tig->getLength()) tig3end = tig->getLength(); } // If either of the 5' or 3' overlaps (or both!) are in the repeat region, we need to check for // close overlaps on that end. uint32 len5 = 0; uint32 len3 = 0; if ((rMin < tig5bgn) && (tig5end < rMax) && (b5use)) len5 = FI->overlapLength(rdAid, b5->fragId(), b5->ahang(), b5->bhang()); else b5use = false; if ((rMin < tig3bgn) && (tig3end < rMax) && (b3use)) len3 = FI->overlapLength(rdAid, b3->fragId(), b3->ahang(), b3->bhang()); else b3use = false; double score5 = len5 * (1 - b5->erate()); double score3 = len3 * (1 - b3->erate()); // Neither of the best edges are in the repeat region; move to the next region and/or read. if (len5 + len3 == 0) continue; // At least one of the best edge overlaps is in the repeat region. 
Scan for other edges // that are of comparable length and quality. uint32 ovlLen = 0; BAToverlap *ovl = OC->getOverlaps(rdAid, AS_MAX_ERATE, ovlLen); for (uint32 oo=0; oo<ovlLen; oo++) { uint32 rdBid = ovl[oo].b_iid; uint32 tgBid = Unitig::fragIn(rdBid); // If the read is in a singleton, skip. These are unassembled crud. if ((tgBid == 0) || (unitigs[tgBid] == NULL) || (unitigs[tgBid]->ufpath.size() == 1)) continue; // If the read is in an annotated bubble, skip. if (unitigs[tgBid]->_isBubble) continue; // Skip if this overlap is the best we're trying to match. if ((rdBid == b5->fragId()) || (rdBid == b3->fragId())) continue; // Skip if this overlap is crappy quality if (OG->isOverlapBadQuality(ovl[oo])) continue; // Skip if the read is contained or suspicious. if ((OG->isContained(rdBid) == true) || (OG->isSuspicious(rdBid) == true)) continue; // Skip if the overlap isn't dovetail. bool ovl5 = ovl[oo].AEndIs5prime(); bool ovl3 = ovl[oo].AEndIs3prime(); if ((ovl5 == false) && (ovl3 == false)) continue; // Skip if we're not using this overlap if ((ovl5 == true) && (b5use == false)) continue; if ((ovl3 == true) && (b3use == false)) continue; uint32 rdBpos = unitigs[tgBid]->pathPosition(rdBid); ufNode *rdB = &unitigs[tgBid]->ufpath[rdBpos]; bool rdBfwd = (rdB->position.bgn < rdB->position.end); int32 rdBlo = (rdBfwd) ? rdB->position.bgn : rdB->position.end; int32 rdBhi = (rdBfwd) ? rdB->position.end : rdB->position.bgn; // If the overlap is to a read in a different tig, or // the overlap is to a read in the same tig, but we don't overlap in the tig, check lengths. // Otherwise, the overlap is present in the tig, and can't be confused. if ((tgBid == tig->id()) && (rdBlo <= rdAhi) && (rdAlo <= rdBhi)) continue; uint32 len = FI->overlapLength(rdAid, ovl[oo].b_iid, ovl[oo].a_hang, ovl[oo].b_hang); double score = len * (1 - ovl[oo].erate); // Compute percent difference. 
double ad5 = fabs(score - score5); double ad3 = fabs(score - score3); double pd5 = 200 * ad5 / (score + score5); double pd3 = 200 * ad3 / (score + score3); // Skip if this overlap is vastly worse than the best. if ((ovl5 == true) && ((ad5 >= confusedAbsolute) || (pd3 > confusedPercent))) { writeLog("tig %7u read %8u pos %7u-%-7u NOT confused by 5' edge to read %8u - best edge read %8u len %6u erate %.4f score %8.2f - alt edge len %6u erate %.4f score %8.2f - absdiff %8.2f percdiff %8.4f\n", tig->id(), rdAid, rdAlo, rdAhi, rdBid, b5->fragId(), len5, b5->erate(), score5, len, ovl[oo].erate, score, ad5, pd5); continue; } if ((ovl3 == true) && ((ad3 >= confusedAbsolute) || (pd3 > confusedPercent))) { writeLog("tig %7u read %8u pos %7u-%-7u NOT confused by 3' edge to read %8u - best edge read %8u len %6u erate %.4f score %8.2f - alt edge len %6u erate %.4f score %8.2f - absdiff %8.2f percdiff %8.4f\n", tig->id(), rdAid, rdAlo, rdAhi, rdBid, b3->fragId(), len3, b3->erate(), score3, len, ovl[oo].erate, score, ad3, pd3); continue; } // Potential confusion! if (ovl5 == true) writeLog("tig %7u read %8u pos %7u-%-7u IS confused by 5' edge to read %8u - best edge read %8u len %6u erate %.4f score %8.2f - alt edge len %6u erate %.4f score %8.2f - absdiff %8.2f percdiff %8.4f\n", tig->id(), rdAid, rdAlo, rdAhi, rdBid, b5->fragId(), len5, b5->erate(), score5, len, ovl[oo].erate, score, ad5, pd5); if (ovl3 == true) writeLog("tig %7u read %8u pos %7u-%-7u IS confused by 3' edge to read %8u - best edge read %8u len %6u erate %.4f score %8.2f - alt edge len %6u erate %.4f score %8.2f - absdiff %8.2f percdiff %8.4f\n", tig->id(), rdAid, rdAlo, rdAhi, rdBid, b3->fragId(), len3, b3->erate(), score3, len, ovl[oo].erate, score, ad3, pd3); isConfused[ri]++; } } // Over all marks (ri) } // Over all reads (fi) // Scan all the regions, and delete any that have no confusion. 
{ bool discarded = false; for (uint32 ri=0; ri<tigMarksR.numberOfIntervals(); ri++) { if (isConfused[ri] == 0) { writeLog("discard region %8d:%-8d - no confusion in best edges\n", tigMarksR.lo(ri), tigMarksR.hi(ri)); tigMarksR.lo(ri) = 0; tigMarksR.hi(ri) = 0; discarded = true; } else { writeLog("saved region %8d:%-8d - %u best edges are potentially confused\n", tigMarksR.lo(ri), tigMarksR.hi(ri), isConfused[ri]); } } if (discarded) tigMarksR.filterShort(1); } delete [] isConfused; // Scan reads, join any marks that have their junctions spanned by a sufficiently large amount. // // If the read spans this junction be the usual amount, merge the intervals. // // The intervals can be overlapping (by up to REPEAT_OVERLAP_MIN (x2?) bases. For this junction // to be spanned, the read must span from min-ROM to max+ROM, not just hi(ri-1) to lo(ri). // // We DO need to filterShort() after every merge, otherwise, we'd have an empty bogus interval // in the middle of our list, which could be preventing some other merge. OK, we could // // Anything that gets merged is now no longer a true repeat. It's unique, just bordered by repeats. // We can't track this through the indices (because we delete things). We track it with a set of // begin coordinates. set<int32> nonRepeatIntervals; writeLog("Scan reads to merge repeat regions.\n"); for (uint32 fi=0; fi<tig->ufpath.size(); fi++) { ufNode *frg = &tig->ufpath[fi]; bool frgfwd = (frg->position.bgn < frg->position.end); int32 frglo = (frgfwd) ? frg->position.bgn : frg->position.end; int32 frghi = (frgfwd) ? 
frg->position.end : frg->position.bgn; bool merged = false; for (uint32 ri=1; ri<tigMarksR.numberOfIntervals(); ri++) { uint32 rMin = min(tigMarksR.hi(ri-1), tigMarksR.lo(ri)); uint32 rMax = max(tigMarksR.hi(ri-1), tigMarksR.lo(ri)); if ((frglo + MIN_ANCHOR_HANG <= rMin) && (rMax + MIN_ANCHOR_HANG <= frghi)) { writeLog("merge regions %8d:%-8d and %8d:%-8d - junction contained in read %6u %5d-%5d\n", tigMarksR.lo(ri-1), tigMarksR.hi(ri-1), tigMarksR.lo(ri), tigMarksR.hi(ri), frg->ident, frglo, frghi); tigMarksR.lo(ri) = tigMarksR.lo(ri-1); tigMarksR.lo(ri-1) = 0; // CRITICAL to delete this interval (and not ri) because the next tigMarksR.hi(ri-1) = 0; // iteration will be using ri-1 (== ri here) and ri (== ri+1). merged = true; nonRepeatIntervals.insert(tigMarksR.lo(ri)); } } if (merged) tigMarksR.filterShort(1); } // Extend the regions by MIN_ANCHOR_HANG. This makes checking for reads that span and are // anchored in the next region easier. It also solved a quirk when the first/last repeat // region doesn't extend to the end of the sequence: // 0-183 unique (created from inversion below, but useless and incorrect) // 183-9942 repeat for (uint32 ii=0; ii<tigMarksR.numberOfIntervals(); ii++) { tigMarksR.lo(ii) = max<int32>(tigMarksR.lo(ii) - MIN_ANCHOR_HANG, 0); tigMarksR.hi(ii) = min<int32>(tigMarksR.hi(ii) + MIN_ANCHOR_HANG, tig->getLength()); } // Find the non-repeat intervals. tigMarksU = tigMarksR; tigMarksU.invert(0, tig->getLength()); // Create the list of intervals we'll use to make new unitigs. // // The repeat intervals are extended by MIN_ANCHOR_HANG, and then any read fully contained in one of // these is moved here. // // The non-repeat intervals are shortened by the same amount, and any read that intersects one // is moved there. // // Does order matter? Not sure. The repeat intervals are first, then the formerly repeat // merged intervals, then the unique intervals. Splitting might depend on the repeats being // first. 
writeLog("Make breakpoints.\n"); vector<breakPointCoords> BP; for (uint32 ii=0; ii<tigMarksR.numberOfIntervals(); ii++) if (nonRepeatIntervals.count(tigMarksR.lo(ii)) == 0) BP.push_back(breakPointCoords(ti, tigMarksR.lo(ii), tigMarksR.hi(ii), true)); for (uint32 ii=0; ii<tigMarksR.numberOfIntervals(); ii++) if (nonRepeatIntervals.count(tigMarksR.lo(ii)) != 0) BP.push_back(breakPointCoords(ti, tigMarksR.lo(ii), tigMarksR.hi(ii), true)); for (uint32 ii=0; ii<tigMarksU.numberOfIntervals(); ii++) { BP.push_back(breakPointCoords(ti, tigMarksU.lo(ii), tigMarksU.hi(ii), false)); } // If only one region, the whole unitig was declared repeat. Nothing to do. if (BP.size() == 1) continue; sort(BP.begin(), BP.end()); // Report. writeLog("break tig %u into up to %u pieces:\n", ti, BP.size()); for (uint32 ii=0; ii<BP.size(); ii++) writeLog(" %8d %8d %s (length %d)\n", BP[ii]._bgn, BP[ii]._end, BP[ii]._isRepeat ? "repeat" : "unique", BP[ii]._end - BP[ii]._bgn); // Scan the reads, counting the number of reads that would be placed in each new tig. This is done // because there are a few 'splits' that don't move any reads around. Unitig **newTigs = new Unitig * [BP.size()]; int32 *lowCoord = new int32 [BP.size()]; uint32 *nRepeat = new uint32 [BP.size()]; uint32 *nUnique = new uint32 [BP.size()]; // First call, count the number of tigs we would create if we let it create them. uint32 nTigs = splitUnitigs(unitigs, tig, BP, newTigs, lowCoord, nRepeat, nUnique, false); // Second call, actually create the tigs, if anything would change. if (nTigs > 1) splitUnitigs(unitigs, tig, BP, newTigs, lowCoord, nRepeat, nUnique, true); // Report the tigs created. for (uint32 ii=0; ii<BP.size(); ii++) { int32 rgnbgn = BP[ii]._bgn; int32 rgnend = BP[ii]._end; bool repeat = BP[ii]._isRepeat; if (nRepeat[ii] + nUnique[ii] == 0) writeLog("For tig %5u %s region %8d %8d - %6u/%6u repeat/unique reads - no new unitig created.\n", ti, (repeat == true) ? 
"repeat" : "unique", rgnbgn, rgnend, nRepeat[ii], nUnique[ii]); else if (nTigs > 1) writeLog("For tig %5u %s region %8d %8d - %6u/%6u reads repeat/unique - unitig %5u created.\n", ti, (repeat == true) ? "repeat" : "unique", rgnbgn, rgnend, nRepeat[ii], nUnique[ii], newTigs[ii]->id()); else writeLog("For tig %5u %s region %8d %8d - %6u/%6u repeat/unique reads - unitig %5u remains unchanged.\n", ti, (repeat == true) ? "repeat" : "unique", rgnbgn, rgnend, nRepeat[ii], nUnique[ii], tig->id()); } // Cleanup. delete [] newTigs; delete [] lowCoord; delete [] nRepeat; delete [] nUnique; // Remove the old unitig....if we made new ones. if (nTigs > 1) { delete tig; unitigs[ti] = NULL; } } }