// Examine the first (few?) fragments of a unitig, evaluate if they indicate a join should be made. static bool joinUnitigs_examineEnd(UnitigVector &unitigs, Unitig *fr, uint32 idx, bool frFirstEnd, vector<joinEntry> &joins) { uint32 frgIdx = (frFirstEnd) ? (idx) : (fr->ufpath.size() - 1 - idx); ufNode *frg = &fr->ufpath[frgIdx]; bool frgRev = (frg->position.end < frg->position.bgn); // Grab the best edge for this end frag. The last arg requests the 3' end if true. // // If we're looking at the first read, we want to get: // 5' - if the frag is forward // 3' - if the frag is reverse (frgRev == true) // // If we're looking at the lat read, we want to get: // 5' - if the frag is reverse // 3' - if the frag is forward (frgRev == false) // BestEdgeOverlap *bestEdge = OG->getBestEdgeOverlap(frg->ident, (frgRev == frFirstEnd)); uint32 tgtId = bestEdge->fragId(); bool tgt3p = bestEdge->frag3p(); if (tgtId == 0) // No best edge? Skip it. return(false); // Grab the unitig for that best edge. uint32 toID = fr->fragIn(tgtId); Unitig *to = unitigs[toID]; if (to->ufpath.size() == 1) // Joining to something teeny? Don't bother checking further. return(false); if (to->id() == fr->id()) // Join to myself? Nope. return(false); // Grab the read we have an edge to, an compute the overlapping length and left over length. ufNode *tgt = &to->ufpath[to->pathPosition(tgtId)]; bool tgtRev = (tgt->position.end < tgt->position.bgn); // If tgt3p (we overlap to the 3' end) is the same as tgtRev (read is reverse) then the unitig is oriented // correctly. Otherwise, positions need to be reverse-complemented. bool toFlip = false; if ((frFirstEnd == true) && (tgt3p == false) && (tgtRev == false)) // source read is at the start, overlap to 5' and the read is forward, need to flip the target unitig toFlip = true; if ((frFirstEnd == true) && (tgt3p == true) && (tgtRev == true)) // source read is at the start, overlap to 3' and the read is reverse, need to flip the target unitig toFlip = true; if ((frFirstEnd == false) && (tgt3p == false) && (tgtRev == true)) // source read is at the end, overlap to 5' and the read is reverse, need to flip the target unitig toFlip = true; if ((frFirstEnd == false) && (tgt3p == true) && (tgtRev == false)) // source read is at the end, overlap to 3' and the read is forward, need to flip the target unitig toFlip = true; uint32 toMin = MIN(tgt->position.bgn, tgt->position.end); uint32 toMax = MAX(tgt->position.bgn, tgt->position.end); uint32 toLen = to->getLength(); uint32 frLen = fr->getLength(); if (toFlip) { toMin = toLen - MAX(tgt->position.bgn, tgt->position.end); toMax = toLen - MIN(tgt->position.bgn, tgt->position.end); } assert(toMin < toMax); // Our two unitigs are of length frLen and toLen. We are appending some portion of 'to' onto // 'fr', and 'discarding' the rest. If the 'discarded' piece is larger than the 'fr' unitig, we // don't want to do the join. // // We err on the side of the discarded piece. uint32 joinLen = 0; uint32 discLen = 0; if (frFirstEnd == true) { joinLen = toMin + frLen; // Prepend the start of 'to' onto 'fr'. discLen = toLen - toMin; } else { joinLen = frLen + toLen - toMax; // Append the end of 'to' onto 'fr'. discLen = toMax; } // If the discard is bigger than us, we do damage by joining. if (discLen > frLen) return(false); // The joined should be much larger and the discarded much smaller. uint32 maxLen = MAX(frLen, toLen); uint32 minLen = MIN(frLen, toLen); double joinChange = (double)joinLen / maxLen; double discChange = (double)discLen / minLen; bool isBad = false; if ((joinChange < 1.10) || (0.75 < discChange)) // Bad if we didn't really change sizes. isBad = true; if ((1.0 < joinChange) && (discChange < 0.5)) // But good if discard is tiny. This occurs if we merge a small with a big. The join change // is somewhat small (1.05 say) yet most of the smaller unitig is used. isBad = false; if (isBad) { writeLog("joinUnitigs_examineEnd()-- join unitig %6u (%7ubp) frag %6u %s <-> unitig %6u (%7ubp) frag %6u %s <-> length %5.2f %7u and %5.2f %7u BAD\n", fr->id(), fr->getLength(), frg->ident, (frgRev) ? "rev" : "fwd", to->id(), to->getLength(), tgt->ident, (tgtRev) ? "rev" : "fwd", joinChange, joinLen, discChange, discLen); return(false); } // OK, join. writeLog("joinUnitigs_examineEnd()-- join unitig %6u (%7ubp) frag %6u %s <-> unitig %6u (%7ubp) frag %6u %s <-> length %5.2f %7u and %5.2f %7u\n", fr->id(), fr->getLength(), frg->ident, (frgRev) ? "rev" : "fwd", to->id(), to->getLength(), tgt->ident, (tgtRev) ? "rev" : "fwd", joinChange, joinLen, discChange, discLen); joins.push_back(joinEntry(frg->ident, frFirstEnd, tgt->ident, toFlip, joinLen)); return(true); }
intersectionList::intersectionList(UnitigVector &unitigs) { for (uint32 ti=0; ti<unitigs.size(); ti++) { Unitig *tig = unitigs[ti]; if (tig == NULL) continue; intersectionEvidence *evidence = new intersectionEvidence [tig->ufpath.size()]; for (uint32 fi=0; fi<tig->ufpath.size(); fi++) { ufNode *frg = &tig->ufpath[fi]; if (OG->isContained(frg->ident)) continue; // For my best overlap, the ID of the unitig that the overlapping fragment is in. evidence[fi].edge5 = *OG->getBestEdgeOverlap(frg->ident, false); evidence[fi].edge3 = *OG->getBestEdgeOverlap(frg->ident, true); evidence[fi].frag5tig = tig->fragIn(evidence[fi].edge5.fragId()); evidence[fi].frag3tig = tig->fragIn(evidence[fi].edge3.fragId()); // Do NOT initialize these! An earlier fragment could have already confirmed an end. // Properly, only the 5' end of a forward fragment (or 3' end of a reverse fragment) can be // confirmed already (otherwise the tig is nonsense), but we don't yet check that. // //evidence[fi].frag5confirmed = false; //evidence[fi].frag3confirmed = false; // But, because the path could be promiscuous, not every overlap to a different tig is bad. // // If my best overlap is to a different tig, but there is an overlapping fragment (in the // unitig placement) with a best edge to me, I'm still good. The BOG build this unitig using // the edge from the other fragment to me. // // If the fragments do not overlap in the layout (yet the best edge still exists) that is a // self-intersection. // // The two blocks are identical, except for 'edge3' and 'edge5'. if (evidence[fi].frag5tig == tig->id()) { uint32 ti = tig->pathPosition(evidence[fi].edge5.fragId()); ufNode *trg = &tig->ufpath[ti]; uint32 minf = (frg->position.bgn < frg->position.end) ? frg->position.bgn : frg->position.end; uint32 maxf = (frg->position.bgn < frg->position.end) ? frg->position.end : frg->position.bgn; uint32 mint = (trg->position.bgn < trg->position.end) ? trg->position.bgn : trg->position.end; uint32 maxt = (trg->position.bgn < trg->position.end) ? trg->position.end : trg->position.bgn; // If they overlap, mark as confirmed, else remember an intersection. if (((minf < mint) && (mint < maxf)) || // t begins inside f ((mint < minf) && (minf < maxt))) { // f begins inside t if (evidence[fi].edge5.frag3p()) evidence[ti].frag3confirmed = true; else evidence[ti].frag5confirmed = true; } else { evidence[fi].frag5self = true; // Not the correct place to report this. Some of these get confirmed by later fragments. //writeLog("BUG1 F: %d,%d T %d,%d\n", minf, maxf, mint, maxt); //writeLog("INTERSECT from unitig %d frag %d end %d TO unitig %d frag %d end %d (SELF)\n", // tig->id(), frg->ident, 5, evidence[fi].frag5tig, evidence[fi].edge5.fragId(), evidence[fi].edge5.frag3p() ? 3 : 5); } } if (evidence[fi].frag3tig == tig->id()) { uint32 ti = tig->pathPosition(evidence[fi].edge3.fragId()); ufNode *trg = &tig->ufpath[ti]; uint32 minf = (frg->position.bgn < frg->position.end) ? frg->position.bgn : frg->position.end; uint32 maxf = (frg->position.bgn < frg->position.end) ? frg->position.end : frg->position.bgn; uint32 mint = (trg->position.bgn < trg->position.end) ? trg->position.bgn : trg->position.end; uint32 maxt = (trg->position.bgn < trg->position.end) ? trg->position.end : trg->position.bgn; if (((minf < mint) && (mint < maxf)) || // t begins inside f ((mint < minf) && (minf < maxt))) { // f begins inside t if (evidence[fi].edge3.frag3p()) evidence[ti].frag3confirmed = true; else evidence[ti].frag5confirmed = true; } else { evidence[fi].frag3self = true; // Not the correct place to report this. Some of these get confirmed by later fragments. //writeLog("BUG2 F: %d,%d T %d,%d\n", minf, maxf, mint, maxt); //writeLog("INTERSECT from unitig %d frag %d end %d TO unitig %d frag %d end %d (SELF)\n", // tig->id(), frg->ident, 3, evidence[fi].frag3tig, evidence[fi].edge3.fragId(), evidence[fi].edge3.frag3p() ? 3 : 5); } } } // // Build the list. // for (uint32 fi=0; fi<tig->ufpath.size(); fi++) { ufNode *frg = &tig->ufpath[fi]; if ((evidence[fi].frag5tig != 0) && (evidence[fi].frag5tig != tig->id()) && (evidence[fi].frag5confirmed == false)) isects.push_back(intersectionPoint(evidence[fi].edge5, frg->ident, false, false)); if ((evidence[fi].frag5tig == tig->id()) && (evidence[fi].frag5self == true) && (evidence[fi].frag5confirmed == false)) isects.push_back(intersectionPoint(evidence[fi].edge5, frg->ident, false, true)); if ((evidence[fi].frag3tig != 0) && (evidence[fi].frag3tig != tig->id()) && (evidence[fi].frag3confirmed == false)) isects.push_back(intersectionPoint(evidence[fi].edge3, frg->ident, true, false)); if ((evidence[fi].frag3tig == tig->id()) && (evidence[fi].frag3self == true) && (evidence[fi].frag3confirmed == false)) isects.push_back(intersectionPoint(evidence[fi].edge3, frg->ident, true, true)); } delete [] evidence; } // Sort the intersections by the ID of the intersected fragment, then build an index into the array. std::sort(isects.begin(), isects.end()); // Terminate the intersection list with a sentinal intersection. This is CRITICAL // to the way we iterate over intersections. isects.push_back(intersectionPoint(BestEdgeOverlap(), 0, true, true)); // Build a map from fragment id to the first intersection in the list. for (uint32 i=0; i<isects.size(); i++) { isectsNum[isects[i].isectFrg]++; if (isectsMap.find(isects[i].isectFrg) == isectsMap.end()) isectsMap[isects[i].isectFrg] = i; } }
void UnitigGraph::setParentAndHang(void) { for (uint32 ti=0; ti<unitigs.size(); ti++) { Unitig *utg = unitigs[ti]; if (utg == NULL) continue; if (utg->ufpath.size() == 0) continue; // Reset parent and hangs for everything. for (uint32 fi=1; fi<utg->ufpath.size(); fi++) { ufNode *frg = &utg->ufpath[fi]; frg->parent = 0; frg->ahang = 0; frg->bhang = 0; } // For each fragment, set parent/hangs using the edges. for (uint32 fi=0; fi<utg->ufpath.size(); fi++) { ufNode *frg = &utg->ufpath[fi]; // If we're contained, gee, I sure hope the container is here! BestContainment *bestcont = OG->getBestContainer(frg->ident); if ((bestcont) && (utg->fragIn(bestcont->container) == utg->id())) { int32 pi = utg->pathPosition(bestcont->container); ufNode *par = &utg->ufpath[pi]; frg->parent = bestcont->container; // The hangs assume the container is forward; adjust if not so. if (par->position.bgn < par->position.end) { frg->ahang = bestcont->a_hang; frg->bhang = bestcont->b_hang; } else { frg->ahang = -bestcont->b_hang; frg->bhang = -bestcont->a_hang; } continue; } // Nope, not contained. If we don't have a parent set, see if one of our best overlaps // can set it. BestEdgeOverlap *bestedge5 = OG->getBestEdgeOverlap(frg->ident, false); BestEdgeOverlap *bestedge3 = OG->getBestEdgeOverlap(frg->ident, true); if ((bestedge5->fragId()) && (utg->fragIn(bestedge5->fragId()) == utg->id())) { int32 pi5 = utg->pathPosition(bestedge5->fragId()); ufNode *oth = &utg->ufpath[pi5]; // Consensus is expected parent/hangs to be relative to the parent fragment. This is used // ONLY to place the fragment, not to orient the fragment. Orientation comes from the // absolute positioning coordinates. // // Interestingly, all four overlap transformations are used here. // // The inner if tests (on fragment orientation) should be asserts, but due to imprecise // layouts, they are sometimes violated: // A fragment from 271-547 had a 5'overlap to something after it; // the frag after was at 543-272, close enough to a tie to screw up placements // if (pi5 < fi) { // We have an edge off our 5' end to something before us --> fragment MUST be forward. // Flip the overlap so it is relative to the other fragment. if (frg->position.bgn < frg->position.end) { frg->parent = bestedge5->fragId(); frg->ahang = -bestedge5->ahang(); frg->bhang = -bestedge5->bhang(); assert(frg->ahang >= 0); } } else { // We have an edge off our 5' end to something after us --> fragment MUST be reverse. // Because our fragment is now reverse, we must reverse the overlap too. if (frg->position.end < frg->position.bgn) { oth->parent = frg->ident; oth->ahang = -bestedge5->bhang(); oth->bhang = -bestedge5->ahang(); assert(oth->ahang >= 0); } } } if ((bestedge3->fragId()) && (utg->fragIn(bestedge3->fragId()) == utg->id())) { int32 pi3 = utg->pathPosition(bestedge3->fragId()); ufNode *oth = &utg->ufpath[pi3]; if (pi3 < fi) { // We have an edge off our 3' end to something before us --> fragment MUST be reverse. // Flip the overlap so it is relative to the other fragment. // Because our fragment is now reverse, we must reverse the overlap too. if (frg->position.end < frg->position.bgn) { frg->parent = bestedge3->fragId(); frg->ahang = bestedge3->bhang(); frg->bhang = bestedge3->ahang(); assert(frg->ahang >= 0); } } else { // We have an edge off our 3' end to something after us --> fragment MUST be forward. // This is the simplest case, the overlap is already correct. if (frg->position.bgn < frg->position.end) { oth->parent = frg->ident; oth->ahang = bestedge3->ahang(); oth->bhang = bestedge3->bhang(); assert(oth->ahang >= 0); } } } } } }