// Examine the first (few?) fragments of a unitig, evaluate if they indicate a join should be made. static bool joinUnitigs_examineEnd(UnitigVector &unitigs, Unitig *fr, uint32 idx, bool frFirstEnd, vector<joinEntry> &joins) { uint32 frgIdx = (frFirstEnd) ? (idx) : (fr->ufpath.size() - 1 - idx); ufNode *frg = &fr->ufpath[frgIdx]; bool frgRev = (frg->position.end < frg->position.bgn); // Grab the best edge for this end frag. The last arg requests the 3' end if true. // // If we're looking at the first read, we want to get: // 5' - if the frag is forward // 3' - if the frag is reverse (frgRev == true) // // If we're looking at the lat read, we want to get: // 5' - if the frag is reverse // 3' - if the frag is forward (frgRev == false) // BestEdgeOverlap *bestEdge = OG->getBestEdgeOverlap(frg->ident, (frgRev == frFirstEnd)); uint32 tgtId = bestEdge->fragId(); bool tgt3p = bestEdge->frag3p(); if (tgtId == 0) // No best edge? Skip it. return(false); // Grab the unitig for that best edge. uint32 toID = fr->fragIn(tgtId); Unitig *to = unitigs[toID]; if (to->ufpath.size() == 1) // Joining to something teeny? Don't bother checking further. return(false); if (to->id() == fr->id()) // Join to myself? Nope. return(false); // Grab the read we have an edge to, an compute the overlapping length and left over length. ufNode *tgt = &to->ufpath[to->pathPosition(tgtId)]; bool tgtRev = (tgt->position.end < tgt->position.bgn); // If tgt3p (we overlap to the 3' end) is the same as tgtRev (read is reverse) then the unitig is oriented // correctly. Otherwise, positions need to be reverse-complemented. bool toFlip = false; if ((frFirstEnd == true) && (tgt3p == false) && (tgtRev == false)) // source read is at the start, overlap to 5' and the read is forward, need to flip the target unitig toFlip = true; if ((frFirstEnd == true) && (tgt3p == true) && (tgtRev == true)) // source read is at the start, overlap to 3' and the read is reverse, need to flip the target unitig toFlip = true; if ((frFirstEnd == false) && (tgt3p == false) && (tgtRev == true)) // source read is at the end, overlap to 5' and the read is reverse, need to flip the target unitig toFlip = true; if ((frFirstEnd == false) && (tgt3p == true) && (tgtRev == false)) // source read is at the end, overlap to 3' and the read is forward, need to flip the target unitig toFlip = true; uint32 toMin = MIN(tgt->position.bgn, tgt->position.end); uint32 toMax = MAX(tgt->position.bgn, tgt->position.end); uint32 toLen = to->getLength(); uint32 frLen = fr->getLength(); if (toFlip) { toMin = toLen - MAX(tgt->position.bgn, tgt->position.end); toMax = toLen - MIN(tgt->position.bgn, tgt->position.end); } assert(toMin < toMax); // Our two unitigs are of length frLen and toLen. We are appending some portion of 'to' onto // 'fr', and 'discarding' the rest. If the 'discarded' piece is larger than the 'fr' unitig, we // don't want to do the join. // // We err on the side of the discarded piece. uint32 joinLen = 0; uint32 discLen = 0; if (frFirstEnd == true) { joinLen = toMin + frLen; // Prepend the start of 'to' onto 'fr'. discLen = toLen - toMin; } else { joinLen = frLen + toLen - toMax; // Append the end of 'to' onto 'fr'. discLen = toMax; } // If the discard is bigger than us, we do damage by joining. if (discLen > frLen) return(false); // The joined should be much larger and the discarded much smaller. uint32 maxLen = MAX(frLen, toLen); uint32 minLen = MIN(frLen, toLen); double joinChange = (double)joinLen / maxLen; double discChange = (double)discLen / minLen; bool isBad = false; if ((joinChange < 1.10) || (0.75 < discChange)) // Bad if we didn't really change sizes. isBad = true; if ((1.0 < joinChange) && (discChange < 0.5)) // But good if discard is tiny. This occurs if we merge a small with a big. The join change // is somewhat small (1.05 say) yet most of the smaller unitig is used. isBad = false; if (isBad) { writeLog("joinUnitigs_examineEnd()-- join unitig %6u (%7ubp) frag %6u %s <-> unitig %6u (%7ubp) frag %6u %s <-> length %5.2f %7u and %5.2f %7u BAD\n", fr->id(), fr->getLength(), frg->ident, (frgRev) ? "rev" : "fwd", to->id(), to->getLength(), tgt->ident, (tgtRev) ? "rev" : "fwd", joinChange, joinLen, discChange, discLen); return(false); } // OK, join. writeLog("joinUnitigs_examineEnd()-- join unitig %6u (%7ubp) frag %6u %s <-> unitig %6u (%7ubp) frag %6u %s <-> length %5.2f %7u and %5.2f %7u\n", fr->id(), fr->getLength(), frg->ident, (frgRev) ? "rev" : "fwd", to->id(), to->getLength(), tgt->ident, (tgtRev) ? "rev" : "fwd", joinChange, joinLen, discChange, discLen); joins.push_back(joinEntry(frg->ident, frFirstEnd, tgt->ident, toFlip, joinLen)); return(true); }
// Given two fragments that share at least one edge, this will find that edge and construct a new // edge to make it mutual. // // For example, if there is a best edge from aFrg 3' to bFrg 5', this will return that edge in a3, // and also create the symmetric edge in b5. // static bool findEdges(ufNode *aFrg, BestEdgeOverlap &a5, BestEdgeOverlap &a3, ufNode *bFrg, BestEdgeOverlap &b5, BestEdgeOverlap &b3) { if (OG->isContained(aFrg->ident) || OG->isContained(bFrg->ident)) return(false); // Grab what edges we have. a5 = *OG->getBestEdgeOverlap(aFrg->ident, false); a3 = *OG->getBestEdgeOverlap(aFrg->ident, true); b5 = *OG->getBestEdgeOverlap(bFrg->ident, false); b3 = *OG->getBestEdgeOverlap(bFrg->ident, true); // Erase things that aren't correct if (a5.fragId() != bFrg->ident) a5 = BestEdgeOverlap(); if (a3.fragId() != bFrg->ident) a3 = BestEdgeOverlap(); if (b5.fragId() != aFrg->ident) b5 = BestEdgeOverlap(); if (b3.fragId() != aFrg->ident) b3 = BestEdgeOverlap(); // If we have no edges left, there are no edges! if ((b5.fragId() != aFrg->ident) && (b3.fragId() != aFrg->ident) && (a5.fragId() != bFrg->ident) && (a3.fragId() != bFrg->ident)) return(false); // If we found TWO edges for any single fragment....that's madness! That means the fragment // had best dovetail overlaps to the same other fragment off of both ends. We'll complain // and return failure. Ideally, data like this will be cleaned up by OBT, or filtered from // our input. // if (a5.fragId() == a3.fragId()) { writeLog("findEdges()-- frag %d has multiple edges to frag %d - a5 %d/%d' a3 %d/%d'\n", aFrg->ident, a5.fragId(), a5.fragId(), a5.frag3p() ? 3 : 5, a5.fragId(), a5.frag3p() ? 3 : 5); } if (b5.fragId() == b3.fragId()) { writeLog("findEdges()-- frag %d has multiple edges to frag %d - b5 %d/%d' b3 %d/%d'\n", bFrg->ident, b5.fragId(), b5.fragId(), b5.frag3p() ? 3 : 5, b5.fragId(), b5.frag3p() ? 3 : 5); } if (((a5.fragId() != 0) && (a5.fragId() == a3.fragId())) || ((b5.fragId() != 0) && (b5.fragId() == b3.fragId()))) { a5 = BestEdgeOverlap(); a3 = BestEdgeOverlap(); b5 = BestEdgeOverlap(); b3 = BestEdgeOverlap(); return(false); } // Now, populate the other edges using whatever we have. Best case is that we have two edges // (because we're done). assert(((a5.fragId() == bFrg->ident) + (a3.fragId() == bFrg->ident) + (b5.fragId() == aFrg->ident) + (b3.fragId() == aFrg->ident)) <= 2); if (((a5.fragId() == bFrg->ident) || (a3.fragId() == bFrg->ident)) && ((b5.fragId() == aFrg->ident) || (b3.fragId() == aFrg->ident))) return(true); // Otherwise, we have exactly one edge, and the other one needs to be created. assert(((a5.fragId() == bFrg->ident) + (a3.fragId() == bFrg->ident) + (b5.fragId() == aFrg->ident) + (b3.fragId() == aFrg->ident)) == 1); if (a5.fragId() == bFrg->ident) { //assert(a5.fragId() == 0); assert(a3.fragId() == 0); assert(b5.fragId() == 0); assert(b3.fragId() == 0); // Edge off of A's 5' end ('false' below)... // ...to B's 3' end (so ANTI or NORMAL -- negate the hangs) // ...to B's 5' end (so INNIE or OUTTIE -- swap the hangs) if (a5.frag3p()) b3.set(aFrg->ident, false, -a5.ahang(), -a5.bhang()); else b5.set(aFrg->ident, false, a5.bhang(), a5.ahang()); } else if (a3.fragId() == bFrg->ident) { assert(a5.fragId() == 0); //assert(a3.fragId() == 0); assert(b5.fragId() == 0); assert(b3.fragId() == 0); // Edge off of A's 3' end ('true' below)... // ...to B's 3' end (so INNIE or OUTTIE -- swap the hangs) // ...to B's 5' end (so ANTI or NORMAL -- negate the hangs) if (a3.frag3p()) b3.set(aFrg->ident, true, a3.bhang(), a3.ahang()); else b5.set(aFrg->ident, true, -a3.ahang(), -a3.bhang()); } else if (b5.fragId() == aFrg->ident) { assert(a5.fragId() == 0); assert(a3.fragId() == 0); //assert(b5.fragId() == 0); assert(b3.fragId() == 0); if (b5.frag3p()) a3.set(bFrg->ident, false, -b5.ahang(), -b5.bhang()); else a5.set(bFrg->ident, false, b5.bhang(), b5.ahang()); } else if (b3.fragId() == aFrg->ident) { assert(a5.fragId() == 0); assert(a3.fragId() == 0); assert(b5.fragId() == 0); //assert(b3.fragId() == 0); if (b3.frag3p()) a3.set(bFrg->ident, true, b3.bhang(), b3.ahang()); else a5.set(bFrg->ident, true, -b3.ahang(), -b3.bhang()); } else { fprintf(stderr, "findEdges()-- Logically impossible!\n"); assert(0); } // And now we should have exactly two edges. assert(((a5.fragId() == bFrg->ident) + (a3.fragId() == bFrg->ident) + (b5.fragId() == aFrg->ident) + (b3.fragId() == aFrg->ident)) == 2); return(true); }
void BestOverlapGraph::scoreEdge(const OVSoverlap& olap) { if (isOverlapBadQuality(olap)) return; // Store edges from contained frags to help with unhappy mate // splitting. // // From Eli: These are contained, but close either way. We're // storing the non-containment edges for this fragment, plus a few // containment edges that are "close" to being dovetails. "I think // there are cases when a change in the alignemtn (consensus) will // change which one is contained and screw up the order, so having // this 10 base fudge factor helps things work out." // if (isContained(olap.b_iid)) { if (((olap.dat.ovl.a_hang >= -10) && (olap.dat.ovl.b_hang <= 0)) || ((olap.dat.ovl.a_hang >= 0) && (olap.dat.ovl.b_hang <= 10))) { BestContainment *c = &_bestC[olap.b_iid]; if (c->olaps == NULL) { c->olaps = new uint32 [c->olapsLen]; c->olapsLen = 0; } c->olaps[c->olapsLen++] = olap.a_iid; } return; } // Skip contained fragments. if (isContained(olap.a_iid) || isContained(olap.b_iid)) return; // Skip containment overlaps. Can this happen? Yup. How? // The overlap could be above our allowed error. // if (((olap.dat.ovl.a_hang >= 0) && (olap.dat.ovl.b_hang <= 0)) || ((olap.dat.ovl.a_hang <= 0) && (olap.dat.ovl.b_hang >= 0))) return; uint64 newScr = scoreOverlap(olap); // If the score is 0, the overlap doesn't pass the scoring // criteria at all so don't store the overlap whether or not // it's dovetailing or containment. if (newScr == 0) return; // Dove tailing overlap bool a3p = AS_OVS_overlapAEndIs3prime(olap); BestEdgeOverlap *best = getBestEdgeOverlap(olap.a_iid, a3p); uint64 score = 0; // Store the overlap if: // 1.) The score is better than what is already in the graph // 2.) If the scores are identical, the one with the longer length // // Since the order of how the overlaps are read in from the overlap // store are by A's increasing uint32, by default, if the score and // length are the same, the uint32 of the lower value will be kept. if (a3p) score = _best3score[olap.a_iid]; else score = _best5score[olap.a_iid]; if (newScr > score) { best->set(olap); if (a3p) _best3score[olap.a_iid] = newScr; else _best5score[olap.a_iid] = newScr; } }