Beispiel #1
0
//  Examine one read near an end of unitig 'fr' and decide whether the best edge leaving that read
//  justifies joining 'fr' with the unitig the edge points into.
//
//    unitigs     - used to look up the unitig that holds the read at the far side of the best edge
//    fr          - the unitig being examined
//    idx         - offset of the read to examine, counted inward from the chosen end of fr->ufpath
//    frFirstEnd  - true to examine the start of fr->ufpath, false to examine its end
//    joins       - receives one joinEntry if the join is accepted
//
//  Returns true if a joinEntry was appended to 'joins', false otherwise.  Candidates that reach
//  the size comparison are reported through writeLog(), with a trailing "BAD" when rejected there.
//
static
bool
joinUnitigs_examineEnd(UnitigVector      &unitigs,
                       Unitig            *fr,
                       uint32             idx,
                       bool               frFirstEnd,
                       vector<joinEntry> &joins) {

  //  Locate the source read: 'idx' reads in from whichever end we were asked about.

  uint32           frgIdx = idx;

  if (frFirstEnd == false)
    frgIdx = fr->ufpath.size() - 1 - idx;

  ufNode          *frg    = &fr->ufpath[frgIdx];
  bool             frgRev = (frg->position.end < frg->position.bgn);

  //  Pick which end of the source read points out of the unitig.  The last argument of
  //  getBestEdgeOverlap() requests the 3' end when true.
  //
  //    first read, forward  ->  5'        last read, forward  ->  3'
  //    first read, reverse  ->  3'        last read, reverse  ->  5'
  //
  //  which is exactly (frgRev == frFirstEnd).

  bool             want3p   = (frgRev == frFirstEnd);
  BestEdgeOverlap *bestEdge = OG->getBestEdgeOverlap(frg->ident, want3p);

  uint32           tgtId    = bestEdge->fragId();
  bool             tgt3p    = bestEdge->frag3p();

  //  A read id of zero means there is no best edge off this end.

  if (tgtId == 0)
    return(false);

  //  Find the unitig holding the read on the other side of the edge.  Singleton unitigs are not
  //  worth examining further, and an edge back into 'fr' itself is never a join.

  uint32           toID = fr->fragIn(tgtId);
  Unitig          *to   = unitigs[toID];

  if ((to->ufpath.size() == 1) ||
      (to->id() == fr->id()))
    return(false);

  //  Grab the target read and its orientation in the target unitig.

  ufNode          *tgt    = &to->ufpath[to->pathPosition(tgtId)];
  bool             tgtRev = (tgt->position.end < tgt->position.bgn);

  //  Decide if the target unitig must be reverse-complemented before it can be attached.
  //
  //    source read at the start of 'fr':  flip when tgt3p and tgtRev agree
  //    source read at the end of 'fr':    flip when tgt3p and tgtRev disagree

  bool             toFlip = (frFirstEnd == (tgt3p == tgtRev));

  //  Extent of the target read in the (possibly flipped) target unitig.

  uint32           toLen = to->getLength();
  uint32           frLen = fr->getLength();

  uint32           toMin;
  uint32           toMax;

  if (toFlip) {
    toMin = toLen - MAX(tgt->position.bgn, tgt->position.end);
    toMax = toLen - MIN(tgt->position.bgn, tgt->position.end);
  } else {
    toMin = MIN(tgt->position.bgn, tgt->position.end);
    toMax = MAX(tgt->position.bgn, tgt->position.end);
  }

  assert(toMin < toMax);

  //  The two unitigs have lengths frLen and toLen.  A join keeps one side of 'to' (attached to
  //  'fr') and discards the other side.  When examining the first end, the start of 'to' is
  //  prepended to 'fr'; otherwise the end of 'to' is appended.  The estimate errs on the side of
  //  the discarded piece.

  uint32           joinLen = (frFirstEnd) ? (toMin + frLen) : (frLen + toLen - toMax);
  uint32           discLen = (frFirstEnd) ? (toLen - toMin) : (toMax);

  //  Throwing away more than we started with does more damage than good.

  if (discLen > frLen)
    return(false);

  //  Compare the joined length against the larger unitig, and the discarded length against the
  //  smaller unitig.

  uint32           maxLen = MAX(frLen, toLen);
  uint32           minLen = MIN(frLen, toLen);

  double           joinChange = (double)joinLen / maxLen;
  double           discChange = (double)discLen / minLen;

  //  Reject when the sizes barely change (little gained, or much discarded)...

  bool             littleChange = ((joinChange < 1.10) || (0.75 < discChange));

  //  ...unless the discard is small.  This happens when a small unitig is merged into a big one:
  //  the join change is modest (1.05, say) yet most of the smaller unitig is used.

  bool             smallDiscard = ((1.0 < joinChange) && (discChange < 0.5));

  bool             isBad = (littleChange && !smallDiscard);

  if (isBad) {
    writeLog("joinUnitigs_examineEnd()-- join unitig %6u (%7ubp) frag %6u %s <-> unitig %6u (%7ubp) frag %6u %s <-> length %5.2f %7u and %5.2f %7u BAD\n",
             fr->id(), fr->getLength(), frg->ident, (frgRev) ? "rev" : "fwd",
             to->id(), to->getLength(), tgt->ident, (tgtRev) ? "rev" : "fwd",
             joinChange, joinLen,
             discChange, discLen);
    return(false);
  }

  //  Accepted: log it and remember the join.

  writeLog("joinUnitigs_examineEnd()-- join unitig %6u (%7ubp) frag %6u %s <-> unitig %6u (%7ubp) frag %6u %s <-> length %5.2f %7u and %5.2f %7u\n",
           fr->id(), fr->getLength(), frg->ident, (frgRev) ? "rev" : "fwd",
           to->id(), to->getLength(), tgt->ident, (tgtRev) ? "rev" : "fwd",
           joinChange, joinLen,
           discChange, discLen);

  joins.push_back(joinEntry(frg->ident, frFirstEnd, tgt->ident, toFlip, joinLen));

  return(true);
}
Beispiel #2
0
//  Given two fragments that share at least one edge, this will find that edge and construct a new
//  edge to make it mutual.
//
//  For example, if there is a best edge from aFrg 3' to bFrg 5', this will return that edge in a3,
//  and also create the symmetric edge in b5.
//
//    aFrg, bFrg  - the two fragments to examine
//    a5, a3      - output; aFrg's 5' and 3' edges, kept only if they point at bFrg
//    b5, b3      - output; bFrg's 5' and 3' edges, kept only if they point at aFrg
//
//  Returns true if, on exit, there is exactly one edge from aFrg to bFrg and one from bFrg to
//  aFrg (the second one synthesized here if the graph had only one).  Returns false if either
//  fragment is contained (outputs untouched), if no edge connects the two (outputs all reset), or
//  if one fragment has best edges to the other off of both ends (outputs all reset).
//
static
bool
findEdges(ufNode *aFrg, BestEdgeOverlap &a5, BestEdgeOverlap &a3,
                       ufNode *bFrg, BestEdgeOverlap &b5, BestEdgeOverlap &b3) {

  if (OG->isContained(aFrg->ident) ||
      OG->isContained(bFrg->ident))
    return(false);

  //  Grab what edges we have.

  a5 = *OG->getBestEdgeOverlap(aFrg->ident, false);
  a3 = *OG->getBestEdgeOverlap(aFrg->ident, true);
  b5 = *OG->getBestEdgeOverlap(bFrg->ident, false);
  b3 = *OG->getBestEdgeOverlap(bFrg->ident, true);

  //  Erase things that aren't correct -- any edge that doesn't point at the other fragment.

  if (a5.fragId() != bFrg->ident)  a5 = BestEdgeOverlap();
  if (a3.fragId() != bFrg->ident)  a3 = BestEdgeOverlap();
  if (b5.fragId() != aFrg->ident)  b5 = BestEdgeOverlap();
  if (b3.fragId() != aFrg->ident)  b3 = BestEdgeOverlap();

  //  If we have no edges left, there are no edges!

  if ((b5.fragId() != aFrg->ident) && (b3.fragId() != aFrg->ident) &&
      (a5.fragId() != bFrg->ident) && (a3.fragId() != bFrg->ident))
    return(false);

  //  If we found TWO edges for any single fragment....that's madness!  That means the fragment
  //  had best dovetail overlaps to the same other fragment off of both ends.  We'll complain
  //  and return failure.  Ideally, data like this will be cleaned up by OBT, or filtered from
  //  our input.
  //
  //  An erased edge has fragId() == 0, so two erased edges compare equal; that is NOT a double
  //  edge and must neither be logged nor rejected.
  //
  bool  aDouble = ((a5.fragId() != 0) && (a5.fragId() == a3.fragId()));
  bool  bDouble = ((b5.fragId() != 0) && (b5.fragId() == b3.fragId()));

  if (aDouble) {
    //  Report both the 5' and the 3' edge.
    writeLog("findEdges()-- frag %d has multiple edges to frag %d - a5 %d/%d' a3 %d/%d'\n",
            aFrg->ident, a5.fragId(),
            a5.fragId(), a5.frag3p() ? 3 : 5,
            a3.fragId(), a3.frag3p() ? 3 : 5);
  }

  if (bDouble) {
    //  Report both the 5' and the 3' edge.
    writeLog("findEdges()-- frag %d has multiple edges to frag %d - b5 %d/%d' b3 %d/%d'\n",
            bFrg->ident, b5.fragId(),
            b5.fragId(), b5.frag3p() ? 3 : 5,
            b3.fragId(), b3.frag3p() ? 3 : 5);
  }

  if (aDouble || bDouble) {
    a5 = BestEdgeOverlap();
    a3 = BestEdgeOverlap();
    b5 = BestEdgeOverlap();
    b3 = BestEdgeOverlap();
    return(false);
  }

  //  Now, populate the other edges using whatever we have.  Best case is that we have two edges
  //  (because we're done).

  assert(((a5.fragId() == bFrg->ident) +
          (a3.fragId() == bFrg->ident) +
          (b5.fragId() == aFrg->ident) +
          (b3.fragId() == aFrg->ident)) <= 2);

  if (((a5.fragId() == bFrg->ident) || (a3.fragId() == bFrg->ident)) &&
      ((b5.fragId() == aFrg->ident) || (b3.fragId() == aFrg->ident)))
    return(true);

  //  Otherwise, we have exactly one edge, and the other one needs to be created.

  assert(((a5.fragId() == bFrg->ident) +
          (a3.fragId() == bFrg->ident) +
          (b5.fragId() == aFrg->ident) +
          (b3.fragId() == aFrg->ident)) == 1);

  if        (a5.fragId() == bFrg->ident) {
    assert(a3.fragId() == 0);
    assert(b5.fragId() == 0);
    assert(b3.fragId() == 0);

    //  Edge off of A's 5' end ('false' below)...
    //  ...to B's 3' end (so ANTI or NORMAL -- negate the hangs)
    //  ...to B's 5' end (so INNIE or OUTTIE -- swap the hangs)
    if (a5.frag3p())
      b3.set(aFrg->ident, false, -a5.ahang(), -a5.bhang());
    else
      b5.set(aFrg->ident, false, a5.bhang(), a5.ahang());

  } else if (a3.fragId() == bFrg->ident) {
    assert(a5.fragId() == 0);
    assert(b5.fragId() == 0);
    assert(b3.fragId() == 0);

    //  Edge off of A's 3' end ('true' below)...
    //  ...to B's 3' end (so INNIE or OUTTIE -- swap the hangs)
    //  ...to B's 5' end (so ANTI or NORMAL -- negate the hangs)
    if (a3.frag3p())
      b3.set(aFrg->ident, true, a3.bhang(), a3.ahang());
    else
      b5.set(aFrg->ident, true, -a3.ahang(), -a3.bhang());

  } else if (b5.fragId() == aFrg->ident) {
    assert(a5.fragId() == 0);
    assert(a3.fragId() == 0);
    assert(b3.fragId() == 0);

    //  Edge off of B's 5' end ('false' below); mirror of the a5 case above.
    if (b5.frag3p())
      a3.set(bFrg->ident, false, -b5.ahang(), -b5.bhang());
    else
      a5.set(bFrg->ident, false, b5.bhang(), b5.ahang());


  } else if (b3.fragId() == aFrg->ident) {
    assert(a5.fragId() == 0);
    assert(a3.fragId() == 0);
    assert(b5.fragId() == 0);

    //  Edge off of B's 3' end ('true' below); mirror of the a3 case above.
    if (b3.frag3p())
      a3.set(bFrg->ident, true, b3.bhang(), b3.ahang());
    else
      a5.set(bFrg->ident, true, -b3.ahang(), -b3.bhang());

  } else {
    fprintf(stderr, "findEdges()-- Logically impossible!\n");
    assert(0);
  }

  //  And now we should have exactly two edges.

  assert(((a5.fragId() == bFrg->ident) +
          (a3.fragId() == bFrg->ident) +
          (b5.fragId() == aFrg->ident) +
          (b3.fragId() == aFrg->ident)) == 2);

  return(true);
}
void BestOverlapGraph::scoreEdge(const OVSoverlap& olap) {

  if (isOverlapBadQuality(olap))
    return;

  //  Store edges from contained frags to help with unhappy mate
  //  splitting.
  //
  //  From Eli: These are contained, but close either way.  We're
  //  storing the non-containment edges for this fragment, plus a few
  //  containment edges that are "close" to being dovetails.  "I think
  //  there are cases when a change in the alignemtn (consensus) will
  //  change which one is contained and screw up the order, so having
  //  this 10 base fudge factor helps things work out."
  //
  if (isContained(olap.b_iid)) {
    if (((olap.dat.ovl.a_hang >= -10) && (olap.dat.ovl.b_hang <=  0)) ||
        ((olap.dat.ovl.a_hang >=   0) && (olap.dat.ovl.b_hang <= 10))) {
      BestContainment *c = &_bestC[olap.b_iid];
      if (c->olaps == NULL) {
        c->olaps    = new uint32 [c->olapsLen];
        c->olapsLen = 0;
      }
      c->olaps[c->olapsLen++] = olap.a_iid;
    }
    return;
  }

  //  Skip contained fragments.
  if (isContained(olap.a_iid) || isContained(olap.b_iid))
    return;

  //  Skip containment overlaps.  Can this happen?  Yup.  How?
  //  The overlap could be above our allowed error.
  //
  if (((olap.dat.ovl.a_hang >= 0) && (olap.dat.ovl.b_hang <= 0)) ||
      ((olap.dat.ovl.a_hang <= 0) && (olap.dat.ovl.b_hang >= 0)))
    return;

  uint64 newScr = scoreOverlap(olap);

  //  If the score is 0, the overlap doesn't pass the scoring
  //  criteria at all so don't store the overlap whether or not
  //  it's dovetailing or containment.

  if (newScr == 0)
    return;

  //  Dove tailing overlap
  bool             a3p     = AS_OVS_overlapAEndIs3prime(olap);
  BestEdgeOverlap *best    = getBestEdgeOverlap(olap.a_iid, a3p);
  uint64           score   = 0;

  // Store the overlap if:
  //   1.)  The score is better than what is already in the graph
  //   2.)  If the scores are identical, the one with the longer length
  //
  // Since the order of how the overlaps are read in from the overlap
  // store are by A's increasing uint32, by default, if the score and
  // length are the same, the uint32 of the lower value will be kept.

  if (a3p)
    score = _best3score[olap.a_iid];
  else
    score = _best5score[olap.a_iid];

  if (newScr > score) {
    best->set(olap);

    if (a3p)
      _best3score[olap.a_iid] = newScr;
    else
      _best5score[olap.a_iid] = newScr;
  }
}