示例#1
0
void
placeContainsUsingBestOverlaps(UnitigVector &unitigs) {
  uint32   fragsPlaced  = 1;
  uint32   fragsPending = 0;

  logFileFlags &= ~LOG_PLACE_FRAG;

  while (fragsPlaced > 0) {
    fragsPlaced  = 0;
    fragsPending = 0;

    writeLog("==> PLACING CONTAINED FRAGMENTS\n");

    for (uint32 fid=1; fid<FI->numFragments()+1; fid++) {
      BestContainment *bestcont = OG->getBestContainer(fid);
      Unitig          *utg;

      if (bestcont->isContained == false)
        //  Not a contained fragment.
        continue;

      if (Unitig::fragIn(fid) != 0)
        //  Containee already placed.
        continue;

      if (Unitig::fragIn(bestcont->container) == 0) {
        //  Container not placed (yet).
        fragsPending++;
        continue;
      }

      utg = unitigs[Unitig::fragIn(bestcont->container)];
      utg->addContainedFrag(fid, bestcont, logFileFlagSet(LOG_INITIAL_CONTAINED_PLACEMENT));

      if (utg->id() != Unitig::fragIn(fid))
        writeLog("placeContainsUsingBestOverlaps()-- FAILED to add frag %d to unitig %d.\n", fid, bestcont->container);
      assert(utg->id() == Unitig::fragIn(fid));


      fragsPlaced++;
    }

    writeLog("==> PLACING CONTAINED FRAGMENTS - placed %d fragments; still need to place %d\n",
            fragsPlaced, fragsPending);

    if ((fragsPlaced == 0) && (fragsPending > 0)) {
      writeLog("Stopping contained fragment placement due to zombies.\n");
      fragsPlaced  = 0;
      fragsPending = 0;
    }
  }

  for (uint32 ti=1; ti<unitigs.size(); ti++) {
    Unitig *utg = unitigs[ti];

    if (utg)
      utg->sort();
  }
}
示例#2
0
ChunkGraph::ChunkGraph(const char *output_prefix) {

  setLogFile(output_prefix, "ChunkGraph");

  _maxFragment = FI->numFragments();
  _restrict    = NULL;

  _pathLen         = new uint32      [_maxFragment * 2 + 2];
  _chunkLength     = new ChunkLength [_maxFragment];
  _chunkLengthIter = 0;

  memset(_pathLen,     0, sizeof(uint32)      * (_maxFragment * 2 + 2));
  memset(_chunkLength, 0, sizeof(ChunkLength) * (_maxFragment));

  for (uint32 fid=1; fid <= _maxFragment; fid++) {
    if (OG->isContained(fid)) {
      if (logFileFlagSet(LOG_CHUNK_GRAPH))
        writeLog("read %u contained\n", fid);
      continue;
    }

    if (OG->isSuspicious(fid)) {
      if (logFileFlagSet(LOG_CHUNK_GRAPH))
        writeLog("read %u suspicious\n", fid);
      continue;
    }

    uint32  l5 = countFullWidth(FragmentEnd(fid, false));
    uint32  l3 = countFullWidth(FragmentEnd(fid, true));

    _chunkLength[fid-1].fragId = fid;
    _chunkLength[fid-1].cnt    = l5 + l3;
  }

  delete [] _pathLen;
  _pathLen = NULL;

  std::sort(_chunkLength, _chunkLength + _maxFragment);
}
示例#3
0
static
void
makeNewUnitig(UnitigVector &unitigs,
              uint32        splitFragsLen,
              ufNode       *splitFrags) {
  Unitig *dangler = unitigs.newUnitig(false);

  if (logFileFlagSet(LOG_MATE_SPLIT_DISCONTINUOUS))
    writeLog("splitDiscontinuous()--   new tig "F_U32" with "F_U32" fragments (starting at frag "F_U32").\n",
            dangler->id(), splitFragsLen, splitFrags[0].ident);

  int splitOffset = -MIN(splitFrags[0].position.bgn, splitFrags[0].position.end);

  //  This should already be true, but we force it still
  splitFrags[0].contained = 0;

  for (uint32 i=0; i<splitFragsLen; i++)
    dangler->addFrag(splitFrags[i], splitOffset, false);  //logFileFlagSet(LOG_MATE_SPLIT_DISCONTINUOUS));
}
示例#4
0
void
reportUnitigs(UnitigVector &unitigs, const char *prefix, const char *name) {

  if (logFileFlagSet(LOG_INTERMEDIATE_UNITIGS) == 0)
    return;

  uint32  numFragsT  = 0;
  uint32  numFragsP  = 0;
  uint64  utgLen     = 0;

  //  Compute average frags per partition.
  for (uint32  ti=0; ti<unitigs.size(); ti++) {
    Unitig  *utg = unitigs[ti];

    if (utg == NULL)
      continue;

    numFragsT += utg->ufpath.size();

    if (utg->ufpath.size() > 2)
      utgLen    += utg->getLength();
  }

  if      (utgLen < 16 * 1024 * 1024)
    numFragsP = numFragsT / 7;
  else if (utgLen < 64 * 1024 * 1024)
    numFragsP = numFragsT / 63;
  else
    numFragsP = numFragsT / 127;

  char tigStorePath[FILENAME_MAX];
  sprintf(tigStorePath, "%s.%03u.%s.tigStore", prefix, logFileOrder, name);

  //  Failing to do this results in consensus running about 40 times slower.  Three hours instead of
  //  five minutes.
  setParentAndHang(unitigs);

  writeUnitigsToStore(unitigs, tigStorePath, tigStorePath, numFragsP, false);
}
示例#5
0
//  Closes the current logFile, opens a new one called 'prefix.logFileOrder.label'.  If 'label' is
//  NULL, the logFile is reset to stderr.
void
setLogFile(char const *prefix, char const *label) {

  assert(prefix != NULL);

  //  Allocate space.

  if (logFileThread == NULL)
    logFileThread = new logFileInstance [omp_get_max_threads()];

  //  If writing to stderr, that's all we needed to do.

  if (logFileFlagSet(LOG_STDERR))
    return;

  //  Close out the old.

  logFileMain.close();

  for (int32 tn=0; tn<omp_get_max_threads(); tn++)
    logFileThread[tn].close();

  //  Move to the next iteration.

  logFileOrder++;

  //  Set up for that iteration.

  logFileMain.set(prefix, logFileOrder, label, 0);

  for (int32 tn=0; tn<omp_get_max_threads(); tn++)
    logFileThread[tn].set(prefix, logFileOrder, label, tn+1);

  //  File open is delayed until it is used.

  if (label != NULL)
    fprintf(stderr,  "setLogFile()-- Now logging to '%s.%03d.%s'\n", prefix, logFileOrder, label);
}
示例#6
0
void
breakUnitigs(UnitigVector &unitigs,
             char         *output_prefix,
             bool          enableIntersectionBreaking) {

  writeLog("==> BREAKING UNITIGS.\n");

  intersectionList  *ilist = new intersectionList(unitigs);

  //  Stop when we've seen all current unitigs.  Replace tiMax
  //  in the for loop below with unitigs.size() to recursively
  //  split unitigs.

  uint32 tiMax = unitigs.size();

  for (uint32 ti=0; ti<tiMax; ti++) {
    Unitig             *tig = unitigs[ti];

    if (tig == NULL)
      continue;

    vector<breakPoint>   breaks;

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode             *frg   = &tig->ufpath[fi];
      intersectionPoint  *isect = ilist->getIntersection(frg->ident, 0);

      if (isect == NULL)
        continue;

      for (; isect->isectFrg == frg->ident; isect++) {
        assert(tig->id() == Unitig::fragIn(isect->isectFrg));

        //  Grab the invading unitig

        Unitig *inv = unitigs[Unitig::fragIn(isect->invadFrg)];
        assert(inv->id() == Unitig::fragIn(isect->invadFrg));

        //  Grab the best edges off the invading fragment.

        BestEdgeOverlap  *best5 = OG->getBestEdgeOverlap(isect->invadFrg, false);
        BestEdgeOverlap  *best3 = OG->getBestEdgeOverlap(isect->invadFrg, true);

        //  Check if the incoming tig is a spur, and we should just ignore it immediately

        if ((inv->ufpath.size() == 1) &&
            ((best5->fragId() == 0) ||
             (best3->fragId() == 0))) {
          if (logFileFlagSet(LOG_INTERSECTION_BREAKING))
            writeLog("unitig %d frag %d end %c' into unitig %d frag %d end %c' -- IS A SPUR, skip it\n",
                    inv->id(), isect->invadFrg, isect->invad3p ? '3' : '5',
                    tig->id(), isect->isectFrg, isect->isect3p ? '3' : '5');
          continue;
        }

        //  Keep only significant intersections

        if ((inv->getLength()   > MIN_BREAK_LENGTH) &&
            (inv->ufpath.size() > MIN_BREAK_FRAGS)) {
          if (logFileFlagSet(LOG_INTERSECTION_BREAKING))
            writeLog("unitig %d frag %d end %c' into unitig %d frag %d end %c'\n",
                    inv->id(), isect->invadFrg, isect->invad3p ? '3' : '5',
                    tig->id(), isect->isectFrg, isect->isect3p ? '3' : '5');
          breaks.push_back(breakPoint(isect->isectFrg, isect->isect3p, true, false));
        }
      }  //  Over all incoming fragments

      //  If this is the last fragment, terminate the break point list with a 'fakeEnd' (in AS_BAT_Breaking.cc) break point
      //  at the end of the unitig.

      if ((fi+1 == tig->ufpath.size()) &&
          (breaks.size() > 0)) {
        breaks.push_back(breakPoint(frg->ident, (frg->position.bgn < frg->position.end), true, false));
      }
    }  //  Over all fragments in the unitig


    if (breaks.size() == 0)
      continue;

    //  Report where breaks occur.  'breaks' is a list, not a vector.
#if 0
    //  We've lost the fields in breaks[i] -- but the reports above aren't updated yet.
    if (logFileFlagSet(LOG_INTERSECTION_BREAKING) ||
        logFileFlagSet(LOG_MATE_SPLIT_COVERAGE_PLOT))
      for (uint32 i=0; i<breaks.size(); i++)
        writeLog("BREAK unitig %d at position %d,%d from inSize %d inFrags %d.\n",
                tig->id(),
                breaks[i].fragPos.bgn,
                breaks[i].fragPos.end,
                breaks[i].inSize,
                breaks[i].inFrags);
#endif

    //  Actually do the breaking.
    if (enableIntersectionBreaking)
      breakUnitigAt(unitigs, tig, breaks, true);

    breaks.clear();
  }  //  Over all unitigs
}
示例#7
0
//  After splitting and ejecting some contains, check for discontinuous unitigs.
//
void splitDiscontinuousUnitigs(UnitigVector &unitigs, uint32 minOverlap) {

  writeLog("==> SPLIT DISCONTINUOUS\n");

  uint32                numTested  = 0;
  uint32                numSplit   = 0;
  uint32                numCreated = 0;

  uint32                splitFragsLen = 0;
  uint32                splitFragsMax = 0;
  ufNode               *splitFrags    = NULL;

  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig  *tig = unitigs[ti];

    if ((tig == NULL) || (tig->ufpath.size() < 2))
      continue;

    //  Unitig must be sorted.  Someone upstream os screwing this up.
    tig->sort();

    //  We'll want to build an array of new fragments to split out.  This can be up
    //  to the size of the largest unitig.
    splitFragsMax = MAX(splitFragsMax, tig->ufpath.size());

    //  Check that the unitig starts at position zero.  Not critical for the next loop, but
    //  needs to be dome sometime.
    int32   minPos = MIN(tig->ufpath[0].position.bgn, tig->ufpath[0].position.end);

    if (minPos == 0)
      continue;

    writeLog("splitDiscontinuous()-- tig "F_U32" offset messed up; reset by "F_S32".\n", tig->id(), minPos);

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode  *frg = &tig->ufpath[fi];

      frg->position.bgn -= minPos;
      frg->position.end -= minPos;
    }
  }

  splitFrags = new ufNode [splitFragsMax];

  //  Now, finally, we can check for gaps in unitigs.

  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig  *tig = unitigs[ti];

    if ((tig == NULL) || (tig->ufpath.size() < 2))
      continue;

    //  We don't expect many unitigs to be broken, so we'll do a first quick pass to just
    //  test if it is.

    int32  maxEnd   = MAX(tig->ufpath[0].position.bgn, tig->ufpath[0].position.end);
    bool   isBroken = false;

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode  *frg = &tig->ufpath[fi];

      int32    bgn = MIN(frg->position.bgn, frg->position.end);
      int32    end = MAX(frg->position.bgn, frg->position.end);

      if (bgn > maxEnd - minOverlap) {
        isBroken = true;
        break;
      }

      maxEnd = MAX(maxEnd, end);
    }

    numTested++;

    if (isBroken == false)
      continue;

    numSplit++;

    //  Dang, busted unitig.  Fix it up.

    splitFragsLen = 0;
    maxEnd        = 0;

    if (logFileFlagSet(LOG_MATE_SPLIT_DISCONTINUOUS))
      writeLog("splitDiscontinuous()-- discontinuous tig "F_U32" with "F_SIZE_T" fragments broken into:\n",
              tig->id(), tig->ufpath.size());

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode  *frg = &tig->ufpath[fi];

      int32    bgn = MIN(frg->position.bgn, frg->position.end);
      int32    end = MAX(frg->position.bgn, frg->position.end);

      //  Good thick overlap exists to this fragment, save it.
      if (bgn <= maxEnd - minOverlap) {
        assert(splitFragsLen < splitFragsMax);
        splitFrags[splitFragsLen++] = *frg;
        maxEnd = MAX(maxEnd, end);
        continue;
      }

      //  No thick overlap found.  We need to break right here before the current fragment.

      //  If there is exactly one fragment, and it's contained, and it's not mated, move it to the
      //  container.  (This has a small positive benefit over just making every read a singleton).
      //
      if ((splitFragsLen == 1) &&
          (FI->mateIID(splitFrags[0].ident) == 0) &&
          (splitFrags[0].contained != 0)) {
        Unitig  *dangler  = unitigs[tig->fragIn(splitFrags[0].contained)];

        //  If the parent isn't in a unitig, we must have shattered the repeat unitig it was in.
        //  Do the same here.

        if (dangler == NULL) {
          if (logFileFlagSet(LOG_MATE_SPLIT_DISCONTINUOUS))
            writeLog("splitDiscontinuous()--   singleton frag "F_U32" shattered.\n",
                    splitFrags[0].ident);
          Unitig::removeFrag(splitFrags[0].ident);

        } else {
          assert(dangler->id() == tig->fragIn(splitFrags[0].contained));

          if (logFileFlagSet(LOG_MATE_SPLIT_DISCONTINUOUS))
            writeLog("splitDiscontinuous()--   old tig "F_U32" with "F_SIZE_T" fragments (contained frag "F_U32" moved here).\n",
                    dangler->id(), dangler->ufpath.size() + 1, splitFrags[0].ident);

          BestContainment  *bestcont = OG->getBestContainer(splitFrags[0].ident);

          assert(bestcont->isContained == true);

          dangler->addContainedFrag(splitFrags[0].ident, bestcont, false);
          dangler->bubbleSortLastFrag();

          assert(dangler->id() == Unitig::fragIn(splitFrags[0].ident));
        }
      }

      //  Otherwise, make an entirely new unitig for these fragments.
      else {
        numCreated++;
        makeNewUnitig(unitigs, splitFragsLen, splitFrags);
        tig = unitigs[ti];
      }

      //  Done with the split, save the current fragment.  This resets everything.

      splitFragsLen = 0;
      splitFrags[splitFragsLen++] = *frg;

      maxEnd = end;
    }


    //  If we did any splitting, then the length of the frags in splitFrags will be less than the length
    //  of the path in the current unitig.  Make a final new unitig for the remaining fragments.
    //
    if (splitFragsLen != tig->ufpath.size()) {
      numCreated++;
      makeNewUnitig(unitigs, splitFragsLen, splitFrags);

      delete unitigs[ti];
      unitigs[ti] = NULL;
    }
  }

  writeLog("splitDiscontinuous()-- Tested "F_U32" unitigs, split "F_U32" into "F_U32" new unitigs.\n",
          numTested, numSplit, numCreated);

  delete [] splitFrags;
}
示例#8
0
void
MateLocation::buildHappinessGraphs(UnitigVector &unitigs) {

  //  First entry is always zero.  Needed for the getById() accessor.

  assert(_table[0].mleFrgID1 == 0);
  assert(_table[0].mleFrgID2 == 0);

  assert(IS != NULL);

  for (uint32 mleidx=1; mleidx<_table.size(); mleidx++) {
    MateLocationEntry &loc = _table[mleidx];

    //  We MUST have mleFrgID1 defined.  If mleFrgID2 is not defined, then the mate is external.
    assert(loc.mleFrgID1 != 0);

    //  Well, this bit of ugly is here to fill out the location of the mate (when it is in a
    //  different unitig, the location for this fragment is not set)....EXCEPT that the mate might
    //  not even be placed in a unitig yet (if it is a contain, and we're called before contains are
    //  placed).
    //
    //  This is currently only used for logging -- AND statistics on mate happiness.  Later we
    //  should use it to determine if the mate is buried in the other unitig, which would make the
    //  fragment in this unitig bad.
    //
    if (loc.mleFrgID2 == 0) {
      assert(loc.mleUtgID2 == 0);
      loc.mleFrgID2 = FI->mateIID(loc.mleFrgID1);
      loc.mleUtgID2 = _tig->fragIn(loc.mleFrgID2);

      if (loc.mleUtgID2 != 0) {
        Unitig  *mt = unitigs[loc.mleUtgID2];
        uint32   fi = _tig->pathPosition(loc.mleFrgID2);
        loc.mlePos2 = mt->ufpath[fi].position;
      }
    }

    uint32 lib =  FI->libraryIID(loc.mleFrgID1);

    if (lib == 0)
      //  Shouldn't occur, but just in case, ignore fragments in the legacy library.
      continue;

    if (IS->valid(lib) == false)
      // Don't check libs that we didn't generate good stats for
      continue;

    int32 badMaxInter = static_cast<int32>(IS->mean(lib) + BADMATE_INTER_STDDEV * IS->stddev(lib));
    int32 badMinInter = static_cast<int32>(IS->mean(lib) - BADMATE_INTER_STDDEV * IS->stddev(lib));

    int32 badMaxIntra = static_cast<int32>(IS->mean(lib) + BADMATE_INTRA_STDDEV * IS->stddev(lib));
    int32 badMinIntra = static_cast<int32>(IS->mean(lib) - BADMATE_INTRA_STDDEV * IS->stddev(lib));

    //  To keep the results the same as the previous version (1.89)
    badMaxIntra = badMaxInter;
    badMinIntra = badMinInter;

    int32 dist = 0;
    int32 bgn  = 0;
    int32 end  = 0;

    //  Bgn and End MUST be signed.

    int32 matBgn = loc.mlePos2.bgn;
    int32 matEnd = loc.mlePos2.end;
    int32 matLen = (matBgn < matEnd) ? (matEnd - matBgn) : (matBgn - matEnd);

    int32 frgBgn = loc.mlePos1.bgn;
    int32 frgEnd = loc.mlePos1.end;
    int32 frgLen = (frgBgn < frgEnd) ? (frgEnd - frgBgn) : (frgBgn - frgEnd);

    int32 nContained  = 0;

    if (OG->getBestContainer(loc.mleFrgID1)->isContained == true)
      nContained++;

    if (OG->getBestContainer(loc.mleFrgID2)->isContained == true)
      nContained++;

    if ((matLen >= MIN(badMaxInter, badMaxIntra)) ||
        (frgLen >= MIN(badMaxInter, badMaxIntra)))
      //  Yikes, fragment longer than insert size!
      continue;


    //  Until reset, assume this is a bad mate pair.
    loc.isGrumpy = true;

    int32 ULEN1 = 0;  //unitigs[loc.mleUtgID1]->getLength()
    int32 ULEN2 = 0;  //unitigs[loc.mleUtgID2]->getLength()


    //  If the mate is in another unitig, mark bad only if there is enough space to fit the mate in
    //  this unitig.
    //
    //  TODO: OR if there isn't enough space on the end of the OTHER unitig
    //
    if (loc.mleUtgID1 != loc.mleUtgID2) {
      if ((isReverse(loc.mlePos1) == true)  && (badMaxInter < frgBgn)) {
        incrRange(badExternalRev, -1, frgBgn - badMaxInter, frgEnd);
        incrRange(badExternalRev, -1, frgBgn, frgEnd);
        nbadExternalRev[nContained]++;
        if (logFileFlagSet(LOG_HAPPINESS))
          writeLog("buildHappinessGraph()--  unitig %d (len %d) frag %d pos %d,%d (len %d) and unitig %d (len %d) frag %d pos %d,%d (len %d) -- bad external reverse\n",
                  loc.mleUtgID1, ULEN1, loc.mleFrgID1, frgBgn, frgEnd, frgLen,
                  loc.mleUtgID2, ULEN2, loc.mleFrgID2, matBgn, matEnd, matLen);
        goto markBad;
      }

      if ((isReverse(loc.mlePos1) == false) && (badMaxInter < _tigLen - frgBgn)) {
        incrRange(badExternalFwd, -1, frgEnd, frgBgn + badMaxInter);
        incrRange(badExternalFwd, -1, frgEnd, frgBgn);
        nbadExternalFwd[nContained]++;
        if (logFileFlagSet(LOG_HAPPINESS))
          writeLog("buildHappinessGraph()--  unitig %d (len %d) frag %d pos %d,%d (len %d) and unitig %d (len %d) frag %d pos %d,%d (len %d) -- bad external forward\n",
                  loc.mleUtgID1, ULEN1, loc.mleFrgID1, frgBgn, frgEnd, frgLen,
                  loc.mleUtgID2, ULEN2, loc.mleFrgID2, matBgn, matEnd, matLen);
        goto markBad;
      }

      //  Not enough space.  Not a grumpy mate pair.
      loc.isGrumpy = false;

      if (isReverse(loc.mlePos1) == true)
        ngoodExternalRev[nContained]++;
      else
        ngoodExternalFwd[nContained]++;

      if (logFileFlagSet(LOG_HAPPINESS))
        writeLog("buildHappinessGraph()--  unitig %d (len %d) frag %d pos %d,%d (len %d) and unitig %d (len %d) frag %d pos %d,%d (len %d) -- not bad, not enough space\n",
                loc.mleUtgID1, ULEN1, loc.mleFrgID1, frgBgn, frgEnd, frgLen,
                loc.mleUtgID2, ULEN2, loc.mleFrgID2, matBgn, matEnd, matLen);
      continue;
    }


    //  Both mates are in this unitig.


    //  Same orientation?
    if ((isReverse(loc.mlePos1) == false) &&
        (isReverse(loc.mlePos2) == false)) {
      incrRange(badNormal, -1, MIN(frgBgn, matBgn), MAX(frgEnd, matEnd));
      nbadNormal[nContained]++;
      if (logFileFlagSet(LOG_HAPPINESS))
        writeLog("buildHappinessGraph()--  unitig %d (len %d) frag %d pos %d,%d (len %d) and unitig %d (len %d) frag %d pos %d,%d (len %d) -- bad normal\n",
                loc.mleUtgID1, ULEN1, loc.mleFrgID1, frgBgn, frgEnd, frgLen,
                loc.mleUtgID2, ULEN2, loc.mleFrgID2, matBgn, matEnd, matLen);
      goto markBad;
    }

    if ((isReverse(loc.mlePos1) == true) &&
        (isReverse(loc.mlePos2) == true)) {
      incrRange(badAnti, -1, MIN(frgEnd, matEnd), MAX(frgBgn, matBgn));
      nbadAnti[nContained]++;
      if (logFileFlagSet(LOG_HAPPINESS))
        writeLog("buildHappinessGraph()--  unitig %d (len %d) frag %d pos %d,%d (len %d) and unitig %d (len %d) frag %d pos %d,%d (len %d) -- bad anti\n",
                loc.mleUtgID1, ULEN1, loc.mleFrgID1, frgBgn, frgEnd, frgLen,
                loc.mleUtgID2, ULEN2, loc.mleFrgID2, matBgn, matEnd, matLen);
      goto markBad;
    }


    //  Check a special case for a circular unitig, outtie mates, but close enough to the end to
    //  plausibly be linking the ends together.
    //
    //   <---              --->
    //  ========unitig==========
    //
    if ((isReverse(loc.mlePos1) == true) &&
        (badMinIntra               <= frgBgn + _tigLen - matBgn) &&
        (frgBgn + _tigLen - matBgn <= badMaxIntra)) {
      loc.isGrumpy = false;  //  IT'S GOOD, kind of.
      ngood[nContained]++;
      if (logFileFlagSet(LOG_HAPPINESS))
        writeLog("buildHappinessGraph()--  unitig %d (len %d) frag %d pos %d,%d (len %d) and unitig %d (len %d) frag %d pos %d,%d (len %d) -- good because circular\n",
                loc.mleUtgID1, ULEN1, loc.mleFrgID1, frgBgn, frgEnd, frgLen,
                loc.mleUtgID2, ULEN2, loc.mleFrgID2, matBgn, matEnd, matLen);
      continue;
    }


    //  Outties?  True if pos1.end < pos2.bgn.  (For the second case, swap pos1 and pos2)
    //
    //  (pos1.end) <------   (pos1.bgn)
    //  (pos2.bgn)  -------> (pos2.end)
    //
    if ((isReverse(loc.mlePos1) == true)  && (loc.mlePos1.end < loc.mlePos2.bgn)) {
      incrRange(badOuttie, -1, MIN(frgBgn, frgEnd), MAX(matBgn, matEnd));
      nbadOuttie[nContained]++;
      if (logFileFlagSet(LOG_HAPPINESS))
        writeLog("buildHappinessGraph()--  unitig %d (len %d) frag %d pos %d,%d (len %d) and unitig %d (len %d) frag %d pos %d,%d (len %d) -- bad outtie (case 1)\n",
                loc.mleUtgID1, ULEN1, loc.mleFrgID1, frgBgn, frgEnd, frgLen,
                loc.mleUtgID2, ULEN2, loc.mleFrgID2, matBgn, matEnd, matLen);
      goto markBad;
    }
    if ((isReverse(loc.mlePos1) == false) && (loc.mlePos2.end < loc.mlePos1.bgn)) {
      incrRange(badOuttie, -1, MIN(frgBgn, frgEnd), MAX(matBgn, matEnd));
      nbadOuttie[nContained]++;
      if (logFileFlagSet(LOG_HAPPINESS))
        writeLog("buildHappinessGraph()--  unitig %d (len %d) frag %d pos %d,%d (len %d) and unitig %d (len %d) frag %d pos %d,%d (len %d) -- bad outtie (case 2)\n",
                loc.mleUtgID1, ULEN1, loc.mleFrgID1, frgBgn, frgEnd, frgLen,
                loc.mleUtgID2, ULEN2, loc.mleFrgID2, matBgn, matEnd, matLen);
      goto markBad;
    }

    //  So, now not NORMAL or ANTI or OUTTIE.  We must be left with innies.

    if (isReverse(loc.mlePos1) == false)
      //  First fragment is on the left, second is on the right.
      dist = loc.mlePos2.bgn - loc.mlePos1.bgn;
    else
      //  First fragment is on the right, second is on the left.
      dist = loc.mlePos1.bgn - loc.mlePos2.bgn;

    assert(dist >= 0);

    if (dist < badMinIntra) {
      incrRange(badCompressed, -1, MIN(frgBgn, matBgn), MAX(frgBgn, matBgn));
      nbadCompressed[nContained]++;
      if (logFileFlagSet(LOG_HAPPINESS))
        writeLog("buildHappinessGraph()--  unitig %d (len %d) frag %d pos %d,%d (len %d) and unitig %d (len %d) frag %d pos %d,%d (len %d) -- bad compressed\n",
                loc.mleUtgID1, ULEN1, loc.mleFrgID1, frgBgn, frgEnd, frgLen,
                loc.mleUtgID2, ULEN2, loc.mleFrgID2, matBgn, matEnd, matLen);
      goto markBad;
    }

    if (badMaxIntra < dist) {
      incrRange(badStretched, -1, MIN(frgBgn, matBgn), MAX(frgBgn, matBgn));
      nbadStretched[nContained]++;
      if (logFileFlagSet(LOG_HAPPINESS))
        writeLog("buildHappinessGraph()--  unitig %d (len %d) frag %d pos %d,%d (len %d) and unitig %d (len %d) frag %d pos %d,%d (len %d) -- bad stretched\n",
                loc.mleUtgID1, ULEN1, loc.mleFrgID1, frgBgn, frgEnd, frgLen,
                loc.mleUtgID2, ULEN2, loc.mleFrgID2, matBgn, matEnd, matLen);
      goto markBad;
    }

    assert(badMinIntra <= dist);
    assert(dist        <= badMaxIntra);

    incrRange(good, 1, MIN(frgBgn, matBgn), MAX(frgBgn, matBgn));
    loc.isGrumpy = false;  //  IT'S GOOD!
    ngood[nContained]++;
    if (logFileFlagSet(LOG_HAPPINESS))
      writeLog("buildHappinessGraph()--  unitig %d (len %d) frag %d pos %d,%d (len %d) and unitig %d (len %d) frag %d pos %d,%d (len %d) -- GOOD!\n",
              loc.mleUtgID1, ULEN1, loc.mleFrgID1, frgBgn, frgEnd, frgLen,
              loc.mleUtgID2, ULEN2, loc.mleFrgID2, matBgn, matEnd, matLen);
    continue;

  markBad:

    //  Mark bad from the 3' end of the fragment till the upper limit where the mate should go.

    if (loc.mleUtgID1 == _tig->id()) {
      assert(loc.mleFrgID1 != 0);
      if (isReverse(loc.mlePos1) == false) {
        //  Mark bad for forward fagment 1
        assert(frgBgn < frgEnd);
        bgn = frgEnd;
        end = frgBgn + badMaxIntra;
        incrRange(badFwd, -1, bgn, end);
      } else {
        //  Mark bad for reverse fragment 1
        assert(frgEnd < frgBgn);
        bgn = frgBgn - badMaxIntra;
        end = frgEnd;
        incrRange(badRev, -1, bgn, end);
      }
    }

    if (loc.mleUtgID2 == _tig->id()) {
      assert(loc.mleFrgID2 != 0);
      if (isReverse(loc.mlePos2) == false) {
        //  Mark bad for forward fragment 2
        assert(matBgn < matEnd);
        bgn = matEnd;
        end = matBgn + badMaxIntra;
        incrRange(badFwd, -1, bgn, end);
      } else {
        //  Mark bad for reverse fragment 2
        assert(matEnd < matBgn);
        bgn = matBgn - badMaxIntra;
        end = matEnd;
        incrRange(badRev, -1, bgn, end);
      }
    }
  }  //  Over all MateLocationEntries in the table
}  //  buildHappinessGraph()
示例#9
0
uint32
ChunkGraph::countFullWidth(FragmentEnd firstEnd) {
  uint64   firstIdx = getIndex(firstEnd);

  assert(firstIdx < _maxFragment * 2 + 2);

  if (_pathLen[firstIdx] > 0)
    return _pathLen[firstIdx];

  uint32                length = 0;
  std::set<FragmentEnd> seen;
  FragmentEnd           lastEnd = firstEnd;
  uint64                lastIdx = firstIdx;

  //  Until we run off the chain, or we hit a fragment with a known length, compute the length FROM
  //  THE START.
  //
  while ((lastIdx != 0) &&
         (_pathLen[lastIdx] == 0)) {

    seen.insert(lastEnd);

    _pathLen[lastIdx] = ++length;

    //  Follow the path of lastEnd

    lastEnd = OG->followOverlap(lastEnd);
    lastIdx = getIndex(lastEnd);
  }

  //  Check why we stopped.  Three cases:
  //
  //  1)  We ran out of best edges to follow -- lastEnd.fragId() == 0
  //  2)  We encountered a fragment with known length -- _pathLen[lastEnd.index()] > 0
  //  3)  We encountered a self-loop (same condition as case 2)
  //
  //  To distinguish case 2 and 3, we keep a set<> of the fragments we've seen in this construction.
  //  If 'lastEnd' is in that set, then we're case 3.  If so, adjust every node in the cycle to have
  //  the same length, the length of the cycle itself.
  //
  //  'lastEnd' and 'index' are the first fragment in the cycle; we've seen this one before.
  //
  if (lastEnd.fragId() == 0) {
    //  Case 1.  Do nothing.
    ;

  } else if (seen.find(lastEnd) != seen.end()) {
    //  Case 3, a cycle.
    uint32      cycleLen = length - _pathLen[lastIdx] + 1;
    FragmentEnd currEnd  = lastEnd;
    uint64      currIdx  = lastIdx;

    do {
      _pathLen[currIdx] = cycleLen;
      currEnd = OG->followOverlap(currEnd);
      currIdx = getIndex(currEnd);
    } while (lastEnd != currEnd);

  } else {
    //  Case 2, an existing path.
    length += _pathLen[lastIdx];
  }

  //  Our return value is now whatever count we're at.
  uint32 lengthMax = length;

  //  Traverse again, converting "path length from the start" into "path length from the end".  Any
  //  cycle has had its length set correctly already, and we stop at either the start of the cycle,
  //  or at the start of any existing path.
  //
  FragmentEnd currEnd = firstEnd;
  uint64      currIdx = firstIdx;

  while (currEnd != lastEnd) {
    _pathLen[currIdx] = length--;
    currEnd = OG->followOverlap(currEnd);
    currIdx = getIndex(currEnd);
  }



  if (logFileFlagSet(LOG_CHUNK_GRAPH)) {
    seen.clear();

    currEnd = firstEnd;
    currIdx = firstIdx;

    writeLog("path from %d,%d length %d:",
            firstEnd.fragId(),
            (firstEnd.frag3p()) ? 3 : 5,
            _pathLen[firstIdx]);

    while ((currEnd.fragId() != 0) &&
           (seen.find(currEnd) == seen.end())) {
      seen.insert(currEnd);

      if (currEnd == lastEnd)
        writeLog(" LAST");

      writeLog(" %d,%d(%d)",
              currEnd.fragId(),
              (currEnd.frag3p()) ? 3 : 5,
              _pathLen[currIdx]);

      currEnd = OG->followOverlap(currEnd);
      currIdx = getIndex(currEnd);
    }

    if (seen.find(currEnd) != seen.end())
      writeLog(" CYCLE %d,%d(%d)",
              currEnd.fragId(),
              (currEnd.frag3p()) ? 3 : 5,
              _pathLen[currIdx]);

    writeLog("\n");
  }

  if (lengthMax != _pathLen[firstIdx])
    writeLog("ERROR: lengthMax %d _pathLen[] %d\n",
            lengthMax, _pathLen[firstIdx]);
  assert(lengthMax == _pathLen[firstIdx]);

  return(_pathLen[firstIdx]);
}
示例#10
0
void
placeUnplacedUsingAllOverlaps(UnitigVector &unitigs,
                              const char   *prefix) {
  uint32  fiLimit    = FI->numFragments();
  uint32  numThreads = omp_get_max_threads();
  uint32  blockSize  = (fiLimit < 100 * numThreads) ? numThreads : fiLimit / 99;

  uint32       *placedTig = new uint32      [FI->numFragments() + 1];
  SeqInterval  *placedPos = new SeqInterval [FI->numFragments() + 1];

  memset(placedTig, 0, sizeof(uint32)      * (FI->numFragments() + 1));
  memset(placedPos, 0, sizeof(SeqInterval) * (FI->numFragments() + 1));

  //  Just some logging.  Count the number of reads we try to place.

  uint32   nToPlaceContained = 0;
  uint32   nToPlace          = 0;
  uint32   nPlacedContained  = 0;
  uint32   nPlaced           = 0;
  uint32   nFailedContained  = 0;
  uint32   nFailed           = 0;

  for (uint32 fid=1; fid<FI->numFragments()+1; fid++)
    if (Unitig::fragIn(fid) == 0)
      if (OG->isContained(fid))
        nToPlaceContained++;
      else
        nToPlace++;

  writeLog("placeContains()-- placing %u contained and %u unplaced reads, with %d threads.\n",
           nToPlaceContained, nToPlace, numThreads);

  //  Do the placing!

#pragma omp parallel for schedule(dynamic, blockSize)
  for (uint32 fid=1; fid<FI->numFragments()+1; fid++) {
    bool  enableLog = true;

    if (Unitig::fragIn(fid) > 0)
      continue;

    //  Place the read.

    vector<overlapPlacement>   placements;

    placeFragUsingOverlaps(unitigs, AS_MAX_ERATE, NULL, fid, placements);

    //  Search the placements for the highest expected identity placement using all overlaps in the unitig.

    uint32   b = UINT32_MAX;

    for (uint32 i=0; i<placements.size(); i++) {
      Unitig *tig = unitigs[placements[i].tigID];

      if (placements[i].fCoverage < 0.99)                   //  Ignore partially placed reads.
        continue;

      if (tig->ufpath.size() == 1)  //  Ignore placements in singletons.
        continue;

      uint32  bgn   = (placements[i].position.bgn < placements[i].position.end) ? placements[i].position.bgn : placements[i].position.end;
      uint32  end   = (placements[i].position.bgn < placements[i].position.end) ? placements[i].position.end : placements[i].position.bgn;

      double  erate = placements[i].errors / placements[i].aligned;

      if (tig->overlapConsistentWithTig(5.0, bgn, end, erate) < 0.5) {
        if ((enableLog == true) && (logFileFlagSet(LOG_PLACE_UNPLACED)))
          writeLog("frag %8u tested tig %6u (%6u reads) at %8u-%8u (cov %7.5f erate %6.4f) - HIGH ERROR\n",
                   fid, placements[i].tigID, tig->ufpath.size(), placements[i].position.bgn, placements[i].position.end, placements[i].fCoverage, erate);
        continue;
      }

      if ((enableLog == true) && (logFileFlagSet(LOG_PLACE_UNPLACED)))
        writeLog("frag %8u tested tig %6u (%6u reads) at %8u-%8u (cov %7.5f erate %6.4f)\n",
                 fid, placements[i].tigID, tig->ufpath.size(), placements[i].position.bgn, placements[i].position.end, placements[i].fCoverage, erate);

      if ((b == UINT32_MAX) ||
          (placements[i].errors / placements[i].aligned < placements[b].errors / placements[b].aligned))
        b = i;
    }

    //  If we didn't find a best, b will be invalid; set positions for adding to a new tig.
    //  If we did, save both the position it was placed at, and the tigID it was placed in.

    if (b == UINT32_MAX) {
      if ((enableLog == true) && (logFileFlagSet(LOG_PLACE_UNPLACED)))
        writeLog("frag %8u remains unplaced\n", fid);
      placedPos[fid].bgn = 0;
      placedPos[fid].end = FI->fragmentLength(fid);
    }

    else {
      if ((enableLog == true) && (logFileFlagSet(LOG_PLACE_UNPLACED)))
        writeLog("frag %8u placed tig %6u (%6u reads) at %8u-%8u (cov %7.5f erate %6.4f)\n",
                 fid, placements[b].tigID, unitigs[placements[b].tigID]->ufpath.size(),
                 placements[b].position.bgn, placements[b].position.end,
                 placements[b].fCoverage,
                 placements[b].errors / placements[b].aligned);
      placedTig[fid] = placements[b].tigID;
      placedPos[fid] = placements[b].position;
    }
  }

  //  All reads placed, now just dump them in their correct tigs.

  for (uint32 fid=1; fid<FI->numFragments()+1; fid++) {
    Unitig  *tig = NULL;
    ufNode   frg;

    if (Unitig::fragIn(fid) > 0)
      continue;

    //  If not placed, dump it in a new unitig.  Well, not anymore.  These reads were not placed in
    //  any tig initially, were not allowed to seed a tig, and now, could find no place to go.
    //  They're garbage.  Plus, it screws up the logging above because we don't know the new tig ID
    //  until now.

    if (placedTig[fid] == 0) {
      if (OG->isContained(fid))
        nFailedContained++;
      else
        nFailed++;

      //tig = unitigs.newUnitig(false);
    }

    //  Otherwise, it was placed somewhere, grab the tig.

    else {
      if (OG->isContained(fid))
        nPlacedContained++;
      else
        nPlaced++;

      tig = unitigs[placedTig[fid]];
    }

    //  Regardless, add it to the tig.  Logging for this is above.

    if (tig) {
      frg.ident             = fid;
      frg.contained         = 0;
      frg.parent            = 0;
      frg.ahang             = 0;
      frg.bhang             = 0;
      frg.position          = placedPos[fid];

      tig->addFrag(frg, 0, false);
    }
  }

  //  Cleanup.

  delete [] placedPos;
  delete [] placedTig;

  writeLog("placeContains()-- Placed %u contained reads and %u unplaced reads.\n", nPlacedContained, nPlaced);
  writeLog("placeContains()-- Failed to place %u contained reads (too high error suspected) and %u unplaced reads (lack of overlaps suspected).\n", nFailedContained, nFailed);

  //  But wait!  All the tigs need to be sorted.  Well, not really _all_, but the hard ones to sort
  //  are big, and those quite likely had reads added to them, so it's really not worth the effort
  //  of tracking which ones need sorting, since the ones that don't need it are trivial to sort.

  for (uint32 ti=1; ti<unitigs.size(); ti++) {
    Unitig *utg = unitigs[ti];

    if (utg)
      utg->sort();
  }
}
void
reportUnitigs(UnitigVector &unitigs, const char *prefix, const char *name, uint64 genomeSize) {

  //  Generate n50.  Assumes unitigs have been 'classified' already.

  vector<uint32>   unassembledLength;
  vector<uint32>   bubbleLength;
  vector<uint32>   repeatLength;
  vector<uint32>   circularLength;
  vector<uint32>   contigLength;

  for (uint32  ti=0; ti<unitigs.size(); ti++) {
    Unitig  *utg = unitigs[ti];

    if (utg == NULL)
      continue;

    if (utg->_isUnassembled) {
      unassembledLength.push_back(utg->getLength());
    }

    else if (utg->_isBubble) {
      bubbleLength.push_back(utg->getLength());
    }

    else if (utg->_isRepeat) {
      repeatLength.push_back(utg->getLength());
    }

    else if (utg->_isCircular) {
      circularLength.push_back(utg->getLength());
    }

    else {
      contigLength.push_back(utg->getLength());
    }
  }

  char   N[FILENAME_MAX];

  sprintf(N, "%s.sizes", getLogFilePrefix());

  errno = 0;
  FILE *F = fopen(N, "w");
  if (errno == 0) {
    reportN50(F, unassembledLength, "UNASSEMBLED", genomeSize);
    reportN50(F, bubbleLength,      "BUBBLE",      genomeSize);
    reportN50(F, repeatLength,      "REPEAT",      genomeSize);
    reportN50(F, circularLength,    "CIRCULAR",    genomeSize);
    reportN50(F, contigLength,      "CONTIGS",     genomeSize);

    fclose(F);
  }

  if (logFileFlagSet(LOG_INTERMEDIATE_UNITIGS) == 0)
    return;

  //  Dump to an intermediate store.

  char tigStorePath[FILENAME_MAX];
  sprintf(tigStorePath, "%s.tigStore", getLogFilePrefix());

  fprintf(stderr, "Creating intermediate tigStore '%s'\n", tigStorePath);

  uint32  numFragsT  = 0;
  uint32  numFragsP  = 0;
  uint64  utgLen     = 0;

  //  Compute average frags per partition.

  for (uint32  ti=0; ti<unitigs.size(); ti++) {
    Unitig  *utg = unitigs[ti];

    if (utg == NULL)
      continue;

    numFragsT += utg->ufpath.size();

    if (utg->ufpath.size() > 2)
      utgLen    += utg->getLength();
  }

  if      (utgLen < 16 * 1024 * 1024)
    numFragsP = numFragsT / 7;
  else if (utgLen < 64 * 1024 * 1024)
    numFragsP = numFragsT / 63;
  else
    numFragsP = numFragsT / 127;

  //  Dump the unitigs to an intermediate store.

  setParentAndHang(unitigs);

  writeUnitigsToStore(unitigs, tigStorePath, tigStorePath, numFragsP, false);
}
示例#12
0
void
MateLocation::buildHappinessGraphs(Unitig *utg) {

  //  First entry is always zero.  Needed for the getById() accessor.

  assert(_table[0].mleFrgID1 == 0);
  assert(_table[0].mleFrgID2 == 0);

  for (uint32 mleidx=1; mleidx<_table.size(); mleidx++) {
    MateLocationEntry &loc = _table[mleidx];

    //  We MUST have mleFrgID1 defined.  If mleFrgID2 is not defined, then the mate is external.
    assert(loc.mleFrgID1 != 0);

    uint32 lib =  FI->libraryIID(loc.mleFrgID1);

    if (lib == 0)
      //  Shouldn't occur, but just in case, ignore fragments in the legacy library.
      continue;

    if (IS->valid(lib) == false)
      // Don't check libs that we didn't generate good stats for
      continue;

    int32 badMaxInter = static_cast<int32>(IS->mean(lib) + BADMATE_INTER_STDDEV * IS->stddev(lib));
    int32 badMinInter = static_cast<int32>(IS->mean(lib) - BADMATE_INTER_STDDEV * IS->stddev(lib));

    int32 badMaxIntra = static_cast<int32>(IS->mean(lib) + BADMATE_INTRA_STDDEV * IS->stddev(lib));
    int32 badMinIntra = static_cast<int32>(IS->mean(lib) - BADMATE_INTRA_STDDEV * IS->stddev(lib));

    //  To keep the results the same as the previous version (1.89)
    badMaxIntra = badMaxInter;
    badMinIntra = badMinInter;

    int32 dist = 0;
    int32 bgn  = 0;
    int32 end  = 0;

    //  Bgn and End MUST be signed.

    int32 matBgn = loc.mlePos2.bgn;
    int32 matEnd = loc.mlePos2.end;
    int32 matLen = (matBgn < matEnd) ? (matEnd - matBgn) : (matBgn - matEnd);

    int32 frgBgn = loc.mlePos1.bgn;
    int32 frgEnd = loc.mlePos1.end;
    int32 frgLen = (frgBgn < frgEnd) ? (frgEnd - frgBgn) : (frgBgn - frgEnd);

    if ((matLen >= MIN(badMaxInter, badMaxIntra)) ||
        (frgLen >= MIN(badMaxInter, badMaxIntra)))
      //  Yikes, fragment longer than insert size!
      continue;


    //  Until reset, assume this is a bad mate pair.
    loc.isGrumpy = true;


    //  If the mate is in another unitig, mark bad only if there is enough space to fit the mate in
    //  this unitig.
    if (loc.mleUtgID1 != loc.mleUtgID2) {
      if ((isReverse(loc.mlePos1) == true)  && (badMaxInter < frgBgn)) {
        incrRange(badExternalRev, -1, frgBgn - badMaxInter, frgEnd);
        incrRange(badExternalRev, -1, frgBgn, frgEnd);
        if (logFileFlagSet(LOG_HAPPINESS))
          fprintf(logFile, "buildHappinessGraph()--  unitig %d (len %d) frag %d pos %d,%d (len %d) and unitig %d (len %d) frag %d pos %d,%d (len %d) -- bad external reverse\n",
                  loc.mleUtgID1, 0, loc.mleFrgID1, frgBgn, frgEnd, frgLen,
                  loc.mleUtgID2, 0, loc.mleFrgID2, matBgn, matEnd, matLen);
        goto markBad;
      }

      if ((isReverse(loc.mlePos1) == false) && (badMaxInter < _tigLen - frgBgn)) {
        incrRange(badExternalFwd, -1, frgEnd, frgBgn + badMaxInter);
        incrRange(badExternalFwd, -1, frgEnd, frgBgn);
        if (logFileFlagSet(LOG_HAPPINESS))
          fprintf(logFile, "buildHappinessGraph()--  unitig %d (len %d) frag %d pos %d,%d (len %d) and unitig %d (len %d) frag %d pos %d,%d (len %d) -- bad external forward\n",
                  loc.mleUtgID1, 0, loc.mleFrgID1, frgBgn, frgEnd, frgLen,
                  loc.mleUtgID2, 0, loc.mleFrgID2, matBgn, matEnd, matLen);
        goto markBad;
      }

      //  Not enough space.  Not a grumpy mate pair.
      loc.isGrumpy = false;
      if (logFileFlagSet(LOG_HAPPINESS))
        fprintf(logFile, "buildHappinessGraph()--  unitig %d (len %d) frag %d pos %d,%d (len %d) and unitig %d (len %d) frag %d pos %d,%d (len %d) -- not bad, not enough space\n",
                loc.mleUtgID1, 0, loc.mleFrgID1, frgBgn, frgEnd, frgLen,
                loc.mleUtgID2, 0, loc.mleFrgID2, matBgn, matEnd, matLen);
      continue;
    }


    //  Both mates are in this unitig.


    //  Same orientation?
    if ((isReverse(loc.mlePos1) == false) &&
        (isReverse(loc.mlePos2) == false)) {
      incrRange(badNormal, -1, MIN(frgBgn, matBgn), MAX(frgEnd, matEnd));
      if (logFileFlagSet(LOG_HAPPINESS))
        fprintf(logFile, "buildHappinessGraph()--  unitig %d (len %d) frag %d pos %d,%d (len %d) and unitig %d (len %d) frag %d pos %d,%d (len %d) -- bad normal\n",
                loc.mleUtgID1, 0, loc.mleFrgID1, frgBgn, frgEnd, frgLen,
                loc.mleUtgID2, 0, loc.mleFrgID2, matBgn, matEnd, matLen);
      goto markBad;
    }

    if ((isReverse(loc.mlePos1) == true) &&
        (isReverse(loc.mlePos2) == true)) {
      incrRange(badAnti, -1, MIN(frgEnd, matEnd), MAX(frgBgn, matBgn));
      if (logFileFlagSet(LOG_HAPPINESS))
        fprintf(logFile, "buildHappinessGraph()--  unitig %d (len %d) frag %d pos %d,%d (len %d) and unitig %d (len %d) frag %d pos %d,%d (len %d) -- bad anti\n",
                loc.mleUtgID1, 0, loc.mleFrgID1, frgBgn, frgEnd, frgLen,
                loc.mleUtgID2, 0, loc.mleFrgID2, matBgn, matEnd, matLen);
      goto markBad;
    }


    //  Check a special case for a circular unitig, outtie mates, but close enough to the end to
    //  plausibly be linking the ends together.
    //
    //   <---              --->
    //  ========unitig==========
    //
    if ((isReverse(loc.mlePos1) == true) &&
        (badMinIntra               <= frgBgn + _tigLen - matBgn) &&
        (frgBgn + _tigLen - matBgn <= badMaxIntra)) {
      loc.isGrumpy = false;  //  IT'S GOOD, kind of.
      if (logFileFlagSet(LOG_HAPPINESS))
        fprintf(logFile, "buildHappinessGraph()--  unitig %d (len %d) frag %d pos %d,%d (len %d) and unitig %d (len %d) frag %d pos %d,%d (len %d) -- good because circular\n",
                loc.mleUtgID1, 0, loc.mleFrgID1, frgBgn, frgEnd, frgLen,
                loc.mleUtgID2, 0, loc.mleFrgID2, matBgn, matEnd, matLen);
      continue;
    }


    //  Outties?  True if pos1.end < pos2.bgn.  (For the second case, swap pos1 and pos2)
    //
    //  (pos1.end) <------   (pos1.bgn)
    //  (pos2.bgn)  -------> (pos2.end)
    //
    if ((isReverse(loc.mlePos1) == true)  && (loc.mlePos1.end < loc.mlePos2.bgn)) {
      incrRange(badOuttie, -1, MIN(frgBgn, frgEnd), MAX(matBgn, matEnd));
      if (logFileFlagSet(LOG_HAPPINESS))
        fprintf(logFile, "buildHappinessGraph()--  unitig %d (len %d) frag %d pos %d,%d (len %d) and unitig %d (len %d) frag %d pos %d,%d (len %d) -- bad outtie (case 1)\n",
                loc.mleUtgID1, 0, loc.mleFrgID1, frgBgn, frgEnd, frgLen,
                loc.mleUtgID2, 0, loc.mleFrgID2, matBgn, matEnd, matLen);
      goto markBad;
    }
    if ((isReverse(loc.mlePos1) == false) && (loc.mlePos2.end < loc.mlePos1.bgn)) {
      incrRange(badOuttie, -1, MIN(frgBgn, frgEnd), MAX(matBgn, matEnd));
      if (logFileFlagSet(LOG_HAPPINESS))
        fprintf(logFile, "buildHappinessGraph()--  unitig %d (len %d) frag %d pos %d,%d (len %d) and unitig %d (len %d) frag %d pos %d,%d (len %d) -- bad outtie (case 2)\n",
                loc.mleUtgID1, 0, loc.mleFrgID1, frgBgn, frgEnd, frgLen,
                loc.mleUtgID2, 0, loc.mleFrgID2, matBgn, matEnd, matLen);
      goto markBad;
    }

    //  So, now not NORMAL or ANTI or OUTTIE.  We must be left with innies.

    if (isReverse(loc.mlePos1) == false)
      //  First fragment is on the left, second is on the right.
      dist = loc.mlePos2.bgn - loc.mlePos1.bgn;
    else
      //  First fragment is on the right, second is on the left.
      dist = loc.mlePos1.bgn - loc.mlePos2.bgn;

    assert(dist >= 0);

    if (dist < badMinIntra) {
      incrRange(badCompressed, -1, MIN(frgBgn, matBgn), MAX(frgBgn, matBgn));
      if (logFileFlagSet(LOG_HAPPINESS))
        fprintf(logFile, "buildHappinessGraph()--  unitig %d (len %d) frag %d pos %d,%d (len %d) and unitig %d (len %d) frag %d pos %d,%d (len %d) -- bad compressed\n",
                loc.mleUtgID1, 0, loc.mleFrgID1, frgBgn, frgEnd, frgLen,
                loc.mleUtgID2, 0, loc.mleFrgID2, matBgn, matEnd, matLen);
      goto markBad;
    }

    if (badMaxIntra < dist) {
      incrRange(badStretched, -1, MIN(frgBgn, matBgn), MAX(frgBgn, matBgn));
      if (logFileFlagSet(LOG_HAPPINESS))
        fprintf(logFile, "buildHappinessGraph()--  unitig %d (len %d) frag %d pos %d,%d (len %d) and unitig %d (len %d) frag %d pos %d,%d (len %d) -- bad stretched\n",
                loc.mleUtgID1, 0, loc.mleFrgID1, frgBgn, frgEnd, frgLen,
                loc.mleUtgID2, 0, loc.mleFrgID2, matBgn, matEnd, matLen);
      goto markBad;
    }

    assert(badMinIntra <= dist);
    assert(dist        <= badMaxIntra);

    incrRange(goodGraph, 1, MIN(frgBgn, matBgn), MAX(frgBgn, matBgn));
    loc.isGrumpy = false;  //  IT'S GOOD!
    if (logFileFlagSet(LOG_HAPPINESS))
      fprintf(logFile, "buildHappinessGraph()--  unitig %d (len %d) frag %d pos %d,%d (len %d) and unitig %d (len %d) frag %d pos %d,%d (len %d) -- GOOD!\n",
              loc.mleUtgID1, 0, loc.mleFrgID1, frgBgn, frgEnd, frgLen,
              loc.mleUtgID2, 0, loc.mleFrgID2, matBgn, matEnd, matLen);
    continue;

  markBad:

    //  Mark bad from the 3' end of the fragment till the upper limit where the mate should go.

    if (loc.mleUtgID1 == utg->id()) {
      assert(loc.mleFrgID1 != 0);
      if (isReverse(loc.mlePos1) == false) {
        //  Mark bad for forward fagment 1
        assert(frgBgn < frgEnd);
        bgn = frgEnd;
        end = frgBgn + badMaxIntra;
        incrRange(badFwdGraph, -1, bgn, end);
      } else {
        //  Mark bad for reverse fragment 1
        assert(frgEnd < frgBgn);
        bgn = frgBgn - badMaxIntra;
        end = frgEnd;
        incrRange(badRevGraph, -1, bgn, end);
      }
    }

    if (loc.mleUtgID2 == utg->id()) {
      assert(loc.mleFrgID2 != 0);
      if (isReverse(loc.mlePos2) == false) {
        //  Mark bad for forward fragment 2
        assert(matBgn < matEnd);
        bgn = matEnd;
        end = matBgn + badMaxIntra;
        incrRange(badFwdGraph, -1, bgn, end);
      } else {
        //  Mark bad for reverse fragment 2
        assert(matEnd < matBgn);
        bgn = matBgn - badMaxIntra;
        end = matEnd;
        incrRange(badRevGraph, -1, bgn, end);
      }
    }
  }  //  Over all MateLocationEntries in the table
}  //  buildHappinessGraph()
示例#13
0
//  Make sure that contained fragments are in the same unitig
//  as their container.  Due to sorting, contained fragments
//  can come much later in the unitig:
//
//  ------------1
//    -------------2
//       --------------3
//         ----4 (contained in 1, too much error keeps it out of 2 and 3)
//
//  So, our first pass is to move contained fragments around.
//
void UnitigGraph::moveContains(void) {

  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig  *thisUnitig = unitigs[ti];

    if ((thisUnitig == NULL) ||
        (thisUnitig->ufpath.size() < 2))
      continue;

    MateLocation positions(thisUnitig);

    ufNode               *frags         = new ufNode [thisUnitig->ufpath.size()];
    uint32                fragsLen      = 0;

    if (logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS))
      fprintf(logFile, "moveContain unitig %d\n", thisUnitig->id());

    for (uint32 fi=0; fi<thisUnitig->ufpath.size(); fi++) {
      ufNode  *frg = &thisUnitig->ufpath[fi];

      BestContainment   *bestcont   = OG->getBestContainer(frg->ident);
      MateLocationEntry  mloc       = positions.getById(frg->ident);

      uint32  thisFrgID = frg->ident;
      uint32  contFrgID = (bestcont) ? bestcont->container : 0;
      uint32  mateFrgID = FI->mateIID(frg->ident);

      uint32  thisUtgID = thisUnitig->fragIn(thisFrgID);
      uint32  contUtgID = thisUnitig->fragIn(contFrgID);
      uint32  mateUtgID = thisUnitig->fragIn(mateFrgID);

      //  id1 != 0 -> we found the fragment in the mate happiness table
      //  isBad -> and the mate is unhappy.
      //
      //  What's id1 vs id2 in MateLocationEntry?  Dunno.  All I
      //  know is that if there is no mate present, one of those
      //  will be 0.  (Similar test used above too.)
      //
      bool    isMated    = (mateFrgID > 0);
      bool    isGrumpy   = ((isMated) && (mloc.mleFrgID1 != 0) && (mloc.mleFrgID2 != 0) && (mloc.isGrumpy == true));

      //
      //  Figure out what to do.
      //

      bool    moveToContainer = false;
      bool    moveToSingleton = false;

      if        ((frg->contained == 0) && (bestcont == NULL)) {
        //  CASE 1:  Not contained.  Leave the fragment here.
        //fprintf(logFile, "case1 frag %d fragsLen %d\n", thisFrgID, fragsLen);

      } else if (isMated == false) {
        //  CASE 2: Contained but not mated.  Move to be with the
        //  container (if the container isn't here).
        //fprintf(logFile, "case2 frag %d contID %d fragsLen %d\n", thisFrgID, contUtgID, fragsLen);

        if (thisUtgID != contUtgID)
          moveToContainer = true;

      } else if ((isGrumpy == true) && (thisUtgID == mateUtgID)) {
        //  CASE 3: Not happy, and the frag and mate are together.
        //  Kick out to a singleton.

        //fprintf(logFile, "case3 frag %d utg %d mate %d utg %d cont %d utg %d fragsLen %d\n",
        //        thisFrgID, thisUtgID, mateFrgID, mateUtgID, contFrgID, contUtgID, fragsLen);

        if (thisUtgID == mateUtgID)
          moveToSingleton = true;

      } else {

        //  This makes for some ugly code (we break the nice if else
        //  if else structure we had going on) but the next two cases
        //  need to know if there is an overlap to the rest of the
        //  unitig.

        bool  hasOverlap   = (thisUtgID == contUtgID);
        bool  allContained = false;


        if (hasOverlap == false) {
          if (fragsLen == 0) {
            //  The first fragment.  Check fragments after to see if
            //  there is an overlap (note only frags with an overlap
            //  in the layout are tested).  In rare cases, we ejected
            //  the container, and left a containee with no overlap to
            //  fragments remaining.
            //
            //  Note that this checks if there is an overlap to the
            //  very first non-contained (aka dovetail) fragment ONLY.
            //  If there isn't an overlap to the first non-contained
            //  fragment, then that fragment will likely NOT align
            //  correctly.

            uint32 ft = fi + 1;

#warning 2x BUGS IN COMPARISON HERE

            //  Skip all the contains.
            while ((ft < thisUnitig->ufpath.size()) &&
                   (OG->isContained(thisUnitig->ufpath[ft].ident) == true) &&
                   (MAX(frg->position.bgn, frg->position.end) < MIN(thisUnitig->ufpath[ft].position.bgn, thisUnitig->ufpath[ft].position.end)))
              ft++;

            //  If the frag is not contained (we could be the
            //  container), and overlaps in the layout, see if there
            //  is a real overlap.
            if ((ft < thisUnitig->ufpath.size()) &&
                (OG->isContained(thisUnitig->ufpath[ft].ident) == false) &&
                (MAX(frg->position.bgn, frg->position.end) < MIN(thisUnitig->ufpath[ft].position.bgn, thisUnitig->ufpath[ft].position.end)))
              hasOverlap = OG->containHaveEdgeTo(thisFrgID, thisUnitig->ufpath[ft].ident);
          } else {
            //  Not the first fragment, search for an overlap to an
            //  already placed frag.

            uint32  ft = fi;

            do {
              ft--;

              //  OK to overlap to a contained frag; he could be our
              //  container.

              hasOverlap = OG->containHaveEdgeTo(thisFrgID, thisUnitig->ufpath[ft].ident);

              //  Stop if we found an overlap, or we just checked the
              //  first frag in the unitig, or we no longer overlap in
              //  the layout.
            } while ((hasOverlap == false) &&
                     (ft > 0) &&
                     (MIN(frg->position.bgn, frg->position.end) < MAX(thisUnitig->ufpath[ft].position.bgn, thisUnitig->ufpath[ft].position.end)));
          }
        }  //  end of hasOverlap


        //  An unbelievabe special case.  When the unitig is just a
        //  single container fragment (and any contained frags under
        //  it) rule 4 breaks.  The first fragment has no overlap (all
        //  later reads are contained) and so we want to eject it to a
        //  new unitig.  Since there are multiple fragments in this
        //  unitig, the ejection occurs.  Later, all the contains get
        //  moved to the new unitig.  And we repeat.  To prevent, we
        //  abort the ejection if the unitig is all contained in one
        //  fragment.
        //
        if (fragsLen == 0) {
          allContained = true;

          for (uint32 ft = fi + 1; ((allContained == true) && (ft < thisUnitig->ufpath.size())); ft++)
            allContained = OG->isContained(thisUnitig->ufpath[ft].ident);
        }



        if (isGrumpy == true) {
          //  CASE 4: Not happy and not with the mate.  This one is a
          //  bit of a decision.
          //
          //  If an overlap exists to the rest of the unitig, we'll
          //  leave it here.  We'll also leave it here if it is the
          //  rest of the unitig is all contained in this fragment.
          //
          //  If no overlap, and the mate and container are in the
          //  same unitig, we'll just eject.  That also implies the
          //  other unitig is somewhat large, at least as big as the
          //  insert size.
          //
          //  Otherwise, we'll move to the container and cross our
          //  fingers we place it correctly.  The alternative is to
          //  eject, and hope that we didn't also eject the mate to a
          //  singleton.

          //fprintf(logFile, "case4 frag %d utg %d mate %d utg %d cont %d utg %d fragsLen %d\n",
          //        thisFrgID, thisUtgID, mateFrgID, mateUtgID, contFrgID, contUtgID, fragsLen);

          if ((hasOverlap == false) && (allContained == false))
            if (mateUtgID == contUtgID)
              moveToSingleton = true;
            else
              moveToContainer = true;

        } else {
          //  CASE 5: Happy!  If with container, or an overlap exists to
          //  some earlier fragment, leave it here.  Otherwise, eject it
          //  to a singleton.  The fragment is ejected instead of moved
          //  to be with its container since we don't know which is
          //  correct - the mate or the overlap.
          //
          //  If not happy, we've already made sure that the mate is not
          //  here (that was case 3).

          //fprintf(logFile, "case5 frag %d utg %d mate %d utg %d cont %d utg %d fragsLen %d\n",
          //        thisFrgID, thisUtgID, mateFrgID, mateUtgID, contFrgID, contUtgID, fragsLen);

          //  If no overlap (so not with container or no overlap to
          //  other frags) eject.
          if ((hasOverlap == false) && (allContained == false))
            moveToSingleton = true;
        }
      }  //  End of cases

      //
      //  Do it.
      //

      if (moveToContainer == true) {
        //  Move the fragment to be with its container.

        Unitig         *thatUnitig = unitigs[contUtgID];
        ufNode          containee  = *frg;

        assert(thatUnitig->id() == contUtgID);

        //  Nuke the fragment in the current list
        frg->ident        = 999999999;
        frg->contained    = 999999999;
        frg->position.bgn = 0;
        frg->position.end = 0;

        assert(thatUnitig->id() == contUtgID);

        if (logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS))
          fprintf(logFile, "Moving contained fragment %d from unitig %d to be with its container %d in unitig %d\n",
                  thisFrgID, thisUtgID, contFrgID, contUtgID);

        assert(bestcont->container == contFrgID);

        thatUnitig->addContainedFrag(thisFrgID, bestcont, logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS));
        assert(thatUnitig->id() == Unitig::fragIn(thisFrgID));

      } else if ((moveToSingleton == true) && (thisUnitig->getNumFrags() != 1)) {
        //  Eject the fragment to a singleton (unless we ARE the singleton)
        Unitig        *singUnitig  = new Unitig(logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS));
        ufNode         containee  = *frg;

        //  Nuke the fragment in the current list
        frg->ident        = 999999999;
        frg->contained    = 999999999;
        frg->position.bgn = 0;
        frg->position.end = 0;

        if (logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS))
          fprintf(logFile, "Ejecting unhappy contained fragment %d from unitig %d into new unitig %d\n",
                  thisFrgID, thisUtgID, singUnitig->id());

        containee.contained = 0;

        singUnitig->addFrag(containee, -MIN(containee.position.bgn, containee.position.end), logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS));

        unitigs.push_back(singUnitig);
        thisUnitig = unitigs[ti];  //  Reset the pointer; unitigs might be reallocated

      } else {
        //  Leave fragment here.  Copy the fragment to the list -- if
        //  we need to rebuild the unitig (because fragments were
        //  removed), the list is used, otherwise, we have already
        //  made the changes needed.
        //
        //  Also, very important, update our containment mark.  If our
        //  container was moved, but we stayed put because of a happy
        //  mate, we're still marked as being contained.  Rather than
        //  put this check in all the places where we stay put in the
        //  above if-else-else-else, it's here.

        if ((frg->contained) && (thisUtgID != contUtgID))
          frg->contained = 0;

        frags[fragsLen] = *frg;
        fragsLen++;
      }

    }  //  over all frags

    //  Now, rebuild this unitig if we made changes.

    if (fragsLen != thisUnitig->ufpath.size()) {
      if (logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS))
        fprintf(logFile, "Rebuild unitig %d after removing contained fragments.\n", thisUnitig->id());

      thisUnitig->ufpath.clear();

      //  Occasionally, we move all fragments out of the original unitig.  Might be worth checking
      //  if that makes sense!!
      //
#warning EMPTIED OUT A UNITIG
      if (fragsLen > 0) {
        //  No need to resort.  Offsets only need adjustment if the first fragment is thrown out.
        //  If not, splitOffset will be zero.
        //
        int splitOffset = -MIN(frags[0].position.bgn, frags[0].position.end);

        //  This is where we clean up from the splitting not dealing with contained fragments -- we
        //  force the first frag to be uncontained.
        //
        frags[0].contained = 0;

        for (uint32 i=0; i<fragsLen; i++)
          thisUnitig->addFrag(frags[i], splitOffset, logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS));
      }
    }

    delete [] frags;
    frags = NULL;

  }  //  Over all unitigs
}
示例#14
0
//  For every unitig, report the best overlaps contained in the
//  unitig, and all overlaps contained in the unitig.
void
reportOverlapsUsed(UnitigVector &unitigs, const char *prefix, const char *name) {

  if (logFileFlagSet(LOG_OVERLAPS_USED) == 0)
    return;

  char ovlPath[FILENAME_MAX];
  sprintf(ovlPath, "%s.%03u.%s.overlaps", prefix, logFileOrder, name);

  FILE *F = fopen(ovlPath, "w");

  if (F == NULL)
    return;

  for (uint32  ti=0; ti<unitigs.size(); ti++) {
    Unitig  *utg = unitigs[ti];

    if (utg == NULL)
      continue;

    for (uint32 fi=0; fi<utg->ufpath.size(); fi++) {
      ufNode  *frg = &utg->ufpath[fi];

      //  Where is our best overlap?  Contained or dovetail?

      BestEdgeOverlap *bestedge5 = OG->getBestEdgeOverlap(frg->ident, false);
      BestEdgeOverlap *bestedge3 = OG->getBestEdgeOverlap(frg->ident, true);

      uint32           bestident5 = 0;
      uint32           bestident3 = 0;

      if (bestedge5)
        bestident5 = bestedge5->fragId();

      if (bestedge3)
        bestident3 = bestedge3->fragId();

      //  Now search ahead, reporting any overlap to any fragment.
      //
      for (uint32 oi=fi+1; oi<utg->ufpath.size(); oi++) {
        ufNode  *ooo = &utg->ufpath[oi];

        int frgbgn = MIN(frg->position.bgn, frg->position.end);
        int frgend = MAX(frg->position.bgn, frg->position.end);

        int ooobgn = MIN(ooo->position.bgn, ooo->position.end);
        int oooend = MAX(ooo->position.bgn, ooo->position.end);

        if ((frgbgn <= ooobgn) && (ooobgn + 40 < frgend)) {
          BestContainment *bestcont  = OG->getBestContainer(ooo->ident);

          uint32           bestident  = 0;
          if (bestcont->isContained)
            bestident = bestcont->container;

          bool isBest = ((frg->ident == bestident) ||
                         (ooo->ident == bestident5) ||
                         (ooo->ident == bestident3));

          fprintf(F, "%d\t%d%s\n", frg->ident, ooo->ident, (isBest) ? ((bestident) ? "\tbc" : "\tbe") : "");
        }

        if (frgend < ooobgn)
          break;
      }
    }
  }

  fclose(F);
}