Ejemplo n.º 1
0
void
placeUnplacedUsingAllOverlaps(UnitigVector &unitigs,
                              const char   *prefix) {
  uint32  fiLimit    = FI->numFragments();
  uint32  numThreads = omp_get_max_threads();
  uint32  blockSize  = (fiLimit < 100 * numThreads) ? numThreads : fiLimit / 99;

  uint32       *placedTig = new uint32      [FI->numFragments() + 1];
  SeqInterval  *placedPos = new SeqInterval [FI->numFragments() + 1];

  memset(placedTig, 0, sizeof(uint32)      * (FI->numFragments() + 1));
  memset(placedPos, 0, sizeof(SeqInterval) * (FI->numFragments() + 1));

  //  Just some logging.  Count the number of reads we try to place.

  uint32   nToPlaceContained = 0;
  uint32   nToPlace          = 0;
  uint32   nPlacedContained  = 0;
  uint32   nPlaced           = 0;
  uint32   nFailedContained  = 0;
  uint32   nFailed           = 0;

  for (uint32 fid=1; fid<FI->numFragments()+1; fid++)
    if (Unitig::fragIn(fid) == 0)
      if (OG->isContained(fid))
        nToPlaceContained++;
      else
        nToPlace++;

  writeLog("placeContains()-- placing %u contained and %u unplaced reads, with %d threads.\n",
           nToPlaceContained, nToPlace, numThreads);

  //  Do the placing!

#pragma omp parallel for schedule(dynamic, blockSize)
  for (uint32 fid=1; fid<FI->numFragments()+1; fid++) {
    bool  enableLog = true;

    if (Unitig::fragIn(fid) > 0)
      continue;

    //  Place the read.

    vector<overlapPlacement>   placements;

    placeFragUsingOverlaps(unitigs, AS_MAX_ERATE, NULL, fid, placements);

    //  Search the placements for the highest expected identity placement using all overlaps in the unitig.

    uint32   b = UINT32_MAX;

    for (uint32 i=0; i<placements.size(); i++) {
      Unitig *tig = unitigs[placements[i].tigID];

      if (placements[i].fCoverage < 0.99)                   //  Ignore partially placed reads.
        continue;

      if (tig->ufpath.size() == 1)  //  Ignore placements in singletons.
        continue;

      uint32  bgn   = (placements[i].position.bgn < placements[i].position.end) ? placements[i].position.bgn : placements[i].position.end;
      uint32  end   = (placements[i].position.bgn < placements[i].position.end) ? placements[i].position.end : placements[i].position.bgn;

      double  erate = placements[i].errors / placements[i].aligned;

      if (tig->overlapConsistentWithTig(5.0, bgn, end, erate) < 0.5) {
        if ((enableLog == true) && (logFileFlagSet(LOG_PLACE_UNPLACED)))
          writeLog("frag %8u tested tig %6u (%6u reads) at %8u-%8u (cov %7.5f erate %6.4f) - HIGH ERROR\n",
                   fid, placements[i].tigID, tig->ufpath.size(), placements[i].position.bgn, placements[i].position.end, placements[i].fCoverage, erate);
        continue;
      }

      if ((enableLog == true) && (logFileFlagSet(LOG_PLACE_UNPLACED)))
        writeLog("frag %8u tested tig %6u (%6u reads) at %8u-%8u (cov %7.5f erate %6.4f)\n",
                 fid, placements[i].tigID, tig->ufpath.size(), placements[i].position.bgn, placements[i].position.end, placements[i].fCoverage, erate);

      if ((b == UINT32_MAX) ||
          (placements[i].errors / placements[i].aligned < placements[b].errors / placements[b].aligned))
        b = i;
    }

    //  If we didn't find a best, b will be invalid; set positions for adding to a new tig.
    //  If we did, save both the position it was placed at, and the tigID it was placed in.

    if (b == UINT32_MAX) {
      if ((enableLog == true) && (logFileFlagSet(LOG_PLACE_UNPLACED)))
        writeLog("frag %8u remains unplaced\n", fid);
      placedPos[fid].bgn = 0;
      placedPos[fid].end = FI->fragmentLength(fid);
    }

    else {
      if ((enableLog == true) && (logFileFlagSet(LOG_PLACE_UNPLACED)))
        writeLog("frag %8u placed tig %6u (%6u reads) at %8u-%8u (cov %7.5f erate %6.4f)\n",
                 fid, placements[b].tigID, unitigs[placements[b].tigID]->ufpath.size(),
                 placements[b].position.bgn, placements[b].position.end,
                 placements[b].fCoverage,
                 placements[b].errors / placements[b].aligned);
      placedTig[fid] = placements[b].tigID;
      placedPos[fid] = placements[b].position;
    }
  }

  //  All reads placed, now just dump them in their correct tigs.

  for (uint32 fid=1; fid<FI->numFragments()+1; fid++) {
    Unitig  *tig = NULL;
    ufNode   frg;

    if (Unitig::fragIn(fid) > 0)
      continue;

    //  If not placed, dump it in a new unitig.  Well, not anymore.  These reads were not placed in
    //  any tig initially, were not allowed to seed a tig, and now, could find no place to go.
    //  They're garbage.  Plus, it screws up the logging above because we don't know the new tig ID
    //  until now.

    if (placedTig[fid] == 0) {
      if (OG->isContained(fid))
        nFailedContained++;
      else
        nFailed++;

      //tig = unitigs.newUnitig(false);
    }

    //  Otherwise, it was placed somewhere, grab the tig.

    else {
      if (OG->isContained(fid))
        nPlacedContained++;
      else
        nPlaced++;

      tig = unitigs[placedTig[fid]];
    }

    //  Regardless, add it to the tig.  Logging for this is above.

    if (tig) {
      frg.ident             = fid;
      frg.contained         = 0;
      frg.parent            = 0;
      frg.ahang             = 0;
      frg.bhang             = 0;
      frg.position          = placedPos[fid];

      tig->addFrag(frg, 0, false);
    }
  }

  //  Cleanup.

  delete [] placedPos;
  delete [] placedTig;

  writeLog("placeContains()-- Placed %u contained reads and %u unplaced reads.\n", nPlacedContained, nPlaced);
  writeLog("placeContains()-- Failed to place %u contained reads (too high error suspected) and %u unplaced reads (lack of overlaps suspected).\n", nFailedContained, nFailed);

  //  But wait!  All the tigs need to be sorted.  Well, not really _all_, but the hard ones to sort
  //  are big, and those quite likely had reads added to them, so it's really not worth the effort
  //  of tracking which ones need sorting, since the ones that don't need it are trivial to sort.

  for (uint32 ti=1; ti<unitigs.size(); ti++) {
    Unitig *utg = unitigs[ti];

    if (utg)
      utg->sort();
  }
}
Ejemplo n.º 2
0
vector<overlapPlacement>  *
findBubbleReadPlacements(UnitigVector    &unitigs,
                         BubTargetList   &potentialBubbles,
                         double           deviationBubble) {
  uint32  fiLimit      = FI->numFragments();
  uint32  fiNumThreads = omp_get_max_threads();
  uint32  fiBlockSize  = (fiLimit < 1000 * fiNumThreads) ? fiNumThreads : fiLimit / 999;

  vector<overlapPlacement>   *placed = new vector<overlapPlacement> [fiLimit + 1];

#pragma omp parallel for schedule(dynamic, fiBlockSize)
  for (uint32 fi=0; fi<fiLimit; fi++) {
    uint32     rdAtigID = Unitig::fragIn(fi);

    if ((rdAtigID == 0) ||                           //  Read not placed in a tig, ignore it.
        (OG->isContained(fi)) ||                     //  Read is contained, ignore it.
        (potentialBubbles.count(rdAtigID) == 0))     //  Read isn't in a potential bubble, ignore it.
      continue;

    Unitig     *rdAtig   = unitigs[rdAtigID];
    ufNode     *rdA      = &rdAtig->ufpath[ Unitig::pathPosition(fi) ];
    bool        rdAfwd   = (rdA->position.bgn < rdA->position.end);
    int32       rdAlo    = (rdAfwd) ? rdA->position.bgn : rdA->position.end;
    int32       rdAhi    = (rdAfwd) ? rdA->position.end : rdA->position.bgn;

    uint32      ovlLen   = 0;
    BAToverlap *ovl      = OC->getOverlaps(rdA->ident, AS_MAX_ERATE, ovlLen);

    set<uint32> intersections;

    //if ((fi % 100) == 0)
    //  fprintf(stderr, "findBubbleReadPlacements()-- read %8u with %6u overlaps - %6.2f%% finished.\r",
    //          rdA->ident, ovlLen, 100.0 * fi / fiLimit);

    //  Compute all placements for this read.

    vector<overlapPlacement>   placements;

    placeFragUsingOverlaps(unitigs, AS_MAX_ERATE, NULL, rdA->ident, placements);

    //  Weed out placements that aren't for bubbles, or that are for bubbles but are poor quality.  Or are to ourself!

    for (uint32 pi=0; pi<placements.size(); pi++) {
      uint32    rdBtigID = placements[pi].tigID;
      Unitig   *rdBtig   = unitigs[rdBtigID];

      uint32    lo       = (placements[pi].position.bgn < placements[pi].position.end) ? placements[pi].position.bgn : placements[pi].position.end;
      uint32    hi       = (placements[pi].position.bgn < placements[pi].position.end) ? placements[pi].position.end : placements[pi].position.bgn;

      double    erate    = placements[pi].errors / placements[pi].aligned;

      //  Ignore the placement if it is to ourself.

      if (rdAtigID == rdBtigID) {
        //writeLog("tig %6u frag %8u -> tig %6u %6u reads at %8u-%8u (cov %7.5f erate %6.4f) - SAME TIG\n",
        //         rdAtigID, placements[pi].frgID, placements[pi].tigID, rdBtig->ufpath.size(), placements[pi].position.bgn, placements[pi].position.end, placements[pi].fCoverage, erate);
        continue;
      }

      //  Ignore the placement if it is to a non-tig / singleton read, or if it didn't place the
      //  read fully.

      if ((rdBtigID == 0) ||
          (rdBtig   == NULL) ||
          (rdBtig->ufpath.size() == 1) ||
          (placements[pi].fCoverage < 0.99)) {
        if (logFileFlagSet(LOG_BUBBLE_DETAIL))
          writeLog("tig %6u frag %8u -> tig %6u %6u reads at %8u-%8u (cov %7.5f erate %6.4f) - PARTIALLY PLACED\n",
                   rdAtigID, placements[pi].frgID, placements[pi].tigID, rdBtig->ufpath.size(), placements[pi].position.bgn, placements[pi].position.end, placements[pi].fCoverage, erate);
        continue;
      }

      //  Ignore the placement if it isn't to one of our bubble-popping candidate unitigs.

      bool             dontcare = true;
      vector<uint32>  &pbubbles = potentialBubbles[rdAtigID];

      for (uint32 pb=0; pb<pbubbles.size(); pb++) {
        if (pbubbles[pb] == rdBtigID)
          dontcare = false;
      }

      if (dontcare) {
        if (logFileFlagSet(LOG_BUBBLE_DETAIL))
          writeLog("tig %6u frag %8u -> tig %6u %6u reads at %8u-%8u (cov %7.5f erate %6.4f) - NOT CANDIDATE TIG\n",
                   rdAtigID, placements[pi].frgID, placements[pi].tigID, rdBtig->ufpath.size(), placements[pi].position.bgn, placements[pi].position.end, placements[pi].fCoverage, erate);
        continue;
      }

      //  Ignore the placement if it is too diverged from the destination tig.

      if (rdBtig->overlapConsistentWithTig(deviationBubble, lo, hi, erate) < 0.5) {
        if (logFileFlagSet(LOG_BUBBLE_DETAIL))
          writeLog("tig %6u frag %8u -> tig %6u %6u reads at %8u-%8u (cov %7.5f erate %6.4f) - HIGH ERROR\n",
                   rdAtigID, placements[pi].frgID, placements[pi].tigID, rdBtig->ufpath.size(), placements[pi].position.bgn, placements[pi].position.end, placements[pi].fCoverage, erate);
        continue;
      }

      //  Good placement!

      if (logFileFlagSet(LOG_BUBBLE_DETAIL))
        writeLog("tig %6u frag %8u -> tig %6u %6u reads at %8u-%8u (cov %7.5f erate %6.4f)\n",
                 rdAtigID, placements[pi].frgID, placements[pi].tigID, rdBtig->ufpath.size(), placements[pi].position.bgn, placements[pi].position.end, placements[pi].fCoverage, erate);

      placed[fi].push_back(placements[pi]);
    }
  }

  return(placed);
}