C++ (Cpp) Unitig 예제들, Unitig C++ (Cpp) 예제들

예제 #1

0

파일 보기

파일: AS_BAT_UnitigVector.C 프로젝트: Ecogpr/canu

Unitig *
UnitigVector::newUnitig(bool verbose) {
  Unitig *u = new Unitig();

#pragma omp critical
  {
    u->_id = _totalUnitigs++;

    if (verbose)
      writeLog("Creating Unitig %d\n", u->_id);

    if (_blockNext >= _blockSize) {
      assert(_numBlocks < _maxBlocks);

      _blocks[_numBlocks] = new Unitig * [_blockSize];

      memset(_blocks[_numBlocks], 0, sizeof(Unitig **) * _blockSize);

      _numBlocks++;
      _blockNext = 0;
    }

    _blocks[_numBlocks-1][_blockNext++] = u;

    //  The rest are just sanity checks.

    assert((u->id() / _blockSize) == (_numBlocks - 1));
    assert((u->id() % _blockSize) == (_blockNext - 1));

    assert(operator[](u->id()) == u);
  }

  return(u);
};

예제 #2

0

파일 보기

파일: AS_BAT_UnitigVector.C 프로젝트: Ecogpr/canu

void
UnitigVector::computeArrivalRate(const char *prefix, const char *label) {
  uint32  tiLimit = size();
  uint32  numThreads = omp_get_max_threads();
  uint32  blockSize = (tiLimit < 100000 * numThreads) ? numThreads : tiLimit / 99999;

  fprintf(stderr, "Computing arrival rates for %u unitigs using %u threads.\n", tiLimit, numThreads);

  vector<int32>  hist[6];

  //#pragma omp parallel for schedule(dynamic, blockSize)
  for (uint32 ti=0; ti<tiLimit; ti++) {
    Unitig  *tig = operator[](ti);

    if (tig == NULL)
      continue;

    if (tig->ufpath.size() == 1)
      continue;

    tig->computeArrivalRate(prefix, label, hist);
  }

  for (uint32 ii=1; ii<6; ii++) {
    char  N[FILENAME_MAX];

    sprintf(N, "%s.arrivalRate.%u.dat", prefix, ii);
    FILE *F = fopen(N, "w");
    for (uint32 jj=0; jj<hist[ii].size(); jj++)
      fprintf(F, "%d\n", hist[ii][jj]);
    fclose(F);
  }
}

예제 #3

0

파일 보기

파일: AS_BAT_PromoteToSingleton.C 프로젝트: ondovb/canu

void
promoteToSingleton(UnitigVector &unitigs, bool enablePromoteToSingleton) {

  for (uint32 fi=1; fi<=FI->numFragments(); fi++) {
    if (Unitig::fragIn(fi) != 0)
      //  Placed already
      continue;

    if (FI->fragmentLength(fi) == 0)
      //  Deleted.
      continue;

    if (enablePromoteToSingleton == false) {
      writeLog("promoteToSingleton()--  Repeat fragment "F_U32" removed from assembly.\n", fi);
      FI->markAsIgnore(fi);
      continue;
    }

    Unitig *utg = unitigs.newUnitig(false);
    ufNode  frag;

    frag.ident             = fi;
    frag.contained         = 0;
    frag.parent            = 0;
    frag.ahang             = 0;
    frag.bhang             = 0;
    frag.position.bgn      = 0;
    frag.position.end      = FI->fragmentLength(fi);
    frag.containment_depth = 0;

    utg->addFrag(frag, 0, false);
  }
}

예제 #4

0

파일 보기

파일: AS_BAT_PlaceContains.C 프로젝트: Ecogpr/canu

void
breakSingletonTigs(UnitigVector &unitigs) {

  //  For any singleton unitig, eject the read and delete the unitig.  Eventually,
  //  we will stop making singleton unitigs.

  uint32   removed = 0;

  for (uint32 ti=1; ti<unitigs.size(); ti++) {
    Unitig *utg = unitigs[ti];

    if (utg == NULL)
      continue;

    if (utg->ufpath.size() > 1)
      continue;

    unitigs[ti] = NULL;                      //  Remove the unitig from the list
    utg->removeFrag(utg->ufpath[0].ident);   //  Eject the read
    delete utg;                              //  Reclaim space
    removed++;                               //  Count
  }

  writeLog("Removed %u read%s from %u singleton unitig%s.\n",
          removed, (removed != 1) ? "" : "s",
          removed, (removed != 1) ? "" : "s");
}

예제 #5

0

파일 보기

파일: AS_BAT_PromoteToSingleton.C 프로젝트: AndreasHegerGenomics/canu

void
promoteToSingleton(UnitigVector &unitigs) {

  for (uint32 fi=1; fi<=FI->numFragments(); fi++) {
    if (Unitig::fragIn(fi) != 0)
      //  Placed already
      continue;

    if (FI->fragmentLength(fi) == 0)
      //  Deleted.
      continue;

    Unitig *utg = unitigs.newUnitig(false);
    ufNode  frag;

    frag.ident             = fi;
    frag.contained         = 0;
    frag.parent            = 0;
    frag.ahang             = 0;
    frag.bhang             = 0;
    frag.position.bgn      = 0;
    frag.position.end      = FI->fragmentLength(fi);

    utg->addFrag(frag, 0, false);
  }
}

예제 #6

0

파일 보기

파일: AS_BAT_Instrumentation.C 프로젝트: AndreasHegerGenomics/canu

void
checkUnitigMembership(UnitigVector &unitigs) {
  uint32 *inUnitig = new uint32 [FI->numFragments()+1];
  uint32  noUnitig = 0xffffffff;

  //  All reads start of not placed in a unitig.

  for (uint32 i=0; i<FI->numFragments()+1; i++)
    inUnitig[i] = noUnitig;

  //  Over all unitigs, remember where each read is.

  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig  *tig = unitigs[ti];
    int32    len = 0;

    if (tig == NULL)
      continue;

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode  *frg = &tig->ufpath[fi];

      if (frg->ident > FI->numFragments())
        fprintf(stderr, "tig %u ufpath[%d] ident %u more than number of reads %u\n",
                tig->id(), fi, frg->ident, FI->numFragments());

      if (inUnitig[frg->ident] != noUnitig)
        fprintf(stderr, "tig %u ufpath[%d] ident %u placed multiple times\n",
                tig->id(), fi, frg->ident);

      assert(frg->ident <= FI->numFragments());   //  Can't be out of range.
      assert(inUnitig[frg->ident] == noUnitig);   //  Read must be not placed yet.

      inUnitig[frg->ident] = ti;
    }
  }

  //  Find any read not placed in a unitig.

  for (uint32 i=0; i<FI->numFragments()+1; i++) {
    if (FI->fragmentLength(i) == 0)  //  Deleted read.
      continue;

    assert(inUnitig[i] != 0);         //  There shouldn't be a unitig 0.
    assert(inUnitig[i] != noUnitig);  //  The read should be in a unitig.
  }

  delete [] inUnitig;
}

예제 #7

0

파일 보기

파일: AS_BAT_UnitigVector.C 프로젝트: Ecogpr/canu

void
UnitigVector::reportErrorProfiles(const char *prefix, const char *label) {
  uint32  tiLimit = size();
  uint32  numThreads = omp_get_max_threads();
  uint32  blockSize = (tiLimit < 100000 * numThreads) ? numThreads : tiLimit / 99999;

  for (uint32 ti=0; ti<tiLimit; ti++) {
    Unitig  *tig = operator[](ti);

    if (tig == NULL)
      continue;

    if (tig->ufpath.size() == 1)
      continue;

    tig->reportErrorProfile(prefix, label);
  }
}

예제 #8

0

파일 보기

파일: AS_BAT_SplitDiscontinuous.C 프로젝트: xtmgah/canu

static
void
makeNewUnitig(UnitigVector &unitigs,
              uint32        splitFragsLen,
              ufNode       *splitFrags) {
  Unitig *dangler = unitigs.newUnitig(false);

  if (logFileFlagSet(LOG_MATE_SPLIT_DISCONTINUOUS))
    writeLog("splitDiscontinuous()--   new tig "F_U32" with "F_U32" fragments (starting at frag "F_U32").\n",
            dangler->id(), splitFragsLen, splitFrags[0].ident);

  int splitOffset = -MIN(splitFrags[0].position.bgn, splitFrags[0].position.end);

  //  This should already be true, but we force it still
  splitFrags[0].contained = 0;

  for (uint32 i=0; i<splitFragsLen; i++)
    dangler->addFrag(splitFrags[i], splitOffset, false);  //logFileFlagSet(LOG_MATE_SPLIT_DISCONTINUOUS));
}

예제 #9

0

파일 보기

파일: AS_BAT_PlaceContains.C 프로젝트: cdunn2001/DConvert

void
placeContainsUsingBestOverlaps(UnitigVector &unitigs) {
  uint32   fragsPlaced  = 1;
  uint32   fragsPending = 0;

  logFileFlags &= ~LOG_PLACE_FRAG;

  while (fragsPlaced > 0) {
    fragsPlaced  = 0;
    fragsPending = 0;

    writeLog("==> PLACING CONTAINED FRAGMENTS\n");

    for (uint32 fid=1; fid<FI->numFragments()+1; fid++) {
      BestContainment *bestcont = OG->getBestContainer(fid);
      Unitig          *utg;

      if (bestcont->isContained == false)
        //  Not a contained fragment.
        continue;

      if (Unitig::fragIn(fid) != 0)
        //  Containee already placed.
        continue;

      if (Unitig::fragIn(bestcont->container) == 0) {
        //  Container not placed (yet).
        fragsPending++;
        continue;
      }

      utg = unitigs[Unitig::fragIn(bestcont->container)];
      utg->addContainedFrag(fid, bestcont, logFileFlagSet(LOG_INITIAL_CONTAINED_PLACEMENT));

      if (utg->id() != Unitig::fragIn(fid))
        writeLog("placeContainsUsingBestOverlaps()-- FAILED to add frag %d to unitig %d.\n", fid, bestcont->container);
      assert(utg->id() == Unitig::fragIn(fid));


      fragsPlaced++;
    }

    writeLog("==> PLACING CONTAINED FRAGMENTS - placed %d fragments; still need to place %d\n",
            fragsPlaced, fragsPending);

    if ((fragsPlaced == 0) && (fragsPending > 0)) {
      writeLog("Stopping contained fragment placement due to zombies.\n");
      fragsPlaced  = 0;
      fragsPending = 0;
    }
  }

  for (uint32 ti=1; ti<unitigs.size(); ti++) {
    Unitig *utg = unitigs[ti];

    if (utg)
      utg->sort();
  }
}

예제 #10

0

파일 보기

파일: AS_BAT_Instrumentation.C 프로젝트: xtmgah/canu

void
reportUnitigs(UnitigVector &unitigs, const char *prefix, const char *name) {

  if (logFileFlagSet(LOG_INTERMEDIATE_UNITIGS) == 0)
    return;

  uint32  numFragsT  = 0;
  uint32  numFragsP  = 0;
  uint64  utgLen     = 0;

  //  Compute average frags per partition.
  for (uint32  ti=0; ti<unitigs.size(); ti++) {
    Unitig  *utg = unitigs[ti];

    if (utg == NULL)
      continue;

    numFragsT += utg->ufpath.size();

    if (utg->ufpath.size() > 2)
      utgLen    += utg->getLength();
  }

  if      (utgLen < 16 * 1024 * 1024)
    numFragsP = numFragsT / 7;
  else if (utgLen < 64 * 1024 * 1024)
    numFragsP = numFragsT / 63;
  else
    numFragsP = numFragsT / 127;

  char tigStorePath[FILENAME_MAX];
  sprintf(tigStorePath, "%s.%03u.%s.tigStore", prefix, logFileOrder, name);

  //  Failing to do this results in consensus running about 40 times slower.  Three hours instead of
  //  five minutes.
  setParentAndHang(unitigs);

  writeUnitigsToStore(unitigs, tigStorePath, tigStorePath, numFragsP, false);
}

예제 #11

0

파일 보기

파일: AS_BAT_UnitigVector.C 프로젝트: Ecogpr/canu

void
UnitigVector::computeErrorProfiles(const char *prefix, const char *label) {
  uint32  tiLimit = size();
  uint32  numThreads = omp_get_max_threads();
  uint32  blockSize = (tiLimit < 100000 * numThreads) ? numThreads : tiLimit / 99999;

  fprintf(stderr, "Computing error profiles for %u unitigs using %u threads.\n", tiLimit, numThreads);

  //#pragma omp parallel for schedule(dynamic, blockSize)
  for (uint32 ti=0; ti<tiLimit; ti++) {
    Unitig  *tig = operator[](ti);

    if (tig == NULL)
      continue;

    if (tig->ufpath.size() == 1)
      continue;

    tig->computeErrorProfile(prefix, label);
  }

  fprintf(stderr, "Computing error profiles - FINISHED.\n");
}

예제 #12

0

파일 보기

파일: AS_BAT_Outputs.C 프로젝트: ondovb/canu

//  For every unitig, report the best overlaps contained in the
//  unitig, and all overlaps contained in the unitig.
//
//  Wow, this is ancient.
//
void
writeOverlapsUsed(UnitigVector &unitigs,
                  char         *fileprefix) {
  char         filename[FILENAME_MAX] = {0};
#if 0
  GenericMesg  pmesg;
  OverlapMesg  omesg;
#endif

  sprintf(filename, "%s.unused.ovl", fileprefix);
  FILE *file = fopen(filename, "w");
  assert(file != NULL);

#if 0
  for (uint32  ti=0; ti<unitigs.size(); ti++) {
    Unitig  *utg = unitigs[ti];

    if (utg == NULL)
      continue;

    for (uint32 fi=0; fi<utg->ufpath.size(); fi++) {
      ufNode  *frg = &utg->ufpath[fi];

      //  Where is our best overlap?  Contained or dovetail?

      BestEdgeOverlap *bestedge5 = OG->getBestEdgeOverlap(frg->ident, false);
      BestEdgeOverlap *bestedge3 = OG->getBestEdgeOverlap(frg->ident, true);

      int              bestident5 = 0;
      int              bestident3 = 0;

      if (bestedge5) {
        bestident5 = bestedge5->fragId();

        if ((bestident5 > 0) && (utg->fragIn(bestident5) != utg->id())) {
          omesg.aifrag          = frg->ident;
          omesg.bifrag          = bestident5;
          omesg.ahg             = bestedge5->ahang();
          omesg.bhg             = bestedge5->bhang();
          omesg.orientation.setIsUnknown();
          omesg.overlap_type    = AS_DOVETAIL;
          omesg.quality         = 0.0;
          omesg.min_offset      = 0;
          omesg.max_offset      = 0;
          omesg.polymorph_ct    = 0;
          omesg.alignment_trace = NULL;
#ifdef AS_MSG_USE_OVL_DELTA
          omesg.alignment_delta = NULL;
#endif

          //  This overlap is off of the 5' end of this fragment.
          if (bestedge5->frag3p() == false)
            omesg.orientation.setIsOuttie();
          if (bestedge5->frag3p() == true)
            omesg.orientation.setIsAnti();

          pmesg.t = MESG_OVL;
          pmesg.m = &omesg;

          WriteProtoMesg_AS(file, &pmesg);
        }
      }

      if (bestedge3) {
        bestident3 = bestedge3->fragId();

        if ((bestident3 > 0) && (utg->fragIn(bestident3) != utg->id())) {
          omesg.aifrag          = frg->ident;
          omesg.bifrag          = bestident3;
          omesg.ahg             = bestedge3->ahang();
          omesg.bhg             = bestedge3->bhang();
          omesg.orientation.setIsUnknown();
          omesg.overlap_type    = AS_DOVETAIL;
          omesg.quality         = 0.0;
          omesg.min_offset      = 0;
          omesg.max_offset      = 0;
          omesg.polymorph_ct    = 0;
          omesg.alignment_trace = NULL;
#ifdef AS_MSG_USE_OVL_DELTA
          omesg.alignment_delta = NULL;
#endif

          //  This overlap is off of the 3' end of this fragment.
          if (bestedge3->frag3p() == false)
            omesg.orientation.setIsNormal();
          if (bestedge3->frag3p() == true)
            omesg.orientation.setIsInnie();

          pmesg.t = MESG_OVL;
          pmesg.m = &omesg;

          WriteProtoMesg_AS(file, &pmesg);
        }
      }
    }
  }
#endif

  fclose(file);
}

예제 #13

0

파일 보기

파일: AS_BAT_SplitDiscontinuous.C 프로젝트: xtmgah/canu

//  After splitting and ejecting some contains, check for discontinuous unitigs.
//
void splitDiscontinuousUnitigs(UnitigVector &unitigs, uint32 minOverlap) {

  writeLog("==> SPLIT DISCONTINUOUS\n");

  uint32                numTested  = 0;
  uint32                numSplit   = 0;
  uint32                numCreated = 0;

  uint32                splitFragsLen = 0;
  uint32                splitFragsMax = 0;
  ufNode               *splitFrags    = NULL;

  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig  *tig = unitigs[ti];

    if ((tig == NULL) || (tig->ufpath.size() < 2))
      continue;

    //  Unitig must be sorted.  Someone upstream os screwing this up.
    tig->sort();

    //  We'll want to build an array of new fragments to split out.  This can be up
    //  to the size of the largest unitig.
    splitFragsMax = MAX(splitFragsMax, tig->ufpath.size());

    //  Check that the unitig starts at position zero.  Not critical for the next loop, but
    //  needs to be dome sometime.
    int32   minPos = MIN(tig->ufpath[0].position.bgn, tig->ufpath[0].position.end);

    if (minPos == 0)
      continue;

    writeLog("splitDiscontinuous()-- tig "F_U32" offset messed up; reset by "F_S32".\n", tig->id(), minPos);

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode  *frg = &tig->ufpath[fi];

      frg->position.bgn -= minPos;
      frg->position.end -= minPos;
    }
  }

  splitFrags = new ufNode [splitFragsMax];

  //  Now, finally, we can check for gaps in unitigs.

  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig  *tig = unitigs[ti];

    if ((tig == NULL) || (tig->ufpath.size() < 2))
      continue;

    //  We don't expect many unitigs to be broken, so we'll do a first quick pass to just
    //  test if it is.

    int32  maxEnd   = MAX(tig->ufpath[0].position.bgn, tig->ufpath[0].position.end);
    bool   isBroken = false;

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode  *frg = &tig->ufpath[fi];

      int32    bgn = MIN(frg->position.bgn, frg->position.end);
      int32    end = MAX(frg->position.bgn, frg->position.end);

      if (bgn > maxEnd - minOverlap) {
        isBroken = true;
        break;
      }

      maxEnd = MAX(maxEnd, end);
    }

    numTested++;

    if (isBroken == false)
      continue;

    numSplit++;

    //  Dang, busted unitig.  Fix it up.

    splitFragsLen = 0;
    maxEnd        = 0;

    if (logFileFlagSet(LOG_MATE_SPLIT_DISCONTINUOUS))
      writeLog("splitDiscontinuous()-- discontinuous tig "F_U32" with "F_SIZE_T" fragments broken into:\n",
              tig->id(), tig->ufpath.size());

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode  *frg = &tig->ufpath[fi];

      int32    bgn = MIN(frg->position.bgn, frg->position.end);
      int32    end = MAX(frg->position.bgn, frg->position.end);

      //  Good thick overlap exists to this fragment, save it.
      if (bgn <= maxEnd - minOverlap) {
        assert(splitFragsLen < splitFragsMax);
        splitFrags[splitFragsLen++] = *frg;
        maxEnd = MAX(maxEnd, end);
        continue;
      }

      //  No thick overlap found.  We need to break right here before the current fragment.

      //  If there is exactly one fragment, and it's contained, and it's not mated, move it to the
      //  container.  (This has a small positive benefit over just making every read a singleton).
      //
      if ((splitFragsLen == 1) &&
          (FI->mateIID(splitFrags[0].ident) == 0) &&
          (splitFrags[0].contained != 0)) {
        Unitig  *dangler  = unitigs[tig->fragIn(splitFrags[0].contained)];

        //  If the parent isn't in a unitig, we must have shattered the repeat unitig it was in.
        //  Do the same here.

        if (dangler == NULL) {
          if (logFileFlagSet(LOG_MATE_SPLIT_DISCONTINUOUS))
            writeLog("splitDiscontinuous()--   singleton frag "F_U32" shattered.\n",
                    splitFrags[0].ident);
          Unitig::removeFrag(splitFrags[0].ident);

        } else {
          assert(dangler->id() == tig->fragIn(splitFrags[0].contained));

          if (logFileFlagSet(LOG_MATE_SPLIT_DISCONTINUOUS))
            writeLog("splitDiscontinuous()--   old tig "F_U32" with "F_SIZE_T" fragments (contained frag "F_U32" moved here).\n",
                    dangler->id(), dangler->ufpath.size() + 1, splitFrags[0].ident);

          BestContainment  *bestcont = OG->getBestContainer(splitFrags[0].ident);

          assert(bestcont->isContained == true);

          dangler->addContainedFrag(splitFrags[0].ident, bestcont, false);
          dangler->bubbleSortLastFrag();

          assert(dangler->id() == Unitig::fragIn(splitFrags[0].ident));
        }
      }

      //  Otherwise, make an entirely new unitig for these fragments.
      else {
        numCreated++;
        makeNewUnitig(unitigs, splitFragsLen, splitFrags);
        tig = unitigs[ti];
      }

      //  Done with the split, save the current fragment.  This resets everything.

      splitFragsLen = 0;
      splitFrags[splitFragsLen++] = *frg;

      maxEnd = end;
    }


    //  If we did any splitting, then the length of the frags in splitFrags will be less than the length
    //  of the path in the current unitig.  Make a final new unitig for the remaining fragments.
    //
    if (splitFragsLen != tig->ufpath.size()) {
      numCreated++;
      makeNewUnitig(unitigs, splitFragsLen, splitFrags);

      delete unitigs[ti];
      unitigs[ti] = NULL;
    }
  }

  writeLog("splitDiscontinuous()-- Tested "F_U32" unitigs, split "F_U32" into "F_U32" new unitigs.\n",
          numTested, numSplit, numCreated);

  delete [] splitFrags;
}

예제 #14

0

파일 보기

파일: AS_BAT_PopBubbles.C 프로젝트: AndreasHegerGenomics/canu

vector<overlapPlacement>  *
findBubbleReadPlacements(UnitigVector    &unitigs,
                         BubTargetList   &potentialBubbles,
                         double           deviationBubble) {
  uint32  fiLimit      = FI->numFragments();
  uint32  fiNumThreads = omp_get_max_threads();
  uint32  fiBlockSize  = (fiLimit < 1000 * fiNumThreads) ? fiNumThreads : fiLimit / 999;

  vector<overlapPlacement>   *placed = new vector<overlapPlacement> [fiLimit + 1];

#pragma omp parallel for schedule(dynamic, fiBlockSize)
  for (uint32 fi=0; fi<fiLimit; fi++) {
    uint32     rdAtigID = Unitig::fragIn(fi);

    if ((rdAtigID == 0) ||                           //  Read not placed in a tig, ignore it.
        (OG->isContained(fi)) ||                     //  Read is contained, ignore it.
        (potentialBubbles.count(rdAtigID) == 0))     //  Read isn't in a potential bubble, ignore it.
      continue;

    Unitig     *rdAtig   = unitigs[rdAtigID];
    ufNode     *rdA      = &rdAtig->ufpath[ Unitig::pathPosition(fi) ];
    bool        rdAfwd   = (rdA->position.bgn < rdA->position.end);
    int32       rdAlo    = (rdAfwd) ? rdA->position.bgn : rdA->position.end;
    int32       rdAhi    = (rdAfwd) ? rdA->position.end : rdA->position.bgn;

    uint32      ovlLen   = 0;
    BAToverlap *ovl      = OC->getOverlaps(rdA->ident, AS_MAX_ERATE, ovlLen);

    set<uint32> intersections;

    //if ((fi % 100) == 0)
    //  fprintf(stderr, "findBubbleReadPlacements()-- read %8u with %6u overlaps - %6.2f%% finished.\r",
    //          rdA->ident, ovlLen, 100.0 * fi / fiLimit);

    //  Compute all placements for this read.

    vector<overlapPlacement>   placements;

    placeFragUsingOverlaps(unitigs, AS_MAX_ERATE, NULL, rdA->ident, placements);

    //  Weed out placements that aren't for bubbles, or that are for bubbles but are poor quality.  Or are to ourself!

    for (uint32 pi=0; pi<placements.size(); pi++) {
      uint32    rdBtigID = placements[pi].tigID;
      Unitig   *rdBtig   = unitigs[rdBtigID];

      uint32    lo       = (placements[pi].position.bgn < placements[pi].position.end) ? placements[pi].position.bgn : placements[pi].position.end;
      uint32    hi       = (placements[pi].position.bgn < placements[pi].position.end) ? placements[pi].position.end : placements[pi].position.bgn;

      double    erate    = placements[pi].errors / placements[pi].aligned;

      //  Ignore the placement if it is to ourself.

      if (rdAtigID == rdBtigID) {
        //writeLog("tig %6u frag %8u -> tig %6u %6u reads at %8u-%8u (cov %7.5f erate %6.4f) - SAME TIG\n",
        //         rdAtigID, placements[pi].frgID, placements[pi].tigID, rdBtig->ufpath.size(), placements[pi].position.bgn, placements[pi].position.end, placements[pi].fCoverage, erate);
        continue;
      }

      //  Ignore the placement if it is to a non-tig / singleton read, or if it didn't place the
      //  read fully.

      if ((rdBtigID == 0) ||
          (rdBtig   == NULL) ||
          (rdBtig->ufpath.size() == 1) ||
          (placements[pi].fCoverage < 0.99)) {
        if (logFileFlagSet(LOG_BUBBLE_DETAIL))
          writeLog("tig %6u frag %8u -> tig %6u %6u reads at %8u-%8u (cov %7.5f erate %6.4f) - PARTIALLY PLACED\n",
                   rdAtigID, placements[pi].frgID, placements[pi].tigID, rdBtig->ufpath.size(), placements[pi].position.bgn, placements[pi].position.end, placements[pi].fCoverage, erate);
        continue;
      }

      //  Ignore the placement if it isn't to one of our bubble-popping candidate unitigs.

      bool             dontcare = true;
      vector<uint32>  &pbubbles = potentialBubbles[rdAtigID];

      for (uint32 pb=0; pb<pbubbles.size(); pb++) {
        if (pbubbles[pb] == rdBtigID)
          dontcare = false;
      }

      if (dontcare) {
        if (logFileFlagSet(LOG_BUBBLE_DETAIL))
          writeLog("tig %6u frag %8u -> tig %6u %6u reads at %8u-%8u (cov %7.5f erate %6.4f) - NOT CANDIDATE TIG\n",
                   rdAtigID, placements[pi].frgID, placements[pi].tigID, rdBtig->ufpath.size(), placements[pi].position.bgn, placements[pi].position.end, placements[pi].fCoverage, erate);
        continue;
      }

      //  Ignore the placement if it is too diverged from the destination tig.

      if (rdBtig->overlapConsistentWithTig(deviationBubble, lo, hi, erate) < 0.5) {
        if (logFileFlagSet(LOG_BUBBLE_DETAIL))
          writeLog("tig %6u frag %8u -> tig %6u %6u reads at %8u-%8u (cov %7.5f erate %6.4f) - HIGH ERROR\n",
                   rdAtigID, placements[pi].frgID, placements[pi].tigID, rdBtig->ufpath.size(), placements[pi].position.bgn, placements[pi].position.end, placements[pi].fCoverage, erate);
        continue;
      }

      //  Good placement!

      if (logFileFlagSet(LOG_BUBBLE_DETAIL))
        writeLog("tig %6u frag %8u -> tig %6u %6u reads at %8u-%8u (cov %7.5f erate %6.4f)\n",
                 rdAtigID, placements[pi].frgID, placements[pi].tigID, rdBtig->ufpath.size(), placements[pi].position.bgn, placements[pi].position.end, placements[pi].fCoverage, erate);

      placed[fi].push_back(placements[pi]);
    }
  }

  return(placed);
}

예제 #15

0

파일 보기

파일: AS_BAT_PlaceContains.C 프로젝트: Ecogpr/canu

void
placeUnplacedUsingAllOverlaps(UnitigVector &unitigs,
                              const char   *prefix) {
  uint32  fiLimit    = FI->numFragments();
  uint32  numThreads = omp_get_max_threads();
  uint32  blockSize  = (fiLimit < 100 * numThreads) ? numThreads : fiLimit / 99;

  uint32       *placedTig = new uint32      [FI->numFragments() + 1];
  SeqInterval  *placedPos = new SeqInterval [FI->numFragments() + 1];

  memset(placedTig, 0, sizeof(uint32)      * (FI->numFragments() + 1));
  memset(placedPos, 0, sizeof(SeqInterval) * (FI->numFragments() + 1));

  //  Just some logging.  Count the number of reads we try to place.

  uint32   nToPlaceContained = 0;
  uint32   nToPlace          = 0;
  uint32   nPlacedContained  = 0;
  uint32   nPlaced           = 0;
  uint32   nFailedContained  = 0;
  uint32   nFailed           = 0;

  for (uint32 fid=1; fid<FI->numFragments()+1; fid++)
    if (Unitig::fragIn(fid) == 0)
      if (OG->isContained(fid))
        nToPlaceContained++;
      else
        nToPlace++;

  writeLog("placeContains()-- placing %u contained and %u unplaced reads, with %d threads.\n",
           nToPlaceContained, nToPlace, numThreads);

  //  Do the placing!

#pragma omp parallel for schedule(dynamic, blockSize)
  for (uint32 fid=1; fid<FI->numFragments()+1; fid++) {
    bool  enableLog = true;

    if (Unitig::fragIn(fid) > 0)
      continue;

    //  Place the read.

    vector<overlapPlacement>   placements;

    placeFragUsingOverlaps(unitigs, AS_MAX_ERATE, NULL, fid, placements);

    //  Search the placements for the highest expected identity placement using all overlaps in the unitig.

    uint32   b = UINT32_MAX;

    for (uint32 i=0; i<placements.size(); i++) {
      Unitig *tig = unitigs[placements[i].tigID];

      if (placements[i].fCoverage < 0.99)                   //  Ignore partially placed reads.
        continue;

      if (tig->ufpath.size() == 1)  //  Ignore placements in singletons.
        continue;

      uint32  bgn   = (placements[i].position.bgn < placements[i].position.end) ? placements[i].position.bgn : placements[i].position.end;
      uint32  end   = (placements[i].position.bgn < placements[i].position.end) ? placements[i].position.end : placements[i].position.bgn;

      double  erate = placements[i].errors / placements[i].aligned;

      if (tig->overlapConsistentWithTig(5.0, bgn, end, erate) < 0.5) {
        if ((enableLog == true) && (logFileFlagSet(LOG_PLACE_UNPLACED)))
          writeLog("frag %8u tested tig %6u (%6u reads) at %8u-%8u (cov %7.5f erate %6.4f) - HIGH ERROR\n",
                   fid, placements[i].tigID, tig->ufpath.size(), placements[i].position.bgn, placements[i].position.end, placements[i].fCoverage, erate);
        continue;
      }

      if ((enableLog == true) && (logFileFlagSet(LOG_PLACE_UNPLACED)))
        writeLog("frag %8u tested tig %6u (%6u reads) at %8u-%8u (cov %7.5f erate %6.4f)\n",
                 fid, placements[i].tigID, tig->ufpath.size(), placements[i].position.bgn, placements[i].position.end, placements[i].fCoverage, erate);

      if ((b == UINT32_MAX) ||
          (placements[i].errors / placements[i].aligned < placements[b].errors / placements[b].aligned))
        b = i;
    }

    //  If we didn't find a best, b will be invalid; set positions for adding to a new tig.
    //  If we did, save both the position it was placed at, and the tigID it was placed in.

    if (b == UINT32_MAX) {
      if ((enableLog == true) && (logFileFlagSet(LOG_PLACE_UNPLACED)))
        writeLog("frag %8u remains unplaced\n", fid);
      placedPos[fid].bgn = 0;
      placedPos[fid].end = FI->fragmentLength(fid);
    }

    else {
      if ((enableLog == true) && (logFileFlagSet(LOG_PLACE_UNPLACED)))
        writeLog("frag %8u placed tig %6u (%6u reads) at %8u-%8u (cov %7.5f erate %6.4f)\n",
                 fid, placements[b].tigID, unitigs[placements[b].tigID]->ufpath.size(),
                 placements[b].position.bgn, placements[b].position.end,
                 placements[b].fCoverage,
                 placements[b].errors / placements[b].aligned);
      placedTig[fid] = placements[b].tigID;
      placedPos[fid] = placements[b].position;
    }
  }

  //  All reads placed, now just dump them in their correct tigs.

  for (uint32 fid=1; fid<FI->numFragments()+1; fid++) {
    Unitig  *tig = NULL;
    ufNode   frg;

    if (Unitig::fragIn(fid) > 0)
      continue;

    //  If not placed, dump it in a new unitig.  Well, not anymore.  These reads were not placed in
    //  any tig initially, were not allowed to seed a tig, and now, could find no place to go.
    //  They're garbage.  Plus, it screws up the logging above because we don't know the new tig ID
    //  until now.

    if (placedTig[fid] == 0) {
      if (OG->isContained(fid))
        nFailedContained++;
      else
        nFailed++;

      //tig = unitigs.newUnitig(false);
    }

    //  Otherwise, it was placed somewhere, grab the tig.

    else {
      if (OG->isContained(fid))
        nPlacedContained++;
      else
        nPlaced++;

      tig = unitigs[placedTig[fid]];
    }

    //  Regardless, add it to the tig.  Logging for this is above.

    if (tig) {
      frg.ident             = fid;
      frg.contained         = 0;
      frg.parent            = 0;
      frg.ahang             = 0;
      frg.bhang             = 0;
      frg.position          = placedPos[fid];

      tig->addFrag(frg, 0, false);
    }
  }

  //  Cleanup.

  delete [] placedPos;
  delete [] placedTig;

  writeLog("placeContains()-- Placed %u contained reads and %u unplaced reads.\n", nPlacedContained, nPlaced);
  writeLog("placeContains()-- Failed to place %u contained reads (too high error suspected) and %u unplaced reads (lack of overlaps suspected).\n", nFailedContained, nFailed);

  //  But wait!  All the tigs need to be sorted.  Well, not really _all_, but the hard ones to sort
  //  are big, and those quite likely had reads added to them, so it's really not worth the effort
  //  of tracking which ones need sorting, since the ones that don't need it are trivial to sort.

  for (uint32 ti=1; ti<unitigs.size(); ti++) {
    Unitig *utg = unitigs[ti];

    if (utg)
      utg->sort();
  }
}

예제 #16

0

파일 보기

파일: AS_BAT_PopBubbles.C 프로젝트: AndreasHegerGenomics/canu

void
popBubbles(UnitigVector &unitigs,
           double deviationBubble) {

  BubTargetList   potentialBubbles;

  findPotentialBubbles(unitigs, potentialBubbles);

  writeStatus("popBubbles()-- Found "F_SIZE_T" potential bubbles.\n", potentialBubbles.size());

  //if (potentialBubbles.size() == 0)
  //  return;

  writeLog("\n");
  writeLog("Found "F_SIZE_T" potential bubbles.\n", potentialBubbles.size());
  writeLog("\n");

  vector<overlapPlacement>   *placed = findBubbleReadPlacements(unitigs, potentialBubbles, deviationBubble);

  //  We now have, in 'placed', a list of all the places that each read could be placed.  Decide if there is a _single_
  //  place for each bubble to be popped.

  uint32  tiLimit      = unitigs.size();
  //uint32  tiNumThreads = omp_get_max_threads();
  //uint32  tiBlockSize  = (tiLimit < 100000 * tiNumThreads) ? tiNumThreads : tiLimit / 99999;

  //  Clear flags.
  for (uint32 ti=0; ti<tiLimit; ti++) {
    if (unitigs[ti]) {
      unitigs[ti]->_isBubble = false;
      unitigs[ti]->_isRepeat = false;
    }
  }

  //  In parallel, process the placements.

  for (uint32 ti=0; ti<tiLimit; ti++) {
    if (potentialBubbles.count(ti) == 0)   //  Not a potential bubble
      continue;

    //  Scan the bubble, decide if there are _ANY_ read placements.  Log appropriately.

    Unitig  *bubble = unitigs[ti];
    bool     hasPlacements = false;

    for (uint32 fi=0; fi<bubble->ufpath.size(); fi++) {
      uint32  readID  = bubble->ufpath[fi].ident;

      if (placed[readID].size() > 0)
        hasPlacements = true;
    }

    if (hasPlacements == false)
      writeLog("potential bubble %u had no valid placements (all were not contained in target tig)\n", ti);
    else
      writeLog("potential bubble %u\n", ti);

    //  Split the placements into piles for each target and build an interval list for each target.
    //  For each read in the tig, convert the vector of placements into interval lists, one list per target tig.

    map<uint32, intervalList<uint32> *>  targetIntervals;

    for (uint32 fi=0; fi<bubble->ufpath.size(); fi++) {
      uint32  readID  = bubble->ufpath[fi].ident;

      for (uint32 pp=0; pp<placed[readID].size(); pp++) {
        uint32  tid = placed[readID][pp].tigID;

        assert(placed[readID][pp].frgID > 0);

        uint32  bgn = (placed[readID][pp].position.bgn < placed[readID][pp].position.end) ? placed[readID][pp].position.bgn : placed[readID][pp].position.end;
        uint32  end = (placed[readID][pp].position.bgn < placed[readID][pp].position.end) ? placed[readID][pp].position.end : placed[readID][pp].position.bgn;

        if (targetIntervals[tid] == NULL)
          targetIntervals[tid] = new intervalList<uint32>;

        //writeLog("read %u -> tig %u intervals %u-%u\n", readID, tid, bgn, end);

        targetIntervals[tid]->add(bgn, end-bgn);
      }
    }

    vector<candidatePop *>    targets;

    //  Squish the intervals.  Create new candidatePops for each interval that isn't too big or
    //  small.  Assign each overlapPlacements to the correct candidatePop.

    for (map<uint32, intervalList<uint32> *>::iterator it=targetIntervals.begin(); it != targetIntervals.end(); ++it) {
      uint32                 targetID = it->first;
      intervalList<uint32>  *IL       = it->second;

      IL->merge();

      //  Discard intervals that are significantly too small or large.  Save the ones that are
      //  nicely sized.  Logging here isn't terribly useful, it's just repeated (out of order) later
      //  when we try to make sense of the read alignments.

      for (uint32 ii=0; ii<IL->numberOfIntervals(); ii++) {
        if ((IL->hi(ii) - IL->lo(ii) < 0.75 * bubble->getLength()) ||   //  Too small!
            (1.25 * bubble->getLength() < IL->hi(ii) - IL->lo(ii))) {   //  Too big!
          writeLog("tig %8u length %9u -> target %8u piece %2u position %9u-%9u length %8u - size mismatch, discarded\n",
                   bubble->id(), bubble->getLength(),
                   targetID, ii, IL->lo(ii), IL->hi(ii), IL->hi(ii) - IL->lo(ii));
          continue;
        }

        writeLog("tig %8u length %9u -> target %8u piece %2u position %9u-%9u length %8u\n",
                 bubble->id(), bubble->getLength(),
                 targetID, ii, IL->lo(ii), IL->hi(ii), IL->hi(ii) - IL->lo(ii));

        targets.push_back(new candidatePop(bubble, unitigs[targetID], IL->lo(ii), IL->hi(ii)));
      }

      delete IL;
    }

    targetIntervals.clear();

    //  If no targets, nothing to do.

    if (targets.size() == 0)
      continue;

    //  Run through the placements again, and assign them to the correct target.
    //
    //  For each read:
    //  For each acceptable placement:
    //  For each target location:
    //  If the placement is for this target, save it.

    for (uint32 fi=0; fi<bubble->ufpath.size(); fi++) {
      uint32  readID  = bubble->ufpath[fi].ident;

      for (uint32 pp=0; pp<placed[readID].size(); pp++) {
        uint32  tid = placed[readID][pp].tigID;

        uint32  bgn = (placed[readID][pp].position.bgn < placed[readID][pp].position.end) ? placed[readID][pp].position.bgn : placed[readID][pp].position.end;
        uint32  end = (placed[readID][pp].position.bgn < placed[readID][pp].position.end) ? placed[readID][pp].position.end : placed[readID][pp].position.bgn;

        for (uint32 tt=0; tt<targets.size(); tt++)
          if ((targets[tt]->target->id() == tid) &&
              (targets[tt]->bgn < end) && (bgn < targets[tt]->end))
            targets[tt]->placed.push_back(placed[readID][pp]);
      }
    }

    //  Count the number of targets that have all the reads (later: in the correct order, etc, etc).  Remove those
    //  that don't.

    uint32  nTargets = 0;

    set<uint32>  tigReads;  //  Reads in the bubble tig.
    set<uint32>  tgtReads;  //  Reads in the bubble that have a placement in the target.

    //  Remove duplicate placements from each target.

    for (uint32 tt=0; tt<targets.size(); tt++) {
      candidatePop *t = targets[tt];

      //  Detect duplicates, keep the one with lower error.  There are a lot of duplicate
      //  placements, logging isn't terribly useful.

      for (uint32 aa=0; aa<t->placed.size(); aa++) {
        for (uint32 bb=0; bb<t->placed.size(); bb++) {
          if ((aa == bb) ||
              (t->placed[aa].frgID != t->placed[bb].frgID) ||
              (t->placed[aa].frgID == 0) ||
              (t->placed[bb].frgID == 0))
            continue;

          if (t->placed[aa].errors / t->placed[aa].aligned < t->placed[bb].errors / t->placed[bb].aligned) {
#ifdef SHOW_MULTIPLE_PLACEMENTS
            writeLog("duplicate read alignment for tig %u read %u - better %u-%u %.4f - worse %u-%u %.4f\n",
                     t->placed[aa].tigID, t->placed[aa].frgID,
                     t->placed[aa].position.bgn, t->placed[aa].position.end, t->placed[aa].errors / t->placed[aa].aligned,
                     t->placed[bb].position.bgn, t->placed[bb].position.end, t->placed[bb].errors / t->placed[bb].aligned);
#endif
            t->placed[bb] = overlapPlacement();
          } else {
#ifdef SHOW_MULTIPLE_PLACEMENTS
            writeLog("duplicate read alignment for tig %u read %u - better %u-%u %.4f - worse %u-%u %.4f\n",
                     t->placed[aa].tigID, t->placed[aa].frgID,
                     t->placed[bb].position.bgn, t->placed[bb].position.end, t->placed[bb].errors / t->placed[bb].aligned,
                     t->placed[aa].position.bgn, t->placed[aa].position.end, t->placed[aa].errors / t->placed[aa].aligned);
#endif
            t->placed[aa] = overlapPlacement();
          }
        }
      }

      //  Get rid of any now-empty entries.

      for (uint32 aa=t->placed.size(); aa--; ) {
        if (t->placed[aa].frgID == 0) {
          t->placed[aa] = t->placed.back();
          t->placed.pop_back();
        }
      }
    }

    //  Make a set of the reads in the bubble.  We'll compare each target against this to decide if all reads are placed.

    for (uint32 fi=0; fi<bubble->ufpath.size(); fi++)
      tigReads.insert(bubble->ufpath[fi].ident);

    uint32   nOrphan      = 0;   //  Full coverage; bubble can be popped.
    uint32   orphanTarget = 0;

    uint32   nBubble      = 0;   //  Partial coverage, bubble cannot be popped.
    uint32   bubbleTarget = 0;

    for (uint32 tt=0; tt<targets.size(); tt++) {
      tgtReads.clear();

      for (uint32 op=0; op<targets[tt]->placed.size(); op++) {
        if (logFileFlagSet(LOG_BUBBLE_DETAIL))
          writeLog("tig %8u length %9u -> target %8u piece %2u position %9u-%9u length %8u - read %7u at %9u-%9u\n",
                   bubble->id(), bubble->getLength(),
                   targets[tt]->target->id(), tt, targets[tt]->bgn, targets[tt]->end, targets[tt]->end - targets[tt]->bgn,
                   targets[tt]->placed[op].frgID,
                   targets[tt]->placed[op].position.bgn, targets[tt]->placed[op].position.end);

        assert(targets[tt]->placed[op].frgID > 0);
        tgtReads.insert(targets[tt]->placed[op].frgID);
      }

      //  Count the number of consecutive reads from the 5' or 3' end of the bubble that are placed
      //  in the target.
      //
      //  Also, count the number of reads in the bubble that are placed in the target.  Likely the
      //  same as n5 + n3.

      uint32  n5 = 0;
      uint32  n3 = 0;
      uint32  nt = 0;

      for (uint32 fi=0; fi<bubble->ufpath.size(); fi++)
        if (tgtReads.count(bubble->ufpath[fi].ident) > 0)
          n5++;
        else
          break;

      for (uint32 fi=bubble->ufpath.size(); fi-->0; )
        if (tgtReads.count(bubble->ufpath[fi].ident) > 0)
          n3++;
        else
          break;


      for (uint32 fi=0; fi<bubble->ufpath.size(); fi++)
        if (tgtReads.count(bubble->ufpath[fi].ident) > 0)
          nt++;


      //  Report now, before we nuke targets[tt] for being not a bubble!

      if ((nt == bubble->ufpath.size()) ||
          ((n5 > 0) && (n3 > 0)))
        writeLog("tig %8u length %9u -> target %8u piece %2u position %9u-%9u length %8u - expected %3"F_SIZE_TP" reads, had %3"F_SIZE_TP" reads.  n5=%3u n3=%3u nt=%3u\n",
                 bubble->id(), bubble->getLength(),
                 targets[tt]->target->id(), tt, targets[tt]->bgn, targets[tt]->end, targets[tt]->end - targets[tt]->bgn,
                 tigReads.size(),
                 tgtReads.size(), n5, n3, nt);

      //  Decide if this is a bubble, orphan from construction, or repeat.

      if (nt == bubble->ufpath.size()) {
        nOrphan++;
        orphanTarget = tt;
      }

      else if ((n5 > 0) && (n3 > 0)) {
        nBubble++;
        bubbleTarget = tt;
      }
    }

    //  If no placements, pbbbt.

    if (nOrphan + nBubble == 0) {
      //writeLog("tig %8u length %8u reads %6u had no bubble or orphan placements.\n", bubble->id(), bubble->getLength(), bubble->ufpath.size());
      continue;
    }

    //  If multiple orphan and/or bubble placements, it's a repeat.

    if (nOrphan + nBubble > 1) {
      writeLog("tig %8u length %8u reads %6u - repeat - %u orphan %u bubble placements.\n",
               bubble->id(), bubble->getLength(), bubble->ufpath.size(),
               nOrphan, nBubble);
      writeLog("\n");
      bubble->_isRepeat = true;
      continue;
    }

    //  If a bubble placement, mark it as a bubble so it can be skipped during repeat detection.

    if (nBubble > 0) {
      writeLog("tig %8u length %8u reads %6u - bubble\n",
               bubble->id(), bubble->getLength(), bubble->ufpath.size());
      writeLog("\n");
      bubble->_isBubble = true;
      continue;
    }

    //  Otherwise, it's an orphan, move the reads to the proper place.

    writeLog("tig %8u length %8u reads %6u - orphan\n", bubble->id(), bubble->getLength(), bubble->ufpath.size());

    for (uint32 op=0, tt=orphanTarget; op<targets[tt]->placed.size(); op++) {
      ufNode  frg;

      frg.ident        = targets[tt]->placed[op].frgID;
      frg.contained    = 0;
      frg.parent       = 0;
      frg.ahang        = 0;
      frg.bhang        = 0;
      frg.position.bgn = targets[tt]->placed[op].position.bgn;
      frg.position.end = targets[tt]->placed[op].position.end;

      writeLog("move read %u from tig %u to tig %u %u-%u\n",
               frg.ident,
               bubble->id(),
               targets[tt]->target->id(), frg.position.bgn, frg.position.end);

      targets[tt]->target->addFrag(frg, 0, false);
    }

    writeLog("\n");

    unitigs[bubble->id()] = NULL;
    delete bubble;
  }  //  Over all bubbles

  writeLog("\n");   //  Needed if no bubbles are popped.

  delete [] placed;

  //  Sort reads in all the tigs.  Overkill, but correct.

  for (uint32 ti=0; ti<tiLimit; ti++) {
    Unitig  *tig = unitigs[ti];

    if ((tig == NULL) ||               //  Not a tig, ignore it.
        (tig->ufpath.size() == 1))     //  Singleton, already sorted.
      continue;

    tig->sort();
  }
}

예제 #17

0

파일 보기

파일: AS_BAT_PopBubbles.C 프로젝트: AndreasHegerGenomics/canu

void
findPotentialBubbles(UnitigVector    &unitigs,
                     BubTargetList   &potentialBubbles) {
  uint32  tiLimit      = unitigs.size();
  uint32  tiNumThreads = omp_get_max_threads();
  uint32  tiBlockSize  = (tiLimit < 100000 * tiNumThreads) ? tiNumThreads : tiLimit / 99999;

  writeStatus("\n");
  writeStatus("bubbleDetect()-- working on "F_U32" unitigs, with "F_U32" threads.\n", tiLimit, tiNumThreads);

  for (uint32 ti=0; ti<tiLimit; ti++) {
    Unitig  *tig = unitigs[ti];

    if ((tig == NULL) ||               //  Not a tig, ignore it.
        (tig->ufpath.size() == 1))     //  Singleton, handled elsewhere.
      continue;

    uint32  nonContainedReads = 0;
    bool    validBubble       = true;

    map<uint32,uint32>  tigOlapsTo;

    uint32  fiLimit      = tig->ufpath.size();
    uint32  fiNumThreads = omp_get_max_threads();
    uint32  fiBlockSize  = (fiLimit < 100 * fiNumThreads) ? fiNumThreads : fiLimit / 99;

    for (uint32 fi=0; (validBubble == true) && (fi<fiLimit); fi++) {
      uint32      rid      = tig->ufpath[fi].ident;

      if (OG->isContained(rid) == true)  //  Don't need to check contained reads.  If their container
        continue;                        //  passes the tests below, the contained read will too.

      nonContainedReads++;

      uint32      ovlLen   = 0;
      BAToverlap *ovl      = OC->getOverlaps(rid, AS_MAX_ERATE, ovlLen);

      set<uint32>  readOlapsTo;

      for (uint32 oi=0; oi<ovlLen; oi++) {
        uint32  ovlTigID = Unitig::fragIn(ovl[oi].b_iid);
        Unitig *ovlTig   = unitigs[ovlTigID];

        //  Skip this overlap if it is to an unplaced read, to a singleton tig, to ourself,
        //  or to a unitig that is shorter than us.  We can not pop this tig as a bubble
        //  in any of those cases.

        if ((ovlTigID == 0) ||
            (ovlTig == NULL) ||
            (ovlTig->ufpath.size() == 1) ||
            (ovlTig->id() == tig->id()) ||
            (ovlTig->getLength() < tig->getLength()))
          continue;

        //  Otherwise, remember that we had an overlap to ovlTig.

        //writeLog("tig %u read %u overlap to tig %u read %u\n",
        //         tig->id(), rid, ovlTigID, ovl[oi].b_iid);

        readOlapsTo.insert(ovlTigID);
      }

      //writeLog("tig %8u read %8u has %u olaps\n", tig->id(), rid, readOlapsTo.size());

      //  Transfer the per-read counts to the per-unitig counts:  add one to the counter for each tig
      //  that we have overlaps to.

      for (set<uint32>::iterator it=readOlapsTo.begin(); it != readOlapsTo.end(); ++it)
        tigOlapsTo[*it]++;

      //  Decide if we're a valid potential bubble.  If tig id (in it->first) has overlaps to every
      //  read we've seen so far (nonContainedReads), we're still a valid bubble.
      //
      //  To _attempt_ to have differences in the bubble, we'll accept it if 3/4 of the reads
      //  have overlaps.

      validBubble = false;

      for (map<uint32,uint32>::iterator it=tigOlapsTo.begin(); it != tigOlapsTo.end(); ++it)
        if (it->second >= BUBBLE_READ_FRACTION * nonContainedReads)
          validBubble = true;

      //  If we've not seen that many reads, pretend it's a valid bubble.  It'll get screened out later.

      if (nonContainedReads < 16)
        validBubble = true;
    }

    //  If not validBubble, report.

#if 0
    if (validBubble == false) {
      writeLog("notValidBubble tig %8d expects %6u reads\n", tig->id(), nonContainedReads);

      for (map<uint32,uint32>::iterator it=tigOlapsTo.begin(); it != tigOlapsTo.end(); ++it)
        writeLog("  to tig %8u overlaps %6u\n", it->first, it->second);
    }
#endif

    //  If validBubble, then there is a tig that every dovetail read has at least one overlap to.
    //  Save those tigs in potentialBubbles.

    uint32  nTigs = 0;

    if (validBubble) {
      for (map<uint32,uint32>::iterator it=tigOlapsTo.begin(); it != tigOlapsTo.end(); ++it)
        if (it->second >= BUBBLE_READ_FRACTION * nonContainedReads)
          nTigs++;
    }

    //  ALWAYS log potential bubbles.

    if (nTigs > 0) {
      writeLog("\n");
      writeLog("potential bubble tig %8u length %9u nReads %7u to %3u tigs:\n",
               tig->id(), tig->getLength(), tig->ufpath.size(), nTigs);

      for (map<uint32,uint32>::iterator it=tigOlapsTo.begin(); it != tigOlapsTo.end(); ++it) {
        if (it->second >= BUBBLE_READ_FRACTION * nonContainedReads) {
          Unitig  *dest = unitigs[it->first];

          writeLog("                 tig %8u length %9u nReads %7u\n", dest->id(), dest->getLength(), dest->ufpath.size());

          potentialBubbles[ti].push_back(dest->id());
        }
      }
    }
  }

  flushLog();
}

예제 #18

0

파일 보기

파일: AS_BAT_IntersectSplit.C 프로젝트: ondovb/canu

intersectionList::intersectionList(UnitigVector &unitigs) {

  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig             *tig = unitigs[ti];

    if (tig == NULL)
      continue;

    intersectionEvidence *evidence = new intersectionEvidence [tig->ufpath.size()];

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode  *frg = &tig->ufpath[fi];

      if (OG->isContained(frg->ident))
        continue;

      //  For my best overlap, the ID of the unitig that the overlapping fragment is in.

      evidence[fi].edge5 = *OG->getBestEdgeOverlap(frg->ident, false);
      evidence[fi].edge3 = *OG->getBestEdgeOverlap(frg->ident, true);

      evidence[fi].frag5tig = tig->fragIn(evidence[fi].edge5.fragId());
      evidence[fi].frag3tig = tig->fragIn(evidence[fi].edge3.fragId());

      //  Do NOT initialize these!  An earlier fragment could have already confirmed an end.
      //  Properly, only the 5' end of a forward fragment (or 3' end of a reverse fragment) can be
      //  confirmed already (otherwise the tig is nonsense), but we don't yet check that.
      //
      //evidence[fi].frag5confirmed = false;
      //evidence[fi].frag3confirmed = false;

      //  But, because the path could be promiscuous, not every overlap to a different tig is bad.
      //
      //  If my best overlap is to a different tig, but there is an overlapping fragment (in the
      //  unitig placement) with a best edge to me, I'm still good.  The BOG build this unitig using
      //  the edge from the other fragment to me.
      //
      //  If the fragments do not overlap in the layout (yet the best edge still exists) that is a
      //  self-intersection.
      //
      //  The two blocks are identical, except for 'edge3' and 'edge5'.

      if (evidence[fi].frag5tig == tig->id()) {
        uint32   ti  = tig->pathPosition(evidence[fi].edge5.fragId());
        ufNode  *trg = &tig->ufpath[ti];

        uint32  minf = (frg->position.bgn < frg->position.end) ? frg->position.bgn : frg->position.end;
        uint32  maxf = (frg->position.bgn < frg->position.end) ? frg->position.end : frg->position.bgn;

        uint32  mint = (trg->position.bgn < trg->position.end) ? trg->position.bgn : trg->position.end;
        uint32  maxt = (trg->position.bgn < trg->position.end) ? trg->position.end : trg->position.bgn;

        //  If they overlap, mark as confirmed, else remember an intersection.

        if (((minf < mint) && (mint < maxf)) ||  //  t begins inside f
            ((mint < minf) && (minf < maxt))) {  //  f begins inside t
          if (evidence[fi].edge5.frag3p())
            evidence[ti].frag3confirmed = true;
          else
            evidence[ti].frag5confirmed = true;

        } else {
          evidence[fi].frag5self = true;

          //  Not the correct place to report this.  Some of these get confirmed by later fragments.
          //writeLog("BUG1 F: %d,%d T %d,%d\n", minf, maxf, mint, maxt);
          //writeLog("INTERSECT from unitig %d frag %d end %d TO unitig %d frag %d end %d (SELF)\n",
          //        tig->id(), frg->ident, 5, evidence[fi].frag5tig, evidence[fi].edge5.fragId(), evidence[fi].edge5.frag3p() ? 3 : 5);
        }
      }



      if (evidence[fi].frag3tig == tig->id()) {
        uint32   ti  = tig->pathPosition(evidence[fi].edge3.fragId());
        ufNode  *trg = &tig->ufpath[ti];

        uint32  minf = (frg->position.bgn < frg->position.end) ? frg->position.bgn : frg->position.end;
        uint32  maxf = (frg->position.bgn < frg->position.end) ? frg->position.end : frg->position.bgn;

        uint32  mint = (trg->position.bgn < trg->position.end) ? trg->position.bgn : trg->position.end;
        uint32  maxt = (trg->position.bgn < trg->position.end) ? trg->position.end : trg->position.bgn;

        if (((minf < mint) && (mint < maxf)) ||  //  t begins inside f
            ((mint < minf) && (minf < maxt))) {  //  f begins inside t
          if (evidence[fi].edge3.frag3p())
            evidence[ti].frag3confirmed = true;
          else
            evidence[ti].frag5confirmed = true;

        } else {
          evidence[fi].frag3self = true;

          //  Not the correct place to report this.  Some of these get confirmed by later fragments.
          //writeLog("BUG2 F: %d,%d T %d,%d\n", minf, maxf, mint, maxt);
          //writeLog("INTERSECT from unitig %d frag %d end %d TO unitig %d frag %d end %d (SELF)\n",
          //        tig->id(), frg->ident, 3, evidence[fi].frag3tig, evidence[fi].edge3.fragId(), evidence[fi].edge3.frag3p() ? 3 : 5);
        }
      }
    }

    //
    //  Build the list.
    //

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode             *frg = &tig->ufpath[fi];

      if ((evidence[fi].frag5tig != 0) &&
          (evidence[fi].frag5tig != tig->id()) &&
          (evidence[fi].frag5confirmed == false))
        isects.push_back(intersectionPoint(evidence[fi].edge5, frg->ident, false, false));

      if ((evidence[fi].frag5tig == tig->id()) &&
          (evidence[fi].frag5self == true) &&
          (evidence[fi].frag5confirmed == false))
        isects.push_back(intersectionPoint(evidence[fi].edge5, frg->ident, false, true));

      if ((evidence[fi].frag3tig != 0) &&
          (evidence[fi].frag3tig != tig->id()) &&
          (evidence[fi].frag3confirmed == false))
        isects.push_back(intersectionPoint(evidence[fi].edge3, frg->ident, true, false));

      if ((evidence[fi].frag3tig == tig->id()) &&
          (evidence[fi].frag3self == true) &&
          (evidence[fi].frag3confirmed == false))
        isects.push_back(intersectionPoint(evidence[fi].edge3, frg->ident, true, true));
    }

    delete [] evidence;
  }


  //  Sort the intersections by the ID of the intersected fragment, then build an index into the array.

  std::sort(isects.begin(), isects.end());

  //  Terminate the intersection list with a sentinal intersection.  This is CRITICAL
  //  to the way we iterate over intersections.

  isects.push_back(intersectionPoint(BestEdgeOverlap(), 0, true, true));

  //  Build a map from fragment id to the first intersection in the list.

  for (uint32 i=0; i<isects.size(); i++) {
    isectsNum[isects[i].isectFrg]++;

    if (isectsMap.find(isects[i].isectFrg) == isectsMap.end())
      isectsMap[isects[i].isectFrg] = i;
  }
}

예제 #19

0

파일 보기

파일: AS_BAT_Joining.C 프로젝트: ondovb/canu

static
void
joinUnitigs_append(UnitigVector &unitigs, joinEntry *join) {
  uint32    frId = Unitig::fragIn(join->frFragID);
  uint32    toId = Unitig::fragIn(join->toFragID);

  Unitig   *fr   = unitigs[frId];
  Unitig   *to   = unitigs[toId];

  uint32    frIdx = Unitig::pathPosition(join->frFragID);
  uint32    toIdx = Unitig::pathPosition(join->toFragID);

  //  The 'fr' unitig is assumed to be forward, and assumed to be the one we join to.

  //  Compute the offset for our append.  We just need to compute where the join fragment would
  //  appear in the unitig.  The join fragment MUST be the first thing in the frUnitig.

  //int32 offset = MIN(frF.position.bgn, frF.position.end);

  //  Over all fragments in the frUnitig, add them to either the joinUnitig or the discUnitig.

  Unitig *joinUnitig = unitigs.newUnitig(false);
  Unitig *discUnitig = unitigs.newUnitig(false);

  //  Reverse the 'to' unitig if needed.

  if (join->toFlip)
    to->reverseComplement(true);

  //  If we're joining off the 5' end of the fr untiig, add the to reads first.

  if (join->frFirst == true) {
    uint32 ii=0;

    for (; ii < toIdx; ii++)
      joinUnitig->addFrag(to->ufpath[ii], 0, false);

    for (; ii < to->ufpath.size(); ii++)
      discUnitig->addFrag(to->ufpath[ii], 0, false);
  }

  //  Now add all the fr unitig reads.

  for (uint32 ii=0; ii < fr->ufpath.size(); ii++)
    joinUnitig->addFrag(to->ufpath[ii], 0, false);

  //  If we're not joining off the 5' end, add the to unitig reads last.

  if (join->frFirst == false) {
    uint32 ii = 0;

    for (; ii < toIdx; ii++)
      discUnitig->addFrag(to->ufpath[ii], 0, false);

    for (; ii < to->ufpath.size(); ii++)
      joinUnitig->addFrag(to->ufpath[ii], 0, false);
  }

  //  Delete the donor unitigs.

  delete fr;
  delete to;

  unitigs[frId] = NULL;
  unitigs[toId] = NULL;

  //  And make sure the new unitigs are consistent.

  joinUnitig->sort();
  discUnitig->sort();
}

예제 #20

0

파일 보기

파일: AS_BAT_Joining.C 프로젝트: ondovb/canu

//  Examine the first (few?) fragments of a unitig, evaluate if they indicate a join should be made.
static
bool
joinUnitigs_examineEnd(UnitigVector      &unitigs,
                       Unitig            *fr,
                       uint32             idx,
                       bool               frFirstEnd,
                       vector<joinEntry> &joins) {
  uint32           frgIdx  = (frFirstEnd) ? (idx) : (fr->ufpath.size() - 1 - idx);
  ufNode          *frg     = &fr->ufpath[frgIdx];
  bool             frgRev  = (frg->position.end < frg->position.bgn);

  //  Grab the best edge for this end frag.  The last arg requests the 3' end if true.
  //
  //  If we're looking at the first read, we want to get:
  //    5' - if the frag is forward
  //    3' - if the frag is reverse (frgRev == true)
  //
  //  If we're looking at the lat read, we want to get:
  //    5' - if the frag is reverse
  //    3' - if the frag is forward  (frgRev == false)
  //
  BestEdgeOverlap *bestEdge    = OG->getBestEdgeOverlap(frg->ident, (frgRev == frFirstEnd));

  uint32      tgtId = bestEdge->fragId();
  bool        tgt3p = bestEdge->frag3p();

  if (tgtId == 0)
    //  No best edge?  Skip it.
    return(false);

  //  Grab the unitig for that best edge.

  uint32   toID  = fr->fragIn(tgtId);
  Unitig  *to    = unitigs[toID];

  if (to->ufpath.size() == 1)
    //  Joining to something teeny?  Don't bother checking further.
    return(false);

  if (to->id() == fr->id())
    //  Join to myself?  Nope.
    return(false);

  //  Grab the read we have an edge to, an compute the overlapping length and left over length.

  ufNode  *tgt    = &to->ufpath[to->pathPosition(tgtId)];
  bool     tgtRev = (tgt->position.end < tgt->position.bgn);

  //  If tgt3p (we overlap to the 3' end) is the same as tgtRev (read is reverse) then the unitig is oriented
  //  correctly.  Otherwise, positions need to be reverse-complemented.


  bool     toFlip = false;

  if ((frFirstEnd == true) && (tgt3p == false) && (tgtRev == false))
    //  source read is at the start, overlap to 5' and the read is forward, need to flip the target unitig
    toFlip = true;

  if ((frFirstEnd == true) && (tgt3p == true) && (tgtRev == true))
    //  source read is at the start, overlap to 3' and the read is reverse, need to flip the target unitig
    toFlip = true;


  if ((frFirstEnd == false) && (tgt3p == false) && (tgtRev == true))
    //  source read is at the end, overlap to 5' and the read is reverse, need to flip the target unitig
    toFlip = true;

  if ((frFirstEnd == false) && (tgt3p == true) && (tgtRev == false))
    //  source read is at the end, overlap to 3' and the read is forward, need to flip the target unitig
    toFlip = true;


  uint32   toMin = MIN(tgt->position.bgn, tgt->position.end);
  uint32   toMax = MAX(tgt->position.bgn, tgt->position.end);
  uint32   toLen = to->getLength();
  uint32   frLen = fr->getLength();

  if (toFlip) {
    toMin = toLen - MAX(tgt->position.bgn, tgt->position.end);
    toMax = toLen - MIN(tgt->position.bgn, tgt->position.end);
  }

  assert(toMin < toMax);

  //  Our two unitigs are of length frLen and toLen.  We are appending some portion of 'to' onto
  //  'fr', and 'discarding' the rest.  If the 'discarded' piece is larger than the 'fr' unitig, we
  //  don't want to do the join.
  //
  //  We err on the side of the discarded piece.

  uint32   joinLen = 0;
  uint32   discLen = 0;

  if (frFirstEnd == true) {
    joinLen = toMin + frLen;  //  Prepend the start of 'to' onto 'fr'.
    discLen = toLen - toMin;

  } else {
    joinLen = frLen + toLen - toMax;  //  Append the end of 'to' onto 'fr'.
    discLen = toMax;
  }

  //  If the discard is bigger than us, we do damage by joining.

  if (discLen > frLen)
    return(false);

  //  The joined should be much larger and the discarded much smaller.

  uint32    maxLen = MAX(frLen, toLen);
  uint32    minLen = MIN(frLen, toLen);

  double    joinChange = (double)joinLen / maxLen;
  double    discChange = (double)discLen / minLen;

  bool      isBad = false;

  if ((joinChange < 1.10) ||
      (0.75       < discChange))
    //  Bad if we didn't really change sizes.
    isBad = true;

  if ((1.0        < joinChange) &&
      (discChange < 0.5))
    //  But good if discard is tiny.  This occurs if we merge a small with a big.  The join change
    //  is somewhat small (1.05 say) yet most of the smaller unitig is used.
    isBad = false;

  if (isBad) {
    writeLog("joinUnitigs_examineEnd()-- join unitig %6u (%7ubp) frag %6u %s <-> unitig %6u (%7ubp) frag %6u %s <-> length %5.2f %7u and %5.2f %7u BAD\n",
             fr->id(), fr->getLength(), frg->ident, (frgRev) ? "rev" : "fwd",
             to->id(), to->getLength(), tgt->ident, (tgtRev) ? "rev" : "fwd",
             joinChange, joinLen,
             discChange, discLen);
    return(false);
  }

  //  OK, join.

  writeLog("joinUnitigs_examineEnd()-- join unitig %6u (%7ubp) frag %6u %s <-> unitig %6u (%7ubp) frag %6u %s <-> length %5.2f %7u and %5.2f %7u\n",
           fr->id(), fr->getLength(), frg->ident, (frgRev) ? "rev" : "fwd",
           to->id(), to->getLength(), tgt->ident, (tgtRev) ? "rev" : "fwd",
           joinChange, joinLen,
           discChange, discLen);

  joins.push_back(joinEntry(frg->ident, frFirstEnd, tgt->ident, toFlip, joinLen));

  return(true);
}

예제 #21

0

파일 보기

파일: AS_BAT_PlaceZombies.C 프로젝트: lhon/canu

void
placeZombies(UnitigVector &unitigs, double erate) {

  writeLog("==> SEARCHING FOR ZOMBIES\n");

  uint32 *inUnitig   = new uint32 [FI->numFragments()+1];
  int     numZombies = 0;

  //  Mark fragments as dead, then unmark them if they are in a real living unitig.

  for (uint32 i=0; i<FI->numFragments()+1; i++)
    inUnitig[i] = noUnitig;

  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig  *utg = unitigs[ti];

    if (utg == NULL)
      continue;

    for (uint32 fi=0; fi<utg->ufpath.size(); fi++)
      inUnitig[utg->ufpath[fi].ident] = utg->id();
  }

  //  For anything not in a living unitig, reload the overlaps and find a new container.
  //  (NOT IMPLEMENTED - for now we just move these to new singleton unitigs).

  for (uint32 i=0; i<FI->numFragments()+1; i++) {
    if (FI->fragmentLength(i) == 0)
      //  Deleted fragment
      continue;

    if (inUnitig[i] != noUnitig)
      //  Valid fragment in a unitig
      continue;

    Unitig      *utg = unitigs.newUnitig(false);
    ufNode       frg;

    frg.ident             = i;
    frg.contained         = 0;
    frg.parent            = 0;

    frg.ahang             = 0;
    frg.bhang             = 0;

    frg.position.bgn      = 0;
    frg.position.end      = FI->fragmentLength(i);

    frg.containment_depth = 0;

    utg->addFrag(frg, 0, false);

    writeLog("placeZombies()-- unitig %d created from zombie fragment %d\n",
            utg->id(), i);
    numZombies++;
  }

  writeLog("RESURRECTED %d ZOMBIE FRAGMENT%s.\n", numZombies, (numZombies != 1) ? "s" : "");

  delete [] inUnitig;
}

예제 #22

0

파일 보기

파일: AS_BAT_ExtendByMates.C 프로젝트: ondovb/canu

void
extendByMates(UnitigVector &unitigs,
              double        erateGraph) {

  //logFileFlags |= LOG_CHUNK_GRAPH;
  logFileFlags |= LOG_POPULATE_UNITIG;

  writeLog("==> EXTENDING UNITIGS WITH MATE PAIRS.\n");

  uint32 tiMax = unitigs.size();

  for (uint32 ti=0; ti<tiMax; ti++) {
    Unitig        *target = unitigs[ti];

    if (target == NULL)
      continue;

    if (target->ufpath.size() < 2)
      continue;

    //  Build a list of all the fragments in this unitig, and any mates that are not in a unitig.

    uint32        extraMates = 0;

    for (uint32 fi=0; fi<target->ufpath.size(); fi++) {
      uint32  fid = target->ufpath[fi].ident;
      uint32  mid = FI->mateIID(fid);

      if ((mid != 0) &&
          (Unitig::fragIn(mid) == 0))
        extraMates++;
    }

    writeLog("\n");
    writeLog("unitig "F_U32" of size "F_SIZE_T" with "F_U32" extra fragments via mates\n",
            ti, target->ufpath.size(), extraMates);

    if (extraMates == 0)
      continue;

    //  Build a set of the fragments in this unitig plus their mates, and a set of just the mates.

    set<uint32>   frags;
    set<uint32>   mates;

    for (uint32 fi=0; fi<target->ufpath.size(); fi++) {
      uint32  fid = target->ufpath[fi].ident;
      uint32  mid = FI->mateIID(fid);

      frags.insert(fid);

      if ((mid != 0) &&
          (Unitig::fragIn(mid) == 0)) {
        writeLog("  mate frag "F_U32"\n", mid);
        frags.insert(mid);
        mates.insert(mid);
      }
    }

    //  Now, remove all the unitig fragments from the unitig so we can reconstruct it with the
    //  additional mated fragments.  Note that this loop cannot be combined with the last, since
    //  the test for 'additional mate' is 'not in the same unitig' -- and if we remove the
    //  fragments too early, we can't distinguish 'additional' from 'included'.

    for (uint32 fi=0; fi<target->ufpath.size(); fi++)
      target->removeFrag(target->ufpath[fi].ident);

    unitigs[ti] = NULL;
    delete target;

    //  Build a new BOG for just those fragments - in particular, only overlaps within the set are
    //  used for the BOG.

    BestOverlapGraph  *OGsave = OG;
    ChunkGraph        *CGsave = CG;

    OG = new BestOverlapGraph(erateGraph, &frags);
    CG = new ChunkGraph(&frags);

    uint32  numTigs = unitigs.size();

    //  Build new unitigs.  There should only be one new unitig constructed, but that isn't
    //  guaranteed.  No new unitigs are built if they are seeded from the mate fragments.  This
    //  isn't ideal -- we'd like to allow the first unitig (supposedly the longest) to start from
    //  a mate fragment.  However, consider the not-so-rare case where the original unitig is two
    //  backbone fragments and lots of contains.  Those contains contribute mate pairs that all
    //  assemble together, giving a longer path than the original unitig.  We don't want to
    //  assemble the mated fragments yet (we'll wait until we get the rest of the fragments that
    //  could assemble together).

    for (uint32 fi = CG->nextFragByChunkLength(); fi > 0; fi=CG->nextFragByChunkLength()) {
      if ((Unitig::fragIn(fi) != 0) ||
          (mates.count(fi) > 0))
        //  Fragment already in a unitig, or is an additional mate that we don't want
        //  to seed from.
        continue;

      populateUnitig(unitigs, fi);
    }

    //  Report what was constructed

    if (unitigs.size() - numTigs > 1)
      writeLog("WARNING: mate extension split a unitig.\n");


    for (uint32 newTigs=numTigs; newTigs<unitigs.size(); newTigs++) {
      Unitig  *tig = unitigs[newTigs];

      if (tig == NULL)
        continue;

      placeContainsUsingBestOverlaps(tig, &frags);

      writeLog("  new tig "F_U32" with "F_SIZE_T" fragments\n",
              tig->id(), tig->ufpath.size());
    }

    delete OG;
    delete CG;

    OG = OGsave;
    CG = CGsave;
  }
}

예제 #23

0

파일 보기

파일: AS_BAT_Instrumentation.C 프로젝트: AndreasHegerGenomics/canu

void
reportUnitigs(UnitigVector &unitigs, const char *prefix, const char *name, uint64 genomeSize) {

  //  Generate n50.  Assumes unitigs have been 'classified' already.

  vector<uint32>   unassembledLength;
  vector<uint32>   bubbleLength;
  vector<uint32>   repeatLength;
  vector<uint32>   circularLength;
  vector<uint32>   contigLength;

  for (uint32  ti=0; ti<unitigs.size(); ti++) {
    Unitig  *utg = unitigs[ti];

    if (utg == NULL)
      continue;

    if (utg->_isUnassembled) {
      unassembledLength.push_back(utg->getLength());
    }

    else if (utg->_isBubble) {
      bubbleLength.push_back(utg->getLength());
    }

    else if (utg->_isRepeat) {
      repeatLength.push_back(utg->getLength());
    }

    else if (utg->_isCircular) {
      circularLength.push_back(utg->getLength());
    }

    else {
      contigLength.push_back(utg->getLength());
    }
  }

  char   N[FILENAME_MAX];

  sprintf(N, "%s.sizes", getLogFilePrefix());

  errno = 0;
  FILE *F = fopen(N, "w");
  if (errno == 0) {
    reportN50(F, unassembledLength, "UNASSEMBLED", genomeSize);
    reportN50(F, bubbleLength,      "BUBBLE",      genomeSize);
    reportN50(F, repeatLength,      "REPEAT",      genomeSize);
    reportN50(F, circularLength,    "CIRCULAR",    genomeSize);
    reportN50(F, contigLength,      "CONTIGS",     genomeSize);

    fclose(F);
  }

  if (logFileFlagSet(LOG_INTERMEDIATE_UNITIGS) == 0)
    return;

  //  Dump to an intermediate store.

  char tigStorePath[FILENAME_MAX];
  sprintf(tigStorePath, "%s.tigStore", getLogFilePrefix());

  fprintf(stderr, "Creating intermediate tigStore '%s'\n", tigStorePath);

  uint32  numFragsT  = 0;
  uint32  numFragsP  = 0;
  uint64  utgLen     = 0;

  //  Compute average frags per partition.

  for (uint32  ti=0; ti<unitigs.size(); ti++) {
    Unitig  *utg = unitigs[ti];

    if (utg == NULL)
      continue;

    numFragsT += utg->ufpath.size();

    if (utg->ufpath.size() > 2)
      utgLen    += utg->getLength();
  }

  if      (utgLen < 16 * 1024 * 1024)
    numFragsP = numFragsT / 7;
  else if (utgLen < 64 * 1024 * 1024)
    numFragsP = numFragsT / 63;
  else
    numFragsP = numFragsT / 127;

  //  Dump the unitigs to an intermediate store.

  setParentAndHang(unitigs);

  writeUnitigsToStore(unitigs, tigStorePath, tigStorePath, numFragsP, false);
}

예제 #24

0

파일 보기

파일: AS_BAT_Instrumentation.C 프로젝트: AndreasHegerGenomics/canu

//  Decides if a unitig is unassembled.  The other classifications (isBubble, isCircular, isRepeat)
//  are made when the type is processed (e.g., when bubbles are popped).
//
//  A unitig is unassembled if:
//    1) it has fewer than R reads (R=2)
//    2) it is shorter than S bases (S=1000)
//    3) a single read spans at least fraction F of the lenth (F=1.0)
//    4) at least fraction F of the unitig is below read depth D (F=1.0, D=2)
//
void
classifyUnitigsAsUnassembled(UnitigVector &unitigs,
                             uint32        fewReadsNumber,
                             uint32        tooShortLength,
                             double        spanFraction,
                             double        lowcovFraction,   uint32  lowcovDepth) {
  uint32  nTooFew   = 0;
  uint32  nShort    = 0;
  uint32  nSingle   = 0;
  uint32  nCoverage = 0;
  uint32  nContig   = 0;

  uint64  bTooFew   = 0;
  uint64  bShort    = 0;
  uint64  bSingle   = 0;
  uint64  bCoverage = 0;
  uint64  bContig   = 0;

  char   N[FILENAME_MAX];

  sprintf(N, "%s.unassembled", getLogFilePrefix());

  errno = 0;
  FILE *F = fopen(N, "w");
  if (errno)
    F = NULL;

  for (uint32  ti=0; ti<unitigs.size(); ti++) {
    Unitig  *utg = unitigs[ti];

    if (utg == NULL)
      continue;

    utg->_isUnassembled = false;

    //  Rule 1.  Too few reads.

    if (utg->ufpath.size() < fewReadsNumber) {
      fprintf(F, "unitig "F_U32" unassembled - too few reads ("F_U64" < "F_U32")\n", ti, utg->ufpath.size(), fewReadsNumber);
      utg->_isUnassembled = true;
      nTooFew += 1;
      bTooFew += utg->getLength();
      continue;
    }

    //  Rule 2.  Short.

    if (utg->getLength() < tooShortLength) {
      fprintf(F, "unitig "F_U32" unassembled - too short ("F_U32" < "F_U32")\n", ti, utg->getLength(), tooShortLength);
      utg->_isUnassembled = true;
      nShort += 1;
      bShort += utg->getLength();
      continue;
    }

    //  Rule 3.  Single read spans large fraction of tig.

    for (uint32 oi=0; oi<utg->ufpath.size(); oi++) {
      ufNode  *frg = &utg->ufpath[oi];

      int frgbgn = MIN(frg->position.bgn, frg->position.end);
      int frgend = MAX(frg->position.bgn, frg->position.end);

      if (frgend - frgbgn > utg->getLength() * spanFraction) {
        fprintf(F, "unitig "F_U32" unassembled - single read spans unitig (read "F_U32" "F_U32"-"F_U32" spans fraction %f > %f\n",
                 ti, frg->ident, frg->position.bgn, frg->position.end, (double)(frgend - frgbgn) / utg->getLength(), spanFraction);
        utg->_isUnassembled = true;
        nSingle += 1;
        bSingle += utg->getLength();
        break;
      }
    }
    if (utg->_isUnassembled)
      continue;

    //  Rule 4.  Low coverage.

    intervalList<int32>  IL;

    for (uint32 oi=0; oi<utg->ufpath.size(); oi++) {
      ufNode  *frg = &utg->ufpath[oi];

      int frgbgn = MIN(frg->position.bgn, frg->position.end);
      int frgend = MAX(frg->position.bgn, frg->position.end);

      IL.add(frgbgn, frgend - frgbgn);
    }

    intervalList<int32>  ID(IL);

    uint32  basesLow  = 0;
    uint32  basesHigh = 0;

    for (uint32 ii=0; ii<ID.numberOfIntervals(); ii++)
      if (ID.depth(ii) < lowcovDepth)
        basesLow  += ID.hi(ii) - ID.lo(ii) + 1;
      else
        basesHigh += ID.hi(ii) - ID.lo(ii) + 1;

    double  lowcov = (double)basesLow / (basesLow + basesHigh);

    if (lowcov >= lowcovFraction) {
      fprintf(F, "Unitig "F_U32" unassembled - low coverage (%.4f > %.4f at < "F_U32"x coverage)\n",
               ti, lowcov, lowcovFraction, lowcovDepth);
      utg->_isUnassembled = true;
      nCoverage += 1;
      bCoverage += utg->getLength();
      continue;
    }

    //  Otherwise, unitig is assembled!

    nContig += 1;
    bContig += utg->getLength();
  }

  writeStatus("classifyAsUnassembled()-- %6u tigs %11lu bases -- too few reads\n",        nTooFew,   bTooFew);
  writeStatus("classifyAsUnassembled()-- %6u tigs %11lu bases -- too short\n",            nShort,    bShort);
  writeStatus("classifyAsUnassembled()-- %6u tigs %11lu bases -- single spanning read\n", nSingle,   bSingle);
  writeStatus("classifyAsUnassembled()-- %6u tigs %11lu bases -- low coverage\n",         nCoverage, bCoverage);
  writeStatus("classifyAsUnassembled()-- %6u tigs %11lu bases -- acceptable contigs\n",   nContig,   bContig);
}

예제 #25

0

파일 보기

파일: AS_BOG_SetParentAndHang.C 프로젝트: cdunn2001/DConvert

void
UnitigGraph::setParentAndHang(void) {

  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig        *utg = unitigs[ti];

    if (utg == NULL)
      continue;

    if (utg->ufpath.size() == 0)
      continue;

    //  Reset parent and hangs for everything.

    for (uint32 fi=1; fi<utg->ufpath.size(); fi++) {
      ufNode *frg = &utg->ufpath[fi];

      frg->parent       = 0;
      frg->ahang        = 0;
      frg->bhang        = 0;
    }

    //  For each fragment, set parent/hangs using the edges.

    for (uint32 fi=0; fi<utg->ufpath.size(); fi++) {
      ufNode *frg  = &utg->ufpath[fi];

      //  If we're contained, gee, I sure hope the container is here!

      BestContainment *bestcont  = OG->getBestContainer(frg->ident);

      if ((bestcont) && (utg->fragIn(bestcont->container) == utg->id())) {
        int32   pi   = utg->pathPosition(bestcont->container);
        ufNode *par  = &utg->ufpath[pi];

        frg->parent = bestcont->container;

        //  The hangs assume the container is forward; adjust if not so.
        if (par->position.bgn < par->position.end) {
          frg->ahang  = bestcont->a_hang;
          frg->bhang  = bestcont->b_hang;
        } else {
          frg->ahang  = -bestcont->b_hang;
          frg->bhang  = -bestcont->a_hang;
        }

        continue;
      }

      //  Nope, not contained.  If we don't have a parent set, see if one of our best overlaps
      //  can set it.

      BestEdgeOverlap *bestedge5 = OG->getBestEdgeOverlap(frg->ident, false);
      BestEdgeOverlap *bestedge3 = OG->getBestEdgeOverlap(frg->ident, true);

      if ((bestedge5->fragId()) && (utg->fragIn(bestedge5->fragId()) == utg->id())) {
        int32         pi5  = utg->pathPosition(bestedge5->fragId());
        ufNode *oth  = &utg->ufpath[pi5];

        //  Consensus is expected parent/hangs to be relative to the parent fragment.  This is used
        //  ONLY to place the fragment, not to orient the fragment.  Orientation comes from the
        //  absolute positioning coordinates.
        //
        //  Interestingly, all four overlap transformations are used here.
        //
        //  The inner if tests (on fragment orientation) should be asserts, but due to imprecise
        //  layouts, they are sometimes violated:
        //    A fragment from       271-547 had a 5'overlap to something after it;
        //    the frag after was at 543-272, close enough to a tie to screw up placements
        //
        if (pi5 < fi) {
          //  We have an edge off our 5' end to something before us --> fragment MUST be forward.
          //  Flip the overlap so it is relative to the other fragment.
          if (frg->position.bgn < frg->position.end) {
            frg->parent = bestedge5->fragId();
            frg->ahang  = -bestedge5->ahang();
            frg->bhang  = -bestedge5->bhang();
            assert(frg->ahang >= 0);
          }
        } else {
          //  We have an edge off our 5' end to something after us --> fragment MUST be reverse.
          //  Because our fragment is now reverse, we must reverse the overlap too.
          if (frg->position.end < frg->position.bgn) {
            oth->parent = frg->ident;
            oth->ahang  = -bestedge5->bhang();
            oth->bhang  = -bestedge5->ahang();
            assert(oth->ahang >= 0);
          }
        }
      }

      if ((bestedge3->fragId()) && (utg->fragIn(bestedge3->fragId()) == utg->id())) {
        int32         pi3  = utg->pathPosition(bestedge3->fragId());
        ufNode *oth  = &utg->ufpath[pi3];

        if (pi3 < fi) {
          //  We have an edge off our 3' end to something before us --> fragment MUST be reverse.
          //  Flip the overlap so it is relative to the other fragment.
          //  Because our fragment is now reverse, we must reverse the overlap too.
          if (frg->position.end < frg->position.bgn) {
            frg->parent = bestedge3->fragId();
            frg->ahang  = bestedge3->bhang();
            frg->bhang  = bestedge3->ahang();
            assert(frg->ahang >= 0);
          }
        } else {
          //  We have an edge off our 3' end to something after us --> fragment MUST be forward.
          //  This is the simplest case, the overlap is already correct.
          if (frg->position.bgn < frg->position.end) {
            oth->parent = frg->ident;
            oth->ahang  = bestedge3->ahang();
            oth->bhang  = bestedge3->bhang();
            assert(oth->ahang >= 0);
          }
        }
      }
    }
  }
}

예제 #26

0

파일 보기

파일: AS_BAT_Outputs.C 프로젝트: ondovb/canu

void
writeUnitigsToStore(UnitigVector  &unitigs,
                    char          *fileprefix,
                    char          *tigStorePath,
                    uint32         frg_count_target,
                    bool           isFinal) {
  uint32      utg_count              = 0;
  uint32      frg_count              = 0;
  uint32      prt_count              = 1;
  char        filename[FILENAME_MAX] = {0};
  uint32     *partmap                = new uint32 [unitigs.size()];

  //  This code closely follows that in AS_CGB_unitigger.c::output_the_chunks()

  if (isFinal)
    checkUnitigMembership(unitigs);

  // Open up the initial output file

  sprintf(filename, "%s.iidmap", fileprefix);
  FILE *iidm = fopen(filename, "w");
  assert(NULL != iidm);

  sprintf(filename, "%s.partitioning", fileprefix);
  FILE *part = fopen(filename, "w");
  assert(NULL != part);

  sprintf(filename, "%s.partitioningInfo", fileprefix);
  FILE *pari = fopen(filename, "w");
  assert(NULL != pari);

  //  Step through all the unitigs once to build the partition mapping and IID mapping.

  memset(partmap, 0xff, sizeof(uint32) * unitigs.size());

  for (uint32 iumiid=0, ti=0; ti<unitigs.size(); ti++) {
    Unitig  *utg = unitigs[ti];
    uint32   nf  = (utg) ? utg->getNumFrags() : 0;

    if ((utg == NULL) || (nf == 0))
      continue;

    assert(utg->getLength() > 0);
    assert(nf == utg->ufpath.size());

    if ((frg_count + nf >= frg_count_target) &&
        (frg_count      >  0)) {
      fprintf(pari, "Partition %d has %d unitigs and %d fragments.\n",
              prt_count, utg_count, frg_count);

      prt_count++;
      utg_count = 0;
      frg_count = 0;
    }

    uint32 tigid = (isFinal) ? iumiid : ti;

    assert(tigid < unitigs.size());
    partmap[tigid] = prt_count;

    fprintf(iidm, "Unitig "F_U32" == IUM "F_U32" (in partition "F_U32" with "F_U32" frags)\n",
            utg->id(),
            (tigid),
            partmap[(tigid)],
            nf);

    for (uint32 fragIdx=0; fragIdx<nf; fragIdx++) {
      ufNode  *f = &utg->ufpath[fragIdx];

      fprintf(part, "%d\t%d\n", prt_count, f->ident);
    }

    utg_count += 1;
    frg_count += nf;

    iumiid++;
  }

  fprintf(pari, "Partition %d has %d unitigs and %d fragments.\n",
          prt_count, utg_count, frg_count);

  fclose(pari);
  fclose(part);
  fclose(iidm);

  //  Step through all the unitigs once to build the partition mapping and IID mapping.

  tgStore     *tigStore = new tgStore(tigStorePath);
  tgTig       *tig      = new tgTig;

  for (uint32 iumiid=0, ti=0; ti<unitigs.size(); ti++) {
    Unitig  *utg = unitigs[ti];
    uint32   nf  = (utg) ? utg->getNumFrags() : 0;

    if ((utg == NULL) || (nf == 0))
      continue;

    unitigToTig(tig, (isFinal) ? iumiid : ti, utg);

    tigStore->insertTig(tig, false);

    iumiid++;
  }

  delete    tig;
  delete    tigStore;
  delete [] partmap;
}

예제 #27

0

파일 보기

파일: AS_BAT_IntersectSplit.C 프로젝트: ondovb/canu

void
breakUnitigs(UnitigVector &unitigs,
             char         *output_prefix,
             bool          enableIntersectionBreaking) {

  writeLog("==> BREAKING UNITIGS.\n");

  intersectionList  *ilist = new intersectionList(unitigs);

  //  Stop when we've seen all current unitigs.  Replace tiMax
  //  in the for loop below with unitigs.size() to recursively
  //  split unitigs.

  uint32 tiMax = unitigs.size();

  for (uint32 ti=0; ti<tiMax; ti++) {
    Unitig             *tig = unitigs[ti];

    if (tig == NULL)
      continue;

    vector<breakPoint>   breaks;

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode             *frg   = &tig->ufpath[fi];
      intersectionPoint  *isect = ilist->getIntersection(frg->ident, 0);

      if (isect == NULL)
        continue;

      for (; isect->isectFrg == frg->ident; isect++) {
        assert(tig->id() == Unitig::fragIn(isect->isectFrg));

        //  Grab the invading unitig

        Unitig *inv = unitigs[Unitig::fragIn(isect->invadFrg)];
        assert(inv->id() == Unitig::fragIn(isect->invadFrg));

        //  Grab the best edges off the invading fragment.

        BestEdgeOverlap  *best5 = OG->getBestEdgeOverlap(isect->invadFrg, false);
        BestEdgeOverlap  *best3 = OG->getBestEdgeOverlap(isect->invadFrg, true);

        //  Check if the incoming tig is a spur, and we should just ignore it immediately

        if ((inv->ufpath.size() == 1) &&
            ((best5->fragId() == 0) ||
             (best3->fragId() == 0))) {
          if (logFileFlagSet(LOG_INTERSECTION_BREAKING))
            writeLog("unitig %d frag %d end %c' into unitig %d frag %d end %c' -- IS A SPUR, skip it\n",
                    inv->id(), isect->invadFrg, isect->invad3p ? '3' : '5',
                    tig->id(), isect->isectFrg, isect->isect3p ? '3' : '5');
          continue;
        }

        //  Keep only significant intersections

        if ((inv->getLength()   > MIN_BREAK_LENGTH) &&
            (inv->ufpath.size() > MIN_BREAK_FRAGS)) {
          if (logFileFlagSet(LOG_INTERSECTION_BREAKING))
            writeLog("unitig %d frag %d end %c' into unitig %d frag %d end %c'\n",
                    inv->id(), isect->invadFrg, isect->invad3p ? '3' : '5',
                    tig->id(), isect->isectFrg, isect->isect3p ? '3' : '5');
          breaks.push_back(breakPoint(isect->isectFrg, isect->isect3p, true, false));
        }
      }  //  Over all incoming fragments

      //  If this is the last fragment, terminate the break point list with a 'fakeEnd' (in AS_BAT_Breaking.cc) break point
      //  at the end of the unitig.

      if ((fi+1 == tig->ufpath.size()) &&
          (breaks.size() > 0)) {
        breaks.push_back(breakPoint(frg->ident, (frg->position.bgn < frg->position.end), true, false));
      }
    }  //  Over all fragments in the unitig


    if (breaks.size() == 0)
      continue;

    //  Report where breaks occur.  'breaks' is a list, not a vector.
#if 0
    //  We've lost the fields in breaks[i] -- but the reports above aren't updated yet.
    if (logFileFlagSet(LOG_INTERSECTION_BREAKING) ||
        logFileFlagSet(LOG_MATE_SPLIT_COVERAGE_PLOT))
      for (uint32 i=0; i<breaks.size(); i++)
        writeLog("BREAK unitig %d at position %d,%d from inSize %d inFrags %d.\n",
                tig->id(),
                breaks[i].fragPos.bgn,
                breaks[i].fragPos.end,
                breaks[i].inSize,
                breaks[i].inFrags);
#endif

    //  Actually do the breaking.
    if (enableIntersectionBreaking)
      breakUnitigAt(unitigs, tig, breaks, true);

    breaks.clear();
  }  //  Over all unitigs
}

예제 #28

0

파일 보기

파일: AS_BAT_Outputs.C 프로젝트: swang8/canu

void
writeUnitigsToStore(UnitigVector  &unitigs,
                    char          *fileprefix,
                    char          *tigStorePath,
                    uint32         frg_count_target,
                    bool           isFinal) {
  uint32      utg_count              = 0;
  uint32      frg_count              = 0;
  uint32      prt_count              = 1;
  char        filename[FILENAME_MAX] = {0};
  uint32     *partmap                = new uint32 [unitigs.size()];

  //  This code closely follows that in AS_CGB_unitigger.c::output_the_chunks()

  if (isFinal)
    checkUnitigMembership(unitigs);

  // Open up the initial output file

  sprintf(filename, "%s.iidmap", fileprefix);
  FILE *iidm = fopen(filename, "w");
  assert(NULL != iidm);

  sprintf(filename, "%s.partitioning", fileprefix);
  FILE *part = fopen(filename, "w");
  assert(NULL != part);

  sprintf(filename, "%s.partitioningInfo", fileprefix);
  FILE *pari = fopen(filename, "w");
  assert(NULL != pari);

  //  Step through all the unitigs once to build the partition mapping and IID mapping.

  tgStore     *tigStore = new tgStore(tigStorePath);
  tgTig       *tig      = new tgTig;

  for (uint32 tigID=0, ti=0; ti<unitigs.size(); ti++) {
    Unitig  *utg = unitigs[ti];

    if ((utg == NULL) || (utg->getNumFrags() == 0))
      continue;

    assert(utg->getLength() > 0);

    //  Convert the bogart tig to a tgTig and save to the store.

    unitigToTig(tig, (isFinal) ? tigID : ti, utg);
    tigID++;

    tigStore->insertTig(tig, false);

    //  Increment the partition if the current one is too large.

    if ((frg_count + utg->getNumFrags() >= frg_count_target) &&
        (frg_count                      >  0)) {
      fprintf(pari, "Partition %d has %d unitigs and %d fragments.\n",
              prt_count, utg_count, frg_count);

      prt_count++;
      utg_count = 0;
      frg_count = 0;
    }

    //  Note that the tig is included in this partition.

    utg_count += 1;
    frg_count += utg->getNumFrags();

    //  Map the tig to a partition, and log both the tig-to-partition map and the partition-to-read map.

    fprintf(iidm, "bogart "F_U32" -> tig "F_U32" (in partition "F_U32" with "F_U32" frags)\n",
            utg->id(),
            utg->tigID(),
            prt_count,
            utg->getNumFrags());

    for (uint32 fragIdx=0; fragIdx<utg->getNumFrags(); fragIdx++)
      fprintf(part, "%d\t%d\n", prt_count, utg->ufpath[fragIdx].ident);
  }

  fprintf(pari, "Partition %d has %d unitigs and %d fragments.\n",   //  Don't forget to log the last partition!
          prt_count, utg_count, frg_count);

  fclose(pari);
  fclose(part);
  fclose(iidm);

  delete    tig;
  delete    tigStore;
}

예제 #29

0

파일 보기

파일: AS_BAT_MateBubble.C 프로젝트: ondovb/canu

void
popMateBubbles(UnitigVector &unitigs) {
  uint32      nBubblePopped   = 0;
  uint32      nBubbleTooBig   = 0;
  uint32      nBubbleConflict = 0;

  writeLog("==> SEARCHING FOR MATE BUBBLES\n");

  //  For each unitig, if all (or most) of the external mates are to a single other unitig (not
  //  counting singletons), then this is a potential bubble popping unitig.
  //
  //  At present, this is exploratory only.

  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig        *tig = unitigs[ti];

    if ((tig == NULL) ||
        (tig->ufpath.size() == 0))
      //   No tig here.
      continue;

    if ((tig->getLength() > 1000) ||
        (tig->ufpath.size() >= 3000))
      //  Tig too big.
      continue;

    //if ((tig->getLength() < 150) ||
    //    (tig->ufpath.size() < 5))
    //  //  Tig too small.
    //  continue;

    uint32        *lkg    = new uint32 [tig->ufpath.size()];
    uint32         lkgLen = 0;
    uint32         lkgExt = 0;

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode *frg = &tig->ufpath[fi];
      int32         frgID = frg->ident;
      int32         matID = FI->mateIID(frgID);

      uint32        mtigID = 0;
      Unitig       *mtig   = 0L;

      if (matID == 0)
        //  No mate.
        continue;

      mtigID = tig->fragIn(matID);
      mtig   = unitigs[mtigID];

      if (mtigID == tig->id())
        //  Mate is not external.
        continue;

      lkgExt++;

      if (mtig->ufpath.size() < 2)
        //  Mate is in singleton.
        continue;

      lkg[lkgLen++] = mtigID;
    }

    if (lkgLen == 0)
      //  No external mates.
      continue;

    sort(lkg, lkg+lkgLen);

    uint32  last = lkg[0];
    uint32  lcnt = 1;

    for (uint32 i=1; i<lkgLen; i++) {
      if (last != lkg[i]) {
        if ((lcnt > 3))
          writeLog("popMateBubble()-- tig %d len %d might pop bubble in tig %u (%u mates in there out of %d external mates)\n",
                  tig->id(), tig->getLength(), last, lcnt, lkgExt);
        last = lkg[i];
        lcnt = 0;
      }

      lcnt++;
    }

    if ((lcnt > 3))
      writeLog("popMateBubble()-- tig %d len %d might pop bubble in tig %u (%u mates in there out of %d external mates)\n",
              tig->id(), tig->getLength(), last, lcnt, lkgExt);

    delete [] lkg;
  }
}

예제 #30

0

파일 보기

파일: AS_BAT_Outputs.C 프로젝트: swang8/canu

//  For every unitig, report the best overlaps contained in the
//  unitig, and all overlaps contained in the unitig.
//
//  Wow, this is ancient.
//
void
writeOverlapsUsed(UnitigVector &unitigs,
                  char         *prefix) {
  char   N[FILENAME_MAX];

  sprintf(N, "%s.unused.best.edges", prefix);

  FILE  *F = fopen(N, "w");

  for (uint32  ti=0; ti<unitigs.size(); ti++) {
    Unitig  *tig = unitigs[ti];
    Unitig  *ovl = NULL;
    char     tyt = 'C';

    if (tig == NULL)
      continue;

    if (tig->_isUnassembled)  tyt = 'U';
    if (tig->_isBubble)       tyt = 'B';
    if (tig->_isRepeat)       tyt = 'R';
    if (tig->_isCircular)     tyt = 'O';

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode  *frg = &tig->ufpath[fi];
      ufNode  *oth = NULL;

      //  Report the unused best edge

      BestEdgeOverlap *be5 = OG->getBestEdgeOverlap(frg->ident, false);
      uint32   rd5 = (be5 == NULL) ?    0 : be5->fragId();
      Unitig  *tg5 = (be5 == NULL) ? NULL : unitigs[Unitig::fragIn(rd5)];
      char     ty5 = 'C';

      if ((tg5 != NULL) && (tg5->tigID() != tig->tigID())) {
        uint32  ord = Unitig::pathPosition(rd5);
        ufNode *oth = &tg5->ufpath[ord];

        if (tig->_isUnassembled)  ty5 = 'U';
        if (tig->_isBubble)       ty5 = 'B';
        if (tig->_isRepeat)       ty5 = 'R';
        if (tig->_isCircular)     ty5 = 'O';

        fprintf(F, "tig %7u %c read %8u at %9u %-9u %c' -- %8d %-8d -- tig %7u %c read %8u at %9u %-9u %c'\n",
                tig->tigID(), tyt, frg->ident, frg->position.bgn, frg->position.end, '5',
                be5->ahang(), be5->bhang(),
                tg5->tigID(), ty5, oth->ident, oth->position.bgn, oth->position.end, (be5->frag3p() == false) ? '5' : '3');
      }

      BestEdgeOverlap *be3 = OG->getBestEdgeOverlap(frg->ident, true);
      uint32   rd3 = (be3 == NULL) ?    0 : be3->fragId();
      Unitig  *tg3 = (be3 == NULL) ? NULL : unitigs[Unitig::fragIn(rd3)];
      char     ty3 = 'C';

      if ((tg3 != NULL) && (tg3->tigID() != tig->tigID())) {
        uint32  ord = Unitig::pathPosition(rd3);
        ufNode *oth = &tg3->ufpath[ord];

        if (tig->_isUnassembled)  ty3 = 'U';
        if (tig->_isBubble)       ty3 = 'B';
        if (tig->_isRepeat)       ty3 = 'R';
        if (tig->_isCircular)     ty3 = 'O';

        fprintf(F, "tig %7u %c read %8u at %9u %-9u %c' -- %8d %-8d -- tig %7u %c read %8u at %9u %-9u %c'\n",
                tig->tigID(), tyt, frg->ident, frg->position.bgn, frg->position.end, '3',
                be3->ahang(), be3->bhang(),
                tg3->tigID(), ty3, oth->ident, oth->position.bgn, oth->position.end, (be3->frag3p() == false) ? '5' : '3');
      }
    }
  }

  fclose(F);
}