//  Allocate a new (empty) Unitig, assign it the next sequential ID, and
//  register it in the block-allocated lookup table.
//
//  Thread-safe: the ID counter and block table are shared, so the whole
//  update is serialized with an OpenMP critical section.
//
//  Returns the new unitig; ownership stays with this UnitigVector.
//
Unitig *
UnitigVector::newUnitig(bool verbose) {
  Unitig *u = new Unitig();

#pragma omp critical
  {
    u->_id = _totalUnitigs++;

    if (verbose)
      writeLog("Creating Unitig %d\n", u->_id);

    //  Start a new block of pointers if the current one is full.
    if (_blockNext >= _blockSize) {
      assert(_numBlocks < _maxBlocks);

      _blocks[_numBlocks] = new Unitig * [_blockSize];

      //  Elements are 'Unitig *'; the original cleared sizeof(Unitig **)
      //  bytes per slot, which happens to be the same size but is the
      //  wrong type expression.
      memset(_blocks[_numBlocks], 0, sizeof(Unitig *) * _blockSize);

      _numBlocks++;
      _blockNext = 0;
    }

    _blocks[_numBlocks-1][_blockNext++] = u;

    //  The rest are just sanity checks: the ID must map back to the slot
    //  we just filled, and operator[] must find the same object.

    assert((u->id() / _blockSize) == (_numBlocks - 1));
    assert((u->id() % _blockSize) == (_blockNext - 1));

    assert(operator[](u->id()) == u);
  }

  return(u);
};
//  Place contained fragments into the unitig that holds their best
//  container.  Iterates until a fixed point: placing one containee can
//  make its own containees placeable on the next pass.
//
//  Fragments whose container chain never lands in a unitig ("zombies")
//  are abandoned with a log message.
//
void
placeContainsUsingBestOverlaps(UnitigVector &unitigs) {
  uint32   fragsPlaced  = 1;
  uint32   fragsPending = 0;

  logFileFlags &= ~LOG_PLACE_FRAG;

  while (fragsPlaced > 0) {
    fragsPlaced  = 0;
    fragsPending = 0;

    writeLog("==> PLACING CONTAINED FRAGMENTS\n");

    for (uint32 fid=1; fid<FI->numFragments()+1; fid++) {
      BestContainment *bestcont = OG->getBestContainer(fid);
      Unitig          *utg;

      //  Guard the pointer before use; original dereferenced without a
      //  NULL check.
      if ((bestcont == NULL) || (bestcont->isContained == false))
        //  Not a contained fragment.
        continue;

      if (Unitig::fragIn(fid) != 0)
        //  Containee already placed.
        continue;

      if (Unitig::fragIn(bestcont->container) == 0) {
        //  Container not placed (yet); try again next pass.
        fragsPending++;
        continue;
      }

      utg = unitigs[Unitig::fragIn(bestcont->container)];
      utg->addContainedFrag(fid, bestcont, logFileFlagSet(LOG_INITIAL_CONTAINED_PLACEMENT));

      if (utg->id() != Unitig::fragIn(fid))
        //  Bug fix: report the unitig we tried to add to; the original
        //  passed bestcont->container, a fragment ID, as "unitig %d".
        writeLog("placeContainsUsingBestOverlaps()-- FAILED to add frag %d to unitig %d.\n", fid, utg->id());
      assert(utg->id() == Unitig::fragIn(fid));

      fragsPlaced++;
    }

    writeLog("==> PLACING CONTAINED FRAGMENTS - placed %d fragments; still need to place %d\n",
             fragsPlaced, fragsPending);

    //  No progress but work remaining: the pending containers can never
    //  be placed.  Give up (the while condition exits on fragsPlaced==0).
    if ((fragsPlaced == 0) && (fragsPending > 0)) {
      writeLog("Stopping contained fragment placement due to zombies.\n");
      fragsPlaced  = 0;
      fragsPending = 0;
    }
  }

  //  Containees were appended out of coordinate order; restore sorted
  //  order in every unitig.
  for (uint32 ti=1; ti<unitigs.size(); ti++) {
    Unitig *utg = unitigs[ti];

    if (utg)
      utg->sort();
  }
}
//  Sanity check: every non-deleted read must be placed in exactly one
//  unitig, and every placement must reference a valid read.  Aborts via
//  assert on any violation.
//
void
checkUnitigMembership(UnitigVector &unitigs) {
  uint32 *inUnitig = new uint32 [FI->numFragments()+1];
  uint32  noUnitig = 0xffffffff;   //  Sentinel: "not placed anywhere".

  //  All reads start off not placed in a unitig.
  for (uint32 i=0; i<FI->numFragments()+1; i++)
    inUnitig[i] = noUnitig;

  //  Over all unitigs, remember where each read is.
  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig *tig = unitigs[ti];

    if (tig == NULL)
      continue;

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode *frg = &tig->ufpath[fi];

      //  Fixed format specifiers: fi is uint32, use %u (was %d).
      if (frg->ident > FI->numFragments())
        fprintf(stderr, "tig %u ufpath[%u] ident %u more than number of reads %u\n",
                tig->id(), fi, frg->ident, FI->numFragments());

      if (inUnitig[frg->ident] != noUnitig)
        fprintf(stderr, "tig %u ufpath[%u] ident %u placed multiple times\n",
                tig->id(), fi, frg->ident);

      assert(frg->ident <= FI->numFragments());   //  Can't be out of range.
      assert(inUnitig[frg->ident] == noUnitig);   //  Read must be not placed yet.

      inUnitig[frg->ident] = ti;
    }
  }

  //  Find any read not placed in a unitig.
  for (uint32 i=0; i<FI->numFragments()+1; i++) {
    if (FI->fragmentLength(i) == 0)
      //  Deleted read; allowed to be unplaced.
      continue;

    assert(inUnitig[i] != 0);         //  There shouldn't be a unitig 0.
    assert(inUnitig[i] != noUnitig);  //  The read should be in a unitig.
  }

  delete [] inUnitig;
}
//  Create a fresh unitig containing the first splitFragsLen entries of
//  splitFrags, with positions shifted so the layout begins at zero.
//  The first fragment is forced to be uncontained.
//
static
void
makeNewUnitig(UnitigVector &unitigs,
              uint32        splitFragsLen,
              ufNode       *splitFrags) {
  Unitig *newtig = unitigs.newUnitig(false);

  if (logFileFlagSet(LOG_MATE_SPLIT_DISCONTINUOUS))
    writeLog("splitDiscontinuous()-- new tig "F_U32" with "F_U32" fragments (starting at frag "F_U32").\n",
             newtig->id(), splitFragsLen, splitFrags[0].ident);

  //  Shift so the leftmost coordinate of the first fragment is zero.
  int offset = -MIN(splitFrags[0].position.bgn, splitFrags[0].position.end);

  //  This should already be true, but we force it still.
  splitFrags[0].contained = 0;

  for (uint32 fi=0; fi<splitFragsLen; fi++)
    newtig->addFrag(splitFrags[fi], offset, false);  //logFileFlagSet(LOG_MATE_SPLIT_DISCONTINUOUS));
}
//  For every unitig, report the best overlaps contained in the
//  unitig, and all overlaps contained in the unitig.
//
//  Wow, this is ancient.
//
//  NOTE(review): everything except opening/closing the output file is
//  compiled out with '#if 0'.  As written, this only creates an empty
//  '<fileprefix>.unused.ovl' file; 'unitigs' is unused.
//
void
writeOverlapsUsed(UnitigVector &unitigs,
                  char         *fileprefix) {
  char         filename[FILENAME_MAX] = {0};

#if 0
  GenericMesg  pmesg;
  OverlapMesg  omesg;
#endif

  sprintf(filename, "%s.unused.ovl", fileprefix);

  FILE *file = fopen(filename, "w");
  assert(file != NULL);

#if 0
  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig  *utg = unitigs[ti];

    if (utg == NULL)
      continue;

    for (uint32 fi=0; fi<utg->ufpath.size(); fi++) {
      ufNode  *frg = &utg->ufpath[fi];

      //  Where is our best overlap?  Contained or dovetail?

      BestEdgeOverlap *bestedge5 = OG->getBestEdgeOverlap(frg->ident, false);
      BestEdgeOverlap *bestedge3 = OG->getBestEdgeOverlap(frg->ident, true);

      int  bestident5 = 0;
      int  bestident3 = 0;

      if (bestedge5) {
        bestident5 = bestedge5->fragId();

        //  Only report edges that leave this unitig.
        if ((bestident5 > 0) && (utg->fragIn(bestident5) != utg->id())) {
          omesg.aifrag          = frg->ident;
          omesg.bifrag          = bestident5;
          omesg.ahg             = bestedge5->ahang();
          omesg.bhg             = bestedge5->bhang();
          omesg.orientation.setIsUnknown();
          omesg.overlap_type    = AS_DOVETAIL;
          omesg.quality         = 0.0;
          omesg.min_offset      = 0;
          omesg.max_offset      = 0;
          omesg.polymorph_ct    = 0;
          omesg.alignment_trace = NULL;
#ifdef AS_MSG_USE_OVL_DELTA
          omesg.alignment_delta = NULL;
#endif

          //  This overlap is off of the 5' end of this fragment.
          if (bestedge5->frag3p() == false)
            omesg.orientation.setIsOuttie();
          if (bestedge5->frag3p() == true)
            omesg.orientation.setIsAnti();

          pmesg.t = MESG_OVL;
          pmesg.m = &omesg;

          WriteProtoMesg_AS(file, &pmesg);
        }
      }

      if (bestedge3) {
        bestident3 = bestedge3->fragId();

        if ((bestident3 > 0) && (utg->fragIn(bestident3) != utg->id())) {
          omesg.aifrag          = frg->ident;
          omesg.bifrag          = bestident3;
          omesg.ahg             = bestedge3->ahang();
          omesg.bhg             = bestedge3->bhang();
          omesg.orientation.setIsUnknown();
          omesg.overlap_type    = AS_DOVETAIL;
          omesg.quality         = 0.0;
          omesg.min_offset      = 0;
          omesg.max_offset      = 0;
          omesg.polymorph_ct    = 0;
          omesg.alignment_trace = NULL;
#ifdef AS_MSG_USE_OVL_DELTA
          omesg.alignment_delta = NULL;
#endif

          //  This overlap is off of the 3' end of this fragment.
          if (bestedge3->frag3p() == false)
            omesg.orientation.setIsNormal();
          if (bestedge3->frag3p() == true)
            omesg.orientation.setIsInnie();

          pmesg.t = MESG_OVL;
          pmesg.m = &omesg;

          WriteProtoMesg_AS(file, &pmesg);
        }
      }
    }
  }
#endif

  fclose(file);
}
//  After splitting and ejecting some contains, check for discontinuous unitigs.
//
//  A unitig is "discontinuous" if consecutive fragments in its layout fail to
//  overlap by at least minOverlap bases.  Each maximal contiguous run of
//  fragments becomes its own unitig; a lone unmated contained fragment is
//  instead moved back to its container's unitig.
//
void
splitDiscontinuousUnitigs(UnitigVector &unitigs, uint32 minOverlap) {

  writeLog("==> SPLIT DISCONTINUOUS\n");

  uint32                numTested  = 0;
  uint32                numSplit   = 0;
  uint32                numCreated = 0;

  uint32                splitFragsLen = 0;
  uint32                splitFragsMax = 0;
  ufNode               *splitFrags    = NULL;

  //  First pass: normalize coordinates and find the largest unitig.
  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig  *tig = unitigs[ti];

    if ((tig == NULL) || (tig->ufpath.size() < 2))
      continue;

    //  Unitig must be sorted.  Someone upstream is screwing this up.
    tig->sort();

    //  We'll want to build an array of new fragments to split out.  This can be up
    //  to the size of the largest unitig.
    splitFragsMax = MAX(splitFragsMax, tig->ufpath.size());

    //  Check that the unitig starts at position zero.  Not critical for the next loop, but
    //  needs to be done sometime.
    int32   minPos = MIN(tig->ufpath[0].position.bgn, tig->ufpath[0].position.end);

    if (minPos == 0)
      continue;

    writeLog("splitDiscontinuous()-- tig "F_U32" offset messed up; reset by "F_S32".\n", tig->id(), minPos);

    //  Shift every fragment so the layout begins at zero.
    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode  *frg = &tig->ufpath[fi];

      frg->position.bgn -= minPos;
      frg->position.end -= minPos;
    }
  }

  //  Scratch array for the fragments of the piece currently being built.
  splitFrags = new ufNode [splitFragsMax];

  //  Now, finally, we can check for gaps in unitigs.

  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig  *tig = unitigs[ti];

    if ((tig == NULL) || (tig->ufpath.size() < 2))
      continue;

    //  We don't expect many unitigs to be broken, so we'll do a first quick pass to just
    //  test if it is.

    int32  maxEnd   = MAX(tig->ufpath[0].position.bgn, tig->ufpath[0].position.end);
    bool   isBroken = false;

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode  *frg = &tig->ufpath[fi];

      int32    bgn = MIN(frg->position.bgn, frg->position.end);
      int32    end = MAX(frg->position.bgn, frg->position.end);

      //  Gap: the fragment starts past the thick-overlap window.
      if (bgn > maxEnd - minOverlap) {
        isBroken = true;
        break;
      }

      maxEnd = MAX(maxEnd, end);
    }

    numTested++;

    if (isBroken == false)
      continue;

    numSplit++;

    //  Dang, busted unitig.  Fix it up.

    splitFragsLen = 0;
    maxEnd        = 0;

    if (logFileFlagSet(LOG_MATE_SPLIT_DISCONTINUOUS))
      writeLog("splitDiscontinuous()-- discontinuous tig "F_U32" with "F_SIZE_T" fragments broken into:\n",
               tig->id(), tig->ufpath.size());

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode  *frg = &tig->ufpath[fi];

      int32    bgn = MIN(frg->position.bgn, frg->position.end);
      int32    end = MAX(frg->position.bgn, frg->position.end);

      //  Good thick overlap exists to this fragment, save it.

      if (bgn <= maxEnd - minOverlap) {
        assert(splitFragsLen < splitFragsMax);
        splitFrags[splitFragsLen++] = *frg;
        maxEnd = MAX(maxEnd, end);
        continue;
      }

      //  No thick overlap found.  We need to break right here before the current fragment.

      //  If there is exactly one fragment, and it's contained, and it's not mated, move it to the
      //  container.  (This has a small positive benefit over just making every read a singleton).
      //
      if ((splitFragsLen == 1) &&
          (FI->mateIID(splitFrags[0].ident) == 0) &&
          (splitFrags[0].contained != 0)) {
        Unitig  *dangler  = unitigs[tig->fragIn(splitFrags[0].contained)];

        //  If the parent isn't in a unitig, we must have shattered the repeat unitig it was in.
        //  Do the same here.

        if (dangler == NULL) {
          if (logFileFlagSet(LOG_MATE_SPLIT_DISCONTINUOUS))
            writeLog("splitDiscontinuous()-- singleton frag "F_U32" shattered.\n",
                     splitFrags[0].ident);
          Unitig::removeFrag(splitFrags[0].ident);

        } else {
          assert(dangler->id() == tig->fragIn(splitFrags[0].contained));

          if (logFileFlagSet(LOG_MATE_SPLIT_DISCONTINUOUS))
            writeLog("splitDiscontinuous()-- old tig "F_U32" with "F_SIZE_T" fragments (contained frag "F_U32" moved here).\n",
                     dangler->id(), dangler->ufpath.size() + 1, splitFrags[0].ident);

          BestContainment  *bestcont = OG->getBestContainer(splitFrags[0].ident);

          assert(bestcont->isContained == true);

          dangler->addContainedFrag(splitFrags[0].ident, bestcont, false);
          dangler->bubbleSortLastFrag();

          assert(dangler->id() == Unitig::fragIn(splitFrags[0].ident));
        }
      }

      //  Otherwise, make an entirely new unitig for these fragments.
      else {
        numCreated++;
        makeNewUnitig(unitigs, splitFragsLen, splitFrags);
        //  Re-read the pointer; newUnitig() may have reallocated the vector.
        tig = unitigs[ti];
      }

      //  Done with the split, save the current fragment.  This resets everything.

      splitFragsLen = 0;
      splitFrags[splitFragsLen++] = *frg;

      maxEnd = end;
    }

    //  If we did any splitting, then the length of the frags in splitFrags will be less than the length
    //  of the path in the current unitig.  Make a final new unitig for the remaining fragments.
    //
    if (splitFragsLen != tig->ufpath.size()) {
      numCreated++;
      makeNewUnitig(unitigs, splitFragsLen, splitFrags);

      //  Original unitig is fully redistributed; destroy it.
      delete unitigs[ti];
      unitigs[ti] = NULL;
    }
  }

  writeLog("splitDiscontinuous()-- Tested "F_U32" unitigs, split "F_U32" into "F_U32" new unitigs.\n",
           numTested, numSplit, numCreated);

  delete [] splitFrags;
}
void writeUnitigsToStore(UnitigVector &unitigs, char *fileprefix, char *tigStorePath, uint32 frg_count_target, bool isFinal) { uint32 utg_count = 0; uint32 frg_count = 0; uint32 prt_count = 1; char filename[FILENAME_MAX] = {0}; uint32 *partmap = new uint32 [unitigs.size()]; // This code closely follows that in AS_CGB_unitigger.c::output_the_chunks() if (isFinal) checkUnitigMembership(unitigs); // Open up the initial output file sprintf(filename, "%s.iidmap", fileprefix); FILE *iidm = fopen(filename, "w"); assert(NULL != iidm); sprintf(filename, "%s.partitioning", fileprefix); FILE *part = fopen(filename, "w"); assert(NULL != part); sprintf(filename, "%s.partitioningInfo", fileprefix); FILE *pari = fopen(filename, "w"); assert(NULL != pari); // Step through all the unitigs once to build the partition mapping and IID mapping. tgStore *tigStore = new tgStore(tigStorePath); tgTig *tig = new tgTig; for (uint32 tigID=0, ti=0; ti<unitigs.size(); ti++) { Unitig *utg = unitigs[ti]; if ((utg == NULL) || (utg->getNumFrags() == 0)) continue; assert(utg->getLength() > 0); // Convert the bogart tig to a tgTig and save to the store. unitigToTig(tig, (isFinal) ? tigID : ti, utg); tigID++; tigStore->insertTig(tig, false); // Increment the partition if the current one is too large. if ((frg_count + utg->getNumFrags() >= frg_count_target) && (frg_count > 0)) { fprintf(pari, "Partition %d has %d unitigs and %d fragments.\n", prt_count, utg_count, frg_count); prt_count++; utg_count = 0; frg_count = 0; } // Note that the tig is included in this partition. utg_count += 1; frg_count += utg->getNumFrags(); // Map the tig to a partition, and log both the tig-to-partition map and the partition-to-read map. 
fprintf(iidm, "bogart "F_U32" -> tig "F_U32" (in partition "F_U32" with "F_U32" frags)\n", utg->id(), utg->tigID(), prt_count, utg->getNumFrags()); for (uint32 fragIdx=0; fragIdx<utg->getNumFrags(); fragIdx++) fprintf(part, "%d\t%d\n", prt_count, utg->ufpath[fragIdx].ident); } fprintf(pari, "Partition %d has %d unitigs and %d fragments.\n", // Don't forget to log the last partition! prt_count, utg_count, frg_count); fclose(pari); fclose(part); fclose(iidm); delete tig; delete tigStore; }
//  Examine the first (few?) fragments of a unitig, evaluate if they indicate a join should be made.
//
//  Looks at fragment 'idx' counted in from one end of unitig 'fr' (the start
//  if frFirstEnd, else the end), follows that read's best edge off the
//  unitig, and decides whether appending the target unitig is worthwhile.
//  On success a joinEntry is appended to 'joins' and true is returned.
//
static
bool
joinUnitigs_examineEnd(UnitigVector      &unitigs,
                       Unitig            *fr,
                       uint32             idx,
                       bool               frFirstEnd,
                       vector<joinEntry> &joins) {
  //  Index from the appropriate end of the path.
  uint32     frgIdx  = (frFirstEnd) ? (idx) : (fr->ufpath.size() - 1 - idx);
  ufNode    *frg     = &fr->ufpath[frgIdx];
  bool       frgRev  = (frg->position.end < frg->position.bgn);

  //  Grab the best edge for this end frag.  The last arg requests the 3' end if true.
  //
  //  If we're looking at the first read, we want to get:
  //    5' - if the frag is forward
  //    3' - if the frag is reverse  (frgRev == true)
  //
  //  If we're looking at the last read, we want to get:
  //    5' - if the frag is reverse
  //    3' - if the frag is forward  (frgRev == false)
  //
  BestEdgeOverlap *bestEdge = OG->getBestEdgeOverlap(frg->ident, (frgRev == frFirstEnd));

  uint32     tgtId  = bestEdge->fragId();
  bool       tgt3p  = bestEdge->frag3p();

  if (tgtId == 0)
    //  No best edge?  Skip it.
    return(false);

  //  Grab the unitig for that best edge.

  uint32     toID = fr->fragIn(tgtId);
  Unitig    *to   = unitigs[toID];
  //  NOTE(review): 'to' is dereferenced without a NULL check; presumably
  //  a best edge always points into a live unitig here -- confirm.

  if (to->ufpath.size() == 1)
    //  Joining to something teeny?  Don't bother checking further.
    return(false);

  if (to->id() == fr->id())
    //  Join to myself?  Nope.
    return(false);

  //  Grab the read we have an edge to, and compute the overlapping length and left over length.

  ufNode    *tgt    = &to->ufpath[to->pathPosition(tgtId)];
  bool       tgtRev = (tgt->position.end < tgt->position.bgn);

  //  If tgt3p (we overlap to the 3' end) is the same as tgtRev (read is reverse) then the unitig is oriented
  //  correctly.  Otherwise, positions need to be reverse-complemented.

  bool       toFlip = false;

  if ((frFirstEnd == true) && (tgt3p == false) && (tgtRev == false))
    //  source read is at the start, overlap to 5' and the read is forward, need to flip the target unitig
    toFlip = true;

  if ((frFirstEnd == true) && (tgt3p == true) && (tgtRev == true))
    //  source read is at the start, overlap to 3' and the read is reverse, need to flip the target unitig
    toFlip = true;

  if ((frFirstEnd == false) && (tgt3p == false) && (tgtRev == true))
    //  source read is at the end, overlap to 5' and the read is reverse, need to flip the target unitig
    toFlip = true;

  if ((frFirstEnd == false) && (tgt3p == true) && (tgtRev == false))
    //  source read is at the end, overlap to 3' and the read is forward, need to flip the target unitig
    toFlip = true;

  uint32     toMin = MIN(tgt->position.bgn, tgt->position.end);
  uint32     toMax = MAX(tgt->position.bgn, tgt->position.end);
  uint32     toLen = to->getLength();
  uint32     frLen = fr->getLength();

  //  Mirror the target coordinates if the target unitig will be flipped.
  if (toFlip) {
    toMin = toLen - MAX(tgt->position.bgn, tgt->position.end);
    toMax = toLen - MIN(tgt->position.bgn, tgt->position.end);
  }

  assert(toMin < toMax);

  //  Our two unitigs are of length frLen and toLen.  We are appending some portion of 'to' onto
  //  'fr', and 'discarding' the rest.  If the 'discarded' piece is larger than the 'fr' unitig, we
  //  don't want to do the join.
  //
  //  We err on the side of the discarded piece.

  uint32     joinLen = 0;
  uint32     discLen = 0;

  if (frFirstEnd == true) {
    joinLen = toMin + frLen;          //  Prepend the start of 'to' onto 'fr'.
    discLen = toLen - toMin;

  } else {
    joinLen = frLen + toLen - toMax;  //  Append the end of 'to' onto 'fr'.
    discLen = toMax;
  }

  //  If the discard is bigger than us, we do damage by joining.

  if (discLen > frLen)
    return(false);

  //  The joined should be much larger and the discarded much smaller.

  uint32     maxLen = MAX(frLen, toLen);
  uint32     minLen = MIN(frLen, toLen);

  double     joinChange = (double)joinLen / maxLen;
  double     discChange = (double)discLen / minLen;

  bool       isBad = false;

  if ((joinChange < 1.10) ||
      (0.75     < discChange))
    //  Bad if we didn't really change sizes.
    isBad = true;

  if ((1.0 < joinChange) &&
      (discChange < 0.5))
    //  But good if discard is tiny.  This occurs if we merge a small with a big.  The join change
    //  is somewhat small (1.05 say) yet most of the smaller unitig is used.
    isBad = false;

  if (isBad) {
    writeLog("joinUnitigs_examineEnd()-- join unitig %6u (%7ubp) frag %6u %s <-> unitig %6u (%7ubp) frag %6u %s <-> length %5.2f %7u and %5.2f %7u BAD\n",
             fr->id(), fr->getLength(),
             frg->ident, (frgRev) ? "rev" : "fwd",
             to->id(), to->getLength(),
             tgt->ident, (tgtRev) ? "rev" : "fwd",
             joinChange, joinLen,
             discChange, discLen);
    return(false);
  }

  //  OK, join.

  writeLog("joinUnitigs_examineEnd()-- join unitig %6u (%7ubp) frag %6u %s <-> unitig %6u (%7ubp) frag %6u %s <-> length %5.2f %7u and %5.2f %7u\n",
           fr->id(), fr->getLength(),
           frg->ident, (frgRev) ? "rev" : "fwd",
           to->id(), to->getLength(),
           tgt->ident, (tgtRev) ? "rev" : "fwd",
           joinChange, joinLen,
           discChange, discLen);

  joins.push_back(joinEntry(frg->ident, frFirstEnd, tgt->ident, toFlip, joinLen));

  return(true);
}
//  Make sure that contained fragments are in the same unitig
//  as their container.  Due to sorting, contained fragments
//  can come much later in the unitig:
//
//  ------------1
//    -------------2
//       --------------3
//         ----4 (contained in 1, too much error keeps it out of 2 and 3)
//
//  So, our first pass is to move contained fragments around.
//
//  Five cases decide the fate of each fragment (leave in place, move to
//  its container's unitig, or eject to a singleton); at the end any
//  unitig that lost fragments is rebuilt from the surviving list.
//
void UnitigGraph::moveContains(void) {

  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig *thisUnitig = unitigs[ti];

    if ((thisUnitig == NULL) || (thisUnitig->ufpath.size() < 2))
      continue;

    //  Mate-happiness evidence for every read in this unitig.
    MateLocation positions(thisUnitig);

    //  Surviving fragments are copied here; used to rebuild the unitig
    //  only if something was removed.
    ufNode *frags    = new ufNode [thisUnitig->ufpath.size()];
    uint32  fragsLen = 0;

    if (logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS))
      fprintf(logFile, "moveContain unitig %d\n", thisUnitig->id());

    for (uint32 fi=0; fi<thisUnitig->ufpath.size(); fi++) {
      ufNode           *frg      = &thisUnitig->ufpath[fi];

      BestContainment  *bestcont = OG->getBestContainer(frg->ident);
      MateLocationEntry mloc     = positions.getById(frg->ident);

      uint32  thisFrgID = frg->ident;
      uint32  contFrgID = (bestcont) ? bestcont->container : 0;
      uint32  mateFrgID = FI->mateIID(frg->ident);

      uint32  thisUtgID = thisUnitig->fragIn(thisFrgID);
      uint32  contUtgID = thisUnitig->fragIn(contFrgID);
      uint32  mateUtgID = thisUnitig->fragIn(mateFrgID);

      //  id1 != 0 -> we found the fragment in the mate happiness table
      //  isBad    -> and the mate is unhappy.
      //
      //  What's id1 vs id2 in MateLocationEntry?  Dunno.  All I
      //  know is that if there is no mate present, one of those
      //  will be 0.  (Similar test used above too.)
      //
      bool  isMated  = (mateFrgID > 0);
      bool  isGrumpy = ((isMated) &&
                        (mloc.mleFrgID1 != 0) && (mloc.mleFrgID2 != 0) &&
                        (mloc.isGrumpy == true));

      //
      //  Figure out what to do.
      //

      bool  moveToContainer = false;
      bool  moveToSingleton = false;

      if        ((frg->contained == 0) && (bestcont == NULL)) {
        //  CASE 1:  Not contained.  Leave the fragment here.
        //fprintf(logFile, "case1 frag %d fragsLen %d\n", thisFrgID, fragsLen);

      } else if (isMated == false) {
        //  CASE 2:  Contained but not mated.  Move to be with the
        //  container (if the container isn't here).
        //fprintf(logFile, "case2 frag %d contID %d fragsLen %d\n", thisFrgID, contUtgID, fragsLen);

        if (thisUtgID != contUtgID)
          moveToContainer = true;

      } else if ((isGrumpy == true) && (thisUtgID == mateUtgID)) {
        //  CASE 3:  Not happy, and the frag and mate are together.
        //  Kick out to a singleton.
        //fprintf(logFile, "case3 frag %d utg %d mate %d utg %d cont %d utg %d fragsLen %d\n",
        //        thisFrgID, thisUtgID, mateFrgID, mateUtgID, contFrgID, contUtgID, fragsLen);

        if (thisUtgID == mateUtgID)
          moveToSingleton = true;

      } else {

        //  This makes for some ugly code (we break the nice if else
        //  if else structure we had going on) but the next two cases
        //  need to know if there is an overlap to the rest of the
        //  unitig.

        bool  hasOverlap   = (thisUtgID == contUtgID);
        bool  allContained = false;

        if (hasOverlap == false) {
          if (fragsLen == 0) {
            //  The first fragment.  Check fragments after to see if
            //  there is an overlap (note only frags with an overlap
            //  in the layout are tested).  In rare cases, we ejected
            //  the container, and left a containee with no overlap to
            //  fragments remaining.
            //
            //  Note that this checks if there is an overlap to the
            //  very first non-contained (aka dovetail) fragment ONLY.
            //  If there isn't an overlap to the first non-contained
            //  fragment, then that fragment will likely NOT align
            //  correctly.

            uint32 ft = fi + 1;

#warning 2x BUGS IN COMPARISON HERE

            //  Skip all the contains.
            while ((ft < thisUnitig->ufpath.size()) &&
                   (OG->isContained(thisUnitig->ufpath[ft].ident) == true) &&
                   (MAX(frg->position.bgn, frg->position.end) < MIN(thisUnitig->ufpath[ft].position.bgn, thisUnitig->ufpath[ft].position.end)))
              ft++;

            //  If the frag is not contained (we could be the
            //  container), and overlaps in the layout, see if there
            //  is a real overlap.
            if ((ft < thisUnitig->ufpath.size()) &&
                (OG->isContained(thisUnitig->ufpath[ft].ident) == false) &&
                (MAX(frg->position.bgn, frg->position.end) < MIN(thisUnitig->ufpath[ft].position.bgn, thisUnitig->ufpath[ft].position.end)))
              hasOverlap = OG->containHaveEdgeTo(thisFrgID, thisUnitig->ufpath[ft].ident);

          } else {
            //  Not the first fragment, search for an overlap to an
            //  already placed frag.

            uint32  ft = fi;

            do {
              ft--;

              //  OK to overlap to a contained frag; he could be our
              //  container.

              hasOverlap = OG->containHaveEdgeTo(thisFrgID, thisUnitig->ufpath[ft].ident);

              //  Stop if we found an overlap, or we just checked the
              //  first frag in the unitig, or we no longer overlap in
              //  the layout.
            } while ((hasOverlap == false) &&
                     (ft > 0) &&
                     (MIN(frg->position.bgn, frg->position.end) < MAX(thisUnitig->ufpath[ft].position.bgn, thisUnitig->ufpath[ft].position.end)));
          }
        }  //  end of hasOverlap

        //  An unbelievable special case.  When the unitig is just a
        //  single container fragment (and any contained frags under
        //  it) rule 4 breaks.  The first fragment has no overlap (all
        //  later reads are contained) and so we want to eject it to a
        //  new unitig.  Since there are multiple fragments in this
        //  unitig, the ejection occurs.  Later, all the contains get
        //  moved to the new unitig.  And we repeat.  To prevent, we
        //  abort the ejection if the unitig is all contained in one
        //  fragment.
        //
        if (fragsLen == 0) {
          allContained = true;

          for (uint32 ft = fi + 1; ((allContained == true) && (ft < thisUnitig->ufpath.size())); ft++)
            allContained = OG->isContained(thisUnitig->ufpath[ft].ident);
        }

        if (isGrumpy == true) {
          //  CASE 4:  Not happy and not with the mate.  This one is a
          //  bit of a decision.
          //
          //  If an overlap exists to the rest of the unitig, we'll
          //  leave it here.  We'll also leave it here if it is the
          //  rest of the unitig is all contained in this fragment.
          //
          //  If no overlap, and the mate and container are in the
          //  same unitig, we'll just eject.  That also implies the
          //  other unitig is somewhat large, at least as big as the
          //  insert size.
          //
          //  Otherwise, we'll move to the container and cross our
          //  fingers we place it correctly.  The alternative is to
          //  eject, and hope that we didn't also eject the mate to a
          //  singleton.
          //fprintf(logFile, "case4 frag %d utg %d mate %d utg %d cont %d utg %d fragsLen %d\n",
          //        thisFrgID, thisUtgID, mateFrgID, mateUtgID, contFrgID, contUtgID, fragsLen);

          if ((hasOverlap == false) && (allContained == false))
            if (mateUtgID == contUtgID)
              moveToSingleton = true;
            else
              moveToContainer = true;

        } else {
          //  CASE 5:  Happy!  If with container, or an overlap exists to
          //  some earlier fragment, leave it here.  Otherwise, eject it
          //  to a singleton.  The fragment is ejected instead of moved
          //  to be with its container since we don't know which is
          //  correct - the mate or the overlap.
          //
          //  If not happy, we've already made sure that the mate is not
          //  here (that was case 3).
          //fprintf(logFile, "case5 frag %d utg %d mate %d utg %d cont %d utg %d fragsLen %d\n",
          //        thisFrgID, thisUtgID, mateFrgID, mateUtgID, contFrgID, contUtgID, fragsLen);

          //  If no overlap (so not with container or no overlap to
          //  other frags) eject.
          if ((hasOverlap == false) && (allContained == false))
            moveToSingleton = true;
        }
      }  //  End of cases

      //
      //  Do it.
      //

      if (moveToContainer == true) {
        //  Move the fragment to be with its container.

        Unitig         *thatUnitig = unitigs[contUtgID];
        ufNode          containee  = *frg;

        assert(thatUnitig->id() == contUtgID);

        //  Nuke the fragment in the current list
        frg->ident        = 999999999;
        frg->contained    = 999999999;
        frg->position.bgn = 0;
        frg->position.end = 0;

        assert(thatUnitig->id() == contUtgID);

        if (logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS))
          fprintf(logFile, "Moving contained fragment %d from unitig %d to be with its container %d in unitig %d\n",
                  thisFrgID, thisUtgID, contFrgID, contUtgID);

        assert(bestcont->container == contFrgID);

        thatUnitig->addContainedFrag(thisFrgID, bestcont, logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS));
        assert(thatUnitig->id() == Unitig::fragIn(thisFrgID));

      } else if ((moveToSingleton == true) && (thisUnitig->getNumFrags() != 1)) {
        //  Eject the fragment to a singleton (unless we ARE the singleton)
        Unitig        *singUnitig = new Unitig(logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS));
        ufNode         containee  = *frg;

        //  Nuke the fragment in the current list
        frg->ident        = 999999999;
        frg->contained    = 999999999;
        frg->position.bgn = 0;
        frg->position.end = 0;

        if (logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS))
          fprintf(logFile, "Ejecting unhappy contained fragment %d from unitig %d into new unitig %d\n",
                  thisFrgID, thisUtgID, singUnitig->id());

        containee.contained = 0;

        singUnitig->addFrag(containee, -MIN(containee.position.bgn, containee.position.end), logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS));

        unitigs.push_back(singUnitig);

        thisUnitig = unitigs[ti];  //  Reset the pointer; unitigs might be reallocated

      } else {
        //  Leave fragment here.  Copy the fragment to the list -- if
        //  we need to rebuild the unitig (because fragments were
        //  removed), the list is used, otherwise, we have already
        //  made the changes needed.
        //
        //  Also, very important, update our containment mark.  If our
        //  container was moved, but we stayed put because of a happy
        //  mate, we're still marked as being contained.  Rather than
        //  put this check in all the places where we stay put in the
        //  above if-else-else-else, it's here.

        if ((frg->contained) && (thisUtgID != contUtgID))
          frg->contained = 0;

        frags[fragsLen] = *frg;
        fragsLen++;
      }
    }  //  over all frags

    //  Now, rebuild this unitig if we made changes.

    if (fragsLen != thisUnitig->ufpath.size()) {
      if (logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS))
        fprintf(logFile, "Rebuild unitig %d after removing contained fragments.\n", thisUnitig->id());

      thisUnitig->ufpath.clear();

      //  Occasionally, we move all fragments out of the original unitig.  Might be worth checking
      //  if that makes sense!!
      //
#warning EMPTIED OUT A UNITIG
      if (fragsLen > 0) {
        //  No need to resort.  Offsets only need adjustment if the first fragment is thrown out.
        //  If not, splitOffset will be zero.
        //
        int splitOffset = -MIN(frags[0].position.bgn, frags[0].position.end);

        //  This is where we clean up from the splitting not dealing with contained fragments -- we
        //  force the first frag to be uncontained.
        //
        frags[0].contained = 0;

        for (uint32 i=0; i<fragsLen; i++)
          thisUnitig->addFrag(frags[i], splitOffset, logFileFlagSet(LOG_MATE_SPLIT_UNHAPPY_CONTAINS));
      }
    }

    delete [] frags;
    frags = NULL;

  }  //  Over all unitigs
}
//  Build the list of unitig intersection points: places where a read's best
//  edge leaves its unitig (or points within its unitig but without a layout
//  overlap -- a self-intersection).  Confirmed edges (target fragments that
//  do overlap in the layout) are excluded.  Results are sorted by
//  intersected fragment ID and indexed via isectsNum/isectsMap.
//
intersectionList::intersectionList(UnitigVector &unitigs) {

  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig             *tig = unitigs[ti];

    if (tig == NULL)
      continue;

    //  Per-fragment evidence, indexed by position in the path.
    intersectionEvidence *evidence = new intersectionEvidence [tig->ufpath.size()];

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode  *frg = &tig->ufpath[fi];

      if (OG->isContained(frg->ident))
        continue;

      //  For my best overlap, the ID of the unitig that the overlapping fragment is in.

      evidence[fi].edge5 = *OG->getBestEdgeOverlap(frg->ident, false);
      evidence[fi].edge3 = *OG->getBestEdgeOverlap(frg->ident, true);

      evidence[fi].frag5tig = tig->fragIn(evidence[fi].edge5.fragId());
      evidence[fi].frag3tig = tig->fragIn(evidence[fi].edge3.fragId());

      //  Do NOT initialize these!  An earlier fragment could have already confirmed an end.
      //  Properly, only the 5' end of a forward fragment (or 3' end of a reverse fragment) can be
      //  confirmed already (otherwise the tig is nonsense), but we don't yet check that.
      //
      //evidence[fi].frag5confirmed = false;
      //evidence[fi].frag3confirmed = false;

      //  But, because the path could be promiscuous, not every overlap to a different tig is bad.
      //
      //  If my best overlap is to a different tig, but there is an overlapping fragment (in the
      //  unitig placement) with a best edge to me, I'm still good.  The BOG build this unitig using
      //  the edge from the other fragment to me.
      //
      //  If the fragments do not overlap in the layout (yet the best edge still exists) that is a
      //  self-intersection.
      //
      //  The two blocks are identical, except for 'edge3' and 'edge5'.

      if (evidence[fi].frag5tig == tig->id()) {
        //  NOTE(review): this inner 'ti' intentionally shadows the outer
        //  loop variable -- it is the path position of the edge TARGET,
        //  and the confirmation below is recorded on the target's
        //  evidence entry, not on fragment fi.
        uint32   ti  = tig->pathPosition(evidence[fi].edge5.fragId());
        ufNode  *trg = &tig->ufpath[ti];

        uint32  minf = (frg->position.bgn < frg->position.end) ? frg->position.bgn : frg->position.end;
        uint32  maxf = (frg->position.bgn < frg->position.end) ? frg->position.end : frg->position.bgn;

        uint32  mint = (trg->position.bgn < trg->position.end) ? trg->position.bgn : trg->position.end;
        uint32  maxt = (trg->position.bgn < trg->position.end) ? trg->position.end : trg->position.bgn;

        //  If they overlap, mark as confirmed, else remember an intersection.

        if (((minf < mint) && (mint < maxf)) ||  //  t begins inside f
            ((mint < minf) && (minf < maxt))) {  //  f begins inside t
          if (evidence[fi].edge5.frag3p())
            evidence[ti].frag3confirmed = true;
          else
            evidence[ti].frag5confirmed = true;

        } else {
          evidence[fi].frag5self = true;

          //  Not the correct place to report this.  Some of these get confirmed by later fragments.
          //writeLog("BUG1 F: %d,%d T %d,%d\n", minf, maxf, mint, maxt);
          //writeLog("INTERSECT from unitig %d frag %d end %d TO unitig %d frag %d end %d (SELF)\n",
          //        tig->id(), frg->ident, 5, evidence[fi].frag5tig, evidence[fi].edge5.fragId(), evidence[fi].edge5.frag3p() ? 3 : 5);
        }
      }

      if (evidence[fi].frag3tig == tig->id()) {
        //  Same shadowing as above: 'ti' is the target's path position.
        uint32   ti  = tig->pathPosition(evidence[fi].edge3.fragId());
        ufNode  *trg = &tig->ufpath[ti];

        uint32  minf = (frg->position.bgn < frg->position.end) ? frg->position.bgn : frg->position.end;
        uint32  maxf = (frg->position.bgn < frg->position.end) ? frg->position.end : frg->position.bgn;

        uint32  mint = (trg->position.bgn < trg->position.end) ? trg->position.bgn : trg->position.end;
        uint32  maxt = (trg->position.bgn < trg->position.end) ? trg->position.end : trg->position.bgn;

        if (((minf < mint) && (mint < maxf)) ||  //  t begins inside f
            ((mint < minf) && (minf < maxt))) {  //  f begins inside t
          if (evidence[fi].edge3.frag3p())
            evidence[ti].frag3confirmed = true;
          else
            evidence[ti].frag5confirmed = true;

        } else {
          evidence[fi].frag3self = true;

          //  Not the correct place to report this.  Some of these get confirmed by later fragments.
          //writeLog("BUG2 F: %d,%d T %d,%d\n", minf, maxf, mint, maxt);
          //writeLog("INTERSECT from unitig %d frag %d end %d TO unitig %d frag %d end %d (SELF)\n",
          //        tig->id(), frg->ident, 3, evidence[fi].frag3tig, evidence[fi].edge3.fragId(), evidence[fi].edge3.frag3p() ? 3 : 5);
        }
      }
    }

    //
    //  Build the list.
    //

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode  *frg = &tig->ufpath[fi];

      //  Unconfirmed 5' edge to a different unitig: a real intersection.
      if ((evidence[fi].frag5tig != 0) &&
          (evidence[fi].frag5tig != tig->id()) &&
          (evidence[fi].frag5confirmed == false))
        isects.push_back(intersectionPoint(evidence[fi].edge5, frg->ident, false, false));

      //  Unconfirmed 5' edge within this unitig: a self-intersection.
      if ((evidence[fi].frag5tig == tig->id()) &&
          (evidence[fi].frag5self == true) &&
          (evidence[fi].frag5confirmed == false))
        isects.push_back(intersectionPoint(evidence[fi].edge5, frg->ident, false, true));

      if ((evidence[fi].frag3tig != 0) &&
          (evidence[fi].frag3tig != tig->id()) &&
          (evidence[fi].frag3confirmed == false))
        isects.push_back(intersectionPoint(evidence[fi].edge3, frg->ident, true, false));

      if ((evidence[fi].frag3tig == tig->id()) &&
          (evidence[fi].frag3self == true) &&
          (evidence[fi].frag3confirmed == false))
        isects.push_back(intersectionPoint(evidence[fi].edge3, frg->ident, true, true));
    }

    delete [] evidence;
  }

  //  Sort the intersections by the ID of the intersected fragment, then build an index into the array.

  std::sort(isects.begin(), isects.end());

  //  Terminate the intersection list with a sentinal intersection.  This is CRITICAL
  //  to the way we iterate over intersections.

  isects.push_back(intersectionPoint(BestEdgeOverlap(), 0, true, true));

  //  Build a map from fragment id to the first intersection in the list.

  for (uint32 i=0; i<isects.size(); i++) {
    isectsNum[isects[i].isectFrg]++;

    if (isectsMap.find(isects[i].isectFrg) == isectsMap.end())
      isectsMap[isects[i].isectFrg] = i;
  }
}
//  For every read in every unitig, set the read's 'parent' (the read it should be
//  aligned against by consensus) and the a/b hangs of that overlap.  Parent/hangs
//  are used only to PLACE the read; orientation comes from the absolute layout
//  coordinates.  Parents come from the best container (for contained reads) or
//  from the best 5'/3' edges (for dovetail reads).
//
void
UnitigGraph::setParentAndHang(void) {

  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig  *utg = unitigs[ti];

    if (utg == NULL)
      continue;

    if (utg->ufpath.size() == 0)
      continue;

    //  Reset parent and hangs for everything.
    //  NOTE(review): this reset starts at fi=1, so ufpath[0] keeps whatever
    //  parent/hangs it had -- presumably the first read never has a parent set,
    //  but confirm against the callers.

    for (uint32 fi=1; fi<utg->ufpath.size(); fi++) {
      ufNode *frg = &utg->ufpath[fi];

      frg->parent = 0;
      frg->ahang  = 0;
      frg->bhang  = 0;
    }

    //  For each fragment, set parent/hangs using the edges.

    for (uint32 fi=0; fi<utg->ufpath.size(); fi++) {
      ufNode *frg = &utg->ufpath[fi];

      //  If we're contained, gee, I sure hope the container is here!

      BestContainment *bestcont = OG->getBestContainer(frg->ident);

      if ((bestcont) && (utg->fragIn(bestcont->container) == utg->id())) {
        int32   pi  = utg->pathPosition(bestcont->container);
        ufNode *par = &utg->ufpath[pi];

        frg->parent = bestcont->container;

        //  The hangs assume the container is forward; adjust if not so.

        if (par->position.bgn < par->position.end) {
          frg->ahang = bestcont->a_hang;
          frg->bhang = bestcont->b_hang;
        } else {
          frg->ahang = -bestcont->b_hang;
          frg->bhang = -bestcont->a_hang;
        }

        continue;
      }

      //  Nope, not contained.  If we don't have a parent set, see if one of our best overlaps
      //  can set it.

      BestEdgeOverlap *bestedge5 = OG->getBestEdgeOverlap(frg->ident, false);
      BestEdgeOverlap *bestedge3 = OG->getBestEdgeOverlap(frg->ident, true);

      if ((bestedge5->fragId()) && (utg->fragIn(bestedge5->fragId()) == utg->id())) {
        int32   pi5 = utg->pathPosition(bestedge5->fragId());
        ufNode *oth = &utg->ufpath[pi5];

        //  Consensus is expected parent/hangs to be relative to the parent fragment.  This is used
        //  ONLY to place the fragment, not to orient the fragment.  Orientation comes from the
        //  absolute positioning coordinates.
        //
        //  Interestingly, all four overlap transformations are used here.
        //
        //  The inner if tests (on fragment orientation) should be asserts, but due to imprecise
        //  layouts, they are sometimes violated:
        //    A fragment from 271-547 had a 5'overlap to something after it;
        //    the frag after was at 543-272, close enough to a tie to screw up placements
        //

        if (pi5 < fi) {
          //  We have an edge off our 5' end to something before us --> fragment MUST be forward.
          //  Flip the overlap so it is relative to the other fragment.

          if (frg->position.bgn < frg->position.end) {
            frg->parent = bestedge5->fragId();
            frg->ahang  = -bestedge5->ahang();
            frg->bhang  = -bestedge5->bhang();
            assert(frg->ahang >= 0);
          }

        } else {
          //  We have an edge off our 5' end to something after us --> fragment MUST be reverse.
          //  Because our fragment is now reverse, we must reverse the overlap too.

          if (frg->position.end < frg->position.bgn) {
            oth->parent = frg->ident;
            oth->ahang  = -bestedge5->bhang();
            oth->bhang  = -bestedge5->ahang();
            assert(oth->ahang >= 0);
          }
        }
      }

      if ((bestedge3->fragId()) && (utg->fragIn(bestedge3->fragId()) == utg->id())) {
        int32   pi3 = utg->pathPosition(bestedge3->fragId());
        ufNode *oth = &utg->ufpath[pi3];

        if (pi3 < fi) {
          //  We have an edge off our 3' end to something before us --> fragment MUST be reverse.
          //  Flip the overlap so it is relative to the other fragment.
          //  Because our fragment is now reverse, we must reverse the overlap too.

          if (frg->position.end < frg->position.bgn) {
            frg->parent = bestedge3->fragId();
            frg->ahang  = bestedge3->bhang();
            frg->bhang  = bestedge3->ahang();
            assert(frg->ahang >= 0);
          }

        } else {
          //  We have an edge off our 3' end to something after us --> fragment MUST be forward.
          //  This is the simplest case, the overlap is already correct.

          if (frg->position.bgn < frg->position.end) {
            oth->parent = frg->ident;
            oth->ahang  = bestedge3->ahang();
            oth->bhang  = bestedge3->bhang();
            assert(oth->ahang >= 0);
          }
        }
      }
    }
  }
}
//  Scan every unitig and decide if it could be a bubble sitting inside some larger
//  unitig.  A tig is a potential bubble if (at least) BUBBLE_READ_FRACTION of its
//  non-contained reads have overlaps to the same longer tig.  Candidate target tig
//  IDs are appended to potentialBubbles[ti].
//
void
findPotentialBubbles(UnitigVector       &unitigs,
                     BubTargetList      &potentialBubbles) {
  uint32  tiLimit      = unitigs.size();
  uint32  tiNumThreads = omp_get_max_threads();
  //  NOTE(review): tiBlockSize (and fiBlockSize below) are computed but never used in this
  //  function -- presumably left over from, or intended for, an omp schedule clause; confirm.
  uint32  tiBlockSize  = (tiLimit < 100000 * tiNumThreads) ? tiNumThreads : tiLimit / 99999;

  writeStatus("\n");
  writeStatus("bubbleDetect()-- working on "F_U32" unitigs, with "F_U32" threads.\n", tiLimit, tiNumThreads);

  for (uint32 ti=0; ti<tiLimit; ti++) {
    Unitig  *tig = unitigs[ti];

    if ((tig == NULL) ||             //  Not a tig, ignore it.
        (tig->ufpath.size() == 1))   //  Singleton, handled elsewhere.
      continue;

    uint32  nonContainedReads = 0;
    bool    validBubble       = true;

    map<uint32,uint32>  tigOlapsTo;   //  target tig id -> count of our reads with overlaps to it

    uint32  fiLimit      = tig->ufpath.size();
    uint32  fiNumThreads = omp_get_max_threads();
    uint32  fiBlockSize  = (fiLimit < 100 * fiNumThreads) ? fiNumThreads : fiLimit / 99;

    //  The loop stops early (validBubble goes false) as soon as no candidate target
    //  covers enough of the reads seen so far.

    for (uint32 fi=0; (validBubble == true) && (fi<fiLimit); fi++) {
      uint32      rid = tig->ufpath[fi].ident;

      if (OG->isContained(rid) == true)   //  Don't need to check contained reads.  If their container
        continue;                         //  passes the tests below, the contained read will too.

      nonContainedReads++;

      uint32      ovlLen = 0;
      BAToverlap *ovl    = OC->getOverlaps(rid, AS_MAX_ERATE, ovlLen);

      set<uint32>  readOlapsTo;   //  distinct candidate target tigs for this one read

      for (uint32 oi=0; oi<ovlLen; oi++) {
        uint32  ovlTigID = Unitig::fragIn(ovl[oi].b_iid);
        Unitig *ovlTig   = unitigs[ovlTigID];

        //  Skip this overlap if it is to an unplaced read, to a singleton tig, to ourself,
        //  or to a unitig that is shorter than us.  We can not pop this tig as a bubble
        //  in any of those cases.

        if ((ovlTigID == 0) ||
            (ovlTig == NULL) ||
            (ovlTig->ufpath.size() == 1) ||
            (ovlTig->id() == tig->id()) ||
            (ovlTig->getLength() < tig->getLength()))
          continue;

        //  Otherwise, remember that we had an overlap to ovlTig.

        //writeLog("tig %u read %u overlap to tig %u read %u\n",
        //         tig->id(), rid, ovlTigID, ovl[oi].b_iid);

        readOlapsTo.insert(ovlTigID);
      }

      //writeLog("tig %8u read %8u has %u olaps\n", tig->id(), rid, readOlapsTo.size());

      //  Transfer the per-read counts to the per-unitig counts: add one to the counter for each tig
      //  that we have overlaps to.

      for (set<uint32>::iterator it=readOlapsTo.begin(); it != readOlapsTo.end(); ++it)
        tigOlapsTo[*it]++;

      //  Decide if we're a valid potential bubble.  If tig id (in it->first) has overlaps to every
      //  read we've seen so far (nonContainedReads), we're still a valid bubble.
      //
      //  To _attempt_ to have differences in the bubble, we'll accept it if 3/4 of the reads
      //  have overlaps.

      validBubble = false;

      for (map<uint32,uint32>::iterator it=tigOlapsTo.begin(); it != tigOlapsTo.end(); ++it)
        if (it->second >= BUBBLE_READ_FRACTION * nonContainedReads)
          validBubble = true;

      //  If we've not seen that many reads, pretend it's a valid bubble.  It'll get screened out later.

      if (nonContainedReads < 16)
        validBubble = true;
    }

    //  If not validBubble, report.

#if 0
    if (validBubble == false) {
      writeLog("notValidBubble tig %8d expects %6u reads\n", tig->id(), nonContainedReads);

      for (map<uint32,uint32>::iterator it=tigOlapsTo.begin(); it != tigOlapsTo.end(); ++it)
        writeLog(" to tig %8u overlaps %6u\n", it->first, it->second);
    }
#endif

    //  If validBubble, then there is a tig that every dovetail read has at least one overlap to.
    //  Save those tigs in potentialBubbles.

    uint32  nTigs = 0;

    if (validBubble) {
      for (map<uint32,uint32>::iterator it=tigOlapsTo.begin(); it != tigOlapsTo.end(); ++it)
        if (it->second >= BUBBLE_READ_FRACTION * nonContainedReads)
          nTigs++;
    }

    //  ALWAYS log potential bubbles.

    if (nTigs > 0) {
      writeLog("\n");
      writeLog("potential bubble tig %8u length %9u nReads %7u to %3u tigs:\n",
               tig->id(), tig->getLength(), tig->ufpath.size(), nTigs);

      for (map<uint32,uint32>::iterator it=tigOlapsTo.begin(); it != tigOlapsTo.end(); ++it) {
        if (it->second >= BUBBLE_READ_FRACTION * nonContainedReads) {
          Unitig  *dest = unitigs[it->first];

          writeLog(" tig %8u length %9u nReads %7u\n", dest->id(), dest->getLength(), dest->ufpath.size());

          potentialBubbles[ti].push_back(dest->id());
        }
      }
    }
  }

  flushLog();
}
//  Pop bubbles.  For each tig flagged as a potential bubble (by findPotentialBubbles()),
//  find all the places its reads could go in the candidate target tigs, then classify:
//    - orphan: every read places in one region of one target; the reads are moved there
//      and the bubble tig is deleted;
//    - bubble: both ends of the tig anchor in the same target region; the tig is kept
//      but flagged _isBubble (so repeat detection can skip it);
//    - repeat: more than one orphan/bubble placement; flagged _isRepeat.
//
//  BUGFIX over the original: the candidatePop objects pushed into 'targets' were never
//  deleted on any exit path, leaking once per processed bubble.  They are now released
//  on every path.  The unused local 'nTargets' was also removed.
//
void
popBubbles(UnitigVector &unitigs,
           double        deviationBubble) {
  BubTargetList   potentialBubbles;

  findPotentialBubbles(unitigs, potentialBubbles);

  writeStatus("popBubbles()-- Found "F_SIZE_T" potential bubbles.\n", potentialBubbles.size());

  //if (potentialBubbles.size() == 0)
  //  return;

  writeLog("\n");
  writeLog("Found "F_SIZE_T" potential bubbles.\n", potentialBubbles.size());
  writeLog("\n");

  vector<overlapPlacement>   *placed = findBubbleReadPlacements(unitigs, potentialBubbles, deviationBubble);

  //  We now have, in 'placed', a list of all the places that each read could be placed.  Decide if there is a _single_
  //  place for each bubble to be popped.

  uint32  tiLimit      = unitigs.size();
  //uint32  tiNumThreads = omp_get_max_threads();
  //uint32  tiBlockSize  = (tiLimit < 100000 * tiNumThreads) ? tiNumThreads : tiLimit / 99999;

  //  Clear flags.
  for (uint32 ti=0; ti<tiLimit; ti++) {
    if (unitigs[ti]) {
      unitigs[ti]->_isBubble = false;
      unitigs[ti]->_isRepeat = false;
    }
  }

  //  In parallel, process the placements.

  for (uint32 ti=0; ti<tiLimit; ti++) {
    if (potentialBubbles.count(ti) == 0)   //  Not a potential bubble
      continue;

    //  Scan the bubble, decide if there are _ANY_ read placements.  Log appropriately.

    Unitig  *bubble        = unitigs[ti];
    bool     hasPlacements = false;

    for (uint32 fi=0; fi<bubble->ufpath.size(); fi++) {
      uint32  readID = bubble->ufpath[fi].ident;

      if (placed[readID].size() > 0)
        hasPlacements = true;
    }

    if (hasPlacements == false)
      writeLog("potential bubble %u had no valid placements (all were not contained in target tig)\n", ti);
    else
      writeLog("potential bubble %u\n", ti);

    //  Split the placements into piles for each target and build an interval list for each target.
    //  For each read in the tig, convert the vector of placements into interval lists, one list per target tig.

    map<uint32, intervalList<uint32> *>  targetIntervals;

    for (uint32 fi=0; fi<bubble->ufpath.size(); fi++) {
      uint32  readID = bubble->ufpath[fi].ident;

      for (uint32 pp=0; pp<placed[readID].size(); pp++) {
        uint32  tid = placed[readID][pp].tigID;

        assert(placed[readID][pp].frgID > 0);

        uint32  bgn = (placed[readID][pp].position.bgn < placed[readID][pp].position.end) ? placed[readID][pp].position.bgn : placed[readID][pp].position.end;
        uint32  end = (placed[readID][pp].position.bgn < placed[readID][pp].position.end) ? placed[readID][pp].position.end : placed[readID][pp].position.bgn;

        if (targetIntervals[tid] == NULL)
          targetIntervals[tid] = new intervalList<uint32>;

        //writeLog("read %u -> tig %u intervals %u-%u\n", readID, tid, bgn, end);

        targetIntervals[tid]->add(bgn, end-bgn);
      }
    }

    vector<candidatePop *>    targets;

    //  Squish the intervals.  Create new candidatePops for each interval that isn't too big or
    //  small.  Assign each overlapPlacements to the correct candidatePop.

    for (map<uint32, intervalList<uint32> *>::iterator it=targetIntervals.begin(); it != targetIntervals.end(); ++it) {
      uint32                 targetID = it->first;
      intervalList<uint32>  *IL       = it->second;

      IL->merge();

      //  Discard intervals that are significantly too small or large.  Save the ones that are
      //  nicely sized.  Logging here isn't terribly useful, it's just repeated (out of order) later
      //  when we try to make sense of the read alignments.

      for (uint32 ii=0; ii<IL->numberOfIntervals(); ii++) {
        if ((IL->hi(ii) - IL->lo(ii) < 0.75 * bubble->getLength()) ||   //  Too small!
            (1.25 * bubble->getLength() < IL->hi(ii) - IL->lo(ii))) {   //  Too big!
          writeLog("tig %8u length %9u -> target %8u piece %2u position %9u-%9u length %8u - size mismatch, discarded\n",
                   bubble->id(), bubble->getLength(),
                   targetID, ii, IL->lo(ii), IL->hi(ii), IL->hi(ii) - IL->lo(ii));
          continue;
        }

        writeLog("tig %8u length %9u -> target %8u piece %2u position %9u-%9u length %8u\n",
                 bubble->id(), bubble->getLength(),
                 targetID, ii, IL->lo(ii), IL->hi(ii), IL->hi(ii) - IL->lo(ii));

        targets.push_back(new candidatePop(bubble, unitigs[targetID], IL->lo(ii), IL->hi(ii)));
      }

      delete IL;
    }

    targetIntervals.clear();

    //  If no targets, nothing to do.

    if (targets.size() == 0)
      continue;

    //  Run through the placements again, and assign them to the correct target.
    //
    //  For each read:
    //    For each acceptable placement:
    //      For each target location:
    //        If the placement is for this target, save it.

    for (uint32 fi=0; fi<bubble->ufpath.size(); fi++) {
      uint32  readID = bubble->ufpath[fi].ident;

      for (uint32 pp=0; pp<placed[readID].size(); pp++) {
        uint32  tid = placed[readID][pp].tigID;

        uint32  bgn = (placed[readID][pp].position.bgn < placed[readID][pp].position.end) ? placed[readID][pp].position.bgn : placed[readID][pp].position.end;
        uint32  end = (placed[readID][pp].position.bgn < placed[readID][pp].position.end) ? placed[readID][pp].position.end : placed[readID][pp].position.bgn;

        for (uint32 tt=0; tt<targets.size(); tt++)
          if ((targets[tt]->target->id() == tid) && (targets[tt]->bgn < end) && (bgn < targets[tt]->end))
            targets[tt]->placed.push_back(placed[readID][pp]);
      }
    }

    //  Count the number of targets that have all the reads (later: in the correct order, etc, etc).  Remove those
    //  that don't.

    set<uint32>  tigReads;  //  Reads in the bubble tig.
    set<uint32>  tgtReads;  //  Reads in the bubble that have a placement in the target.

    //  Remove duplicate placements from each target.

    for (uint32 tt=0; tt<targets.size(); tt++) {
      candidatePop *t = targets[tt];

      //  Detect duplicates, keep the one with lower error.  There are a lot of duplicate
      //  placements, logging isn't terribly useful.

      for (uint32 aa=0; aa<t->placed.size(); aa++) {
        for (uint32 bb=0; bb<t->placed.size(); bb++) {
          if ((aa == bb) ||
              (t->placed[aa].frgID != t->placed[bb].frgID) ||
              (t->placed[aa].frgID == 0) ||
              (t->placed[bb].frgID == 0))
            continue;

          if (t->placed[aa].errors / t->placed[aa].aligned < t->placed[bb].errors / t->placed[bb].aligned) {
#ifdef SHOW_MULTIPLE_PLACEMENTS
            writeLog("duplicate read alignment for tig %u read %u - better %u-%u %.4f - worse %u-%u %.4f\n",
                     t->placed[aa].tigID, t->placed[aa].frgID,
                     t->placed[aa].position.bgn, t->placed[aa].position.end, t->placed[aa].errors / t->placed[aa].aligned,
                     t->placed[bb].position.bgn, t->placed[bb].position.end, t->placed[bb].errors / t->placed[bb].aligned);
#endif
            t->placed[bb] = overlapPlacement();
          } else {
#ifdef SHOW_MULTIPLE_PLACEMENTS
            writeLog("duplicate read alignment for tig %u read %u - better %u-%u %.4f - worse %u-%u %.4f\n",
                     t->placed[aa].tigID, t->placed[aa].frgID,
                     t->placed[bb].position.bgn, t->placed[bb].position.end, t->placed[bb].errors / t->placed[bb].aligned,
                     t->placed[aa].position.bgn, t->placed[aa].position.end, t->placed[aa].errors / t->placed[aa].aligned);
#endif
            t->placed[aa] = overlapPlacement();
          }
        }
      }

      //  Get rid of any now-empty entries.

      for (uint32 aa=t->placed.size(); aa--; ) {
        if (t->placed[aa].frgID == 0) {
          t->placed[aa] = t->placed.back();
          t->placed.pop_back();
        }
      }
    }

    //  Make a set of the reads in the bubble.  We'll compare each target against this to decide if all reads are placed.

    for (uint32 fi=0; fi<bubble->ufpath.size(); fi++)
      tigReads.insert(bubble->ufpath[fi].ident);

    uint32   nOrphan      = 0;   //  Full coverage; bubble can be popped.
    uint32   orphanTarget = 0;

    uint32   nBubble      = 0;   //  Partial coverage, bubble cannot be popped.
    uint32   bubbleTarget = 0;

    for (uint32 tt=0; tt<targets.size(); tt++) {
      tgtReads.clear();

      for (uint32 op=0; op<targets[tt]->placed.size(); op++) {
        if (logFileFlagSet(LOG_BUBBLE_DETAIL))
          writeLog("tig %8u length %9u -> target %8u piece %2u position %9u-%9u length %8u - read %7u at %9u-%9u\n",
                   bubble->id(), bubble->getLength(),
                   targets[tt]->target->id(), tt, targets[tt]->bgn, targets[tt]->end, targets[tt]->end - targets[tt]->bgn,
                   targets[tt]->placed[op].frgID,
                   targets[tt]->placed[op].position.bgn, targets[tt]->placed[op].position.end);

        assert(targets[tt]->placed[op].frgID > 0);
        tgtReads.insert(targets[tt]->placed[op].frgID);
      }

      //  Count the number of consecutive reads from the 5' or 3' end of the bubble that are placed
      //  in the target.
      //
      //  Also, count the number of reads in the bubble that are placed in the target.  Likely the
      //  same as n5 + n3.

      uint32  n5 = 0;
      uint32  n3 = 0;
      uint32  nt = 0;

      for (uint32 fi=0; fi<bubble->ufpath.size(); fi++)
        if (tgtReads.count(bubble->ufpath[fi].ident) > 0)
          n5++;
        else
          break;

      for (uint32 fi=bubble->ufpath.size(); fi-->0; )
        if (tgtReads.count(bubble->ufpath[fi].ident) > 0)
          n3++;
        else
          break;

      for (uint32 fi=0; fi<bubble->ufpath.size(); fi++)
        if (tgtReads.count(bubble->ufpath[fi].ident) > 0)
          nt++;

      //  Report now, before we nuke targets[tt] for being not a bubble!

      if ((nt == bubble->ufpath.size()) || ((n5 > 0) && (n3 > 0)))
        writeLog("tig %8u length %9u -> target %8u piece %2u position %9u-%9u length %8u - expected %3"F_SIZE_TP" reads, had %3"F_SIZE_TP" reads. n5=%3u n3=%3u nt=%3u\n",
                 bubble->id(), bubble->getLength(),
                 targets[tt]->target->id(), tt, targets[tt]->bgn, targets[tt]->end, targets[tt]->end - targets[tt]->bgn,
                 tigReads.size(), tgtReads.size(), n5, n3, nt);

      //  Decide if this is a bubble, orphan from construction, or repeat.

      if (nt == bubble->ufpath.size()) {
        nOrphan++;
        orphanTarget = tt;
      }

      else if ((n5 > 0) && (n3 > 0)) {
        nBubble++;
        bubbleTarget = tt;
      }
    }

    //  If no placements, pbbbt.

    if (nOrphan + nBubble == 0) {
      //writeLog("tig %8u length %8u reads %6u had no bubble or orphan placements.\n", bubble->id(), bubble->getLength(), bubble->ufpath.size());
      for (uint32 tt=0; tt<targets.size(); tt++)   //  Release the candidatePops (leaked in the original).
        delete targets[tt];
      continue;
    }

    //  If multiple orphan and/or bubble placements, it's a repeat.

    if (nOrphan + nBubble > 1) {
      writeLog("tig %8u length %8u reads %6u - repeat - %u orphan %u bubble placements.\n",
               bubble->id(), bubble->getLength(), bubble->ufpath.size(), nOrphan, nBubble);
      writeLog("\n");
      bubble->_isRepeat = true;
      for (uint32 tt=0; tt<targets.size(); tt++)   //  Release the candidatePops (leaked in the original).
        delete targets[tt];
      continue;
    }

    //  If a bubble placement, mark it as a bubble so it can be skipped during repeat detection.

    if (nBubble > 0) {
      writeLog("tig %8u length %8u reads %6u - bubble\n",
               bubble->id(), bubble->getLength(), bubble->ufpath.size());
      writeLog("\n");
      bubble->_isBubble = true;
      for (uint32 tt=0; tt<targets.size(); tt++)   //  Release the candidatePops (leaked in the original).
        delete targets[tt];
      continue;
    }

    //  Otherwise, it's an orphan, move the reads to the proper place.

    writeLog("tig %8u length %8u reads %6u - orphan\n",
             bubble->id(), bubble->getLength(), bubble->ufpath.size());

    for (uint32 op=0, tt=orphanTarget; op<targets[tt]->placed.size(); op++) {
      ufNode  frg;

      frg.ident        = targets[tt]->placed[op].frgID;
      frg.contained    = 0;
      frg.parent       = 0;
      frg.ahang        = 0;
      frg.bhang        = 0;
      frg.position.bgn = targets[tt]->placed[op].position.bgn;
      frg.position.end = targets[tt]->placed[op].position.end;

      writeLog("move read %u from tig %u to tig %u %u-%u\n",
               frg.ident, bubble->id(), targets[tt]->target->id(), frg.position.bgn, frg.position.end);

      targets[tt]->target->addFrag(frg, 0, false);
    }

    for (uint32 tt=0; tt<targets.size(); tt++)   //  Release the candidatePops (leaked in the original).
      delete targets[tt];

    writeLog("\n");

    unitigs[bubble->id()] = NULL;
    delete bubble;
  }  //  Over all bubbles

  writeLog("\n");   //  Needed if no bubbles are popped.

  delete [] placed;

  //  Sort reads in all the tigs.  Overkill, but correct.

  for (uint32 ti=0; ti<tiLimit; ti++) {
    Unitig  *tig = unitigs[ti];

    if ((tig == NULL) ||               //  Not a tig, ignore it.
        (tig->ufpath.size() == 1))     //  Singleton, already sorted.
      continue;

    tig->sort();
  }
}
//  For each unitig, collect its reads plus any of their mates that are not yet placed
//  in any unitig, tear the unitig down, and rebuild it with a BestOverlapGraph /
//  ChunkGraph restricted to just that read set.  This can pull the extra mates into
//  the rebuilt unitig(s).  Swaps the global OG/CG during the rebuild and restores
//  them afterward.
//
void
extendByMates(UnitigVector &unitigs,
              double        erateGraph) {

  //logFileFlags |= LOG_CHUNK_GRAPH;
  logFileFlags |= LOG_POPULATE_UNITIG;

  writeLog("==> EXTENDING UNITIGS WITH MATE PAIRS.\n");

  uint32  tiMax = unitigs.size();

  for (uint32 ti=0; ti<tiMax; ti++) {
    Unitig  *target = unitigs[ti];

    if (target == NULL)
      continue;

    if (target->ufpath.size() < 2)
      continue;

    //  Build a list of all the fragments in this unitig, and any mates that are not in a unitig.

    uint32  extraMates = 0;

    for (uint32 fi=0; fi<target->ufpath.size(); fi++) {
      uint32  fid = target->ufpath[fi].ident;
      uint32  mid = FI->mateIID(fid);

      if ((mid != 0) && (Unitig::fragIn(mid) == 0))
        extraMates++;
    }

    writeLog("\n");
    writeLog("unitig "F_U32" of size "F_SIZE_T" with "F_U32" extra fragments via mates\n",
             ti, target->ufpath.size(), extraMates);

    if (extraMates == 0)
      continue;

    //  Build a set of the fragments in this unitig plus their mates, and a set of just the mates.

    set<uint32>  frags;
    set<uint32>  mates;

    for (uint32 fi=0; fi<target->ufpath.size(); fi++) {
      uint32  fid = target->ufpath[fi].ident;
      uint32  mid = FI->mateIID(fid);

      frags.insert(fid);

      if ((mid != 0) && (Unitig::fragIn(mid) == 0)) {
        writeLog(" mate frag "F_U32"\n", mid);
        frags.insert(mid);
        mates.insert(mid);
      }
    }

    //  Now, remove all the unitig fragments from the unitig so we can reconstruct it with the
    //  additional mated fragments.  Note that this loop cannot be combined with the last, since
    //  the test for 'additional mate' is 'not in the same unitig' -- and if we remove the
    //  fragments too early, we can't distinguish 'additional' from 'included'.
    //
    //  NOTE(review): this iterates forward over ufpath while calling removeFrag() on each
    //  entry -- safe only if removeFrag() does not erase entries from ufpath (e.g. if it just
    //  clears the fragment-to-unitig registration); confirm against Unitig::removeFrag().

    for (uint32 fi=0; fi<target->ufpath.size(); fi++)
      target->removeFrag(target->ufpath[fi].ident);

    unitigs[ti] = NULL;
    delete target;

    //  Build a new BOG for just those fragments - in particular, only overlaps within the set are
    //  used for the BOG.

    BestOverlapGraph  *OGsave = OG;
    ChunkGraph        *CGsave = CG;

    OG = new BestOverlapGraph(erateGraph, &frags);
    CG = new ChunkGraph(&frags);

    uint32  numTigs = unitigs.size();

    //  Build new unitigs.  There should only be one new unitig constructed, but that isn't
    //  guaranteed.  No new unitigs are built if they are seeded from the mate fragments.  This
    //  isn't ideal -- we'd like to allow the first unitig (supposedly the longest) to start from
    //  a mate fragment.  However, consider the not-so-rare case where the original unitig is two
    //  backbone fragments and lots of contains.  Those contains contribute mate pairs that all
    //  assemble together, giving a longer path than the original unitig.  We don't want to
    //  assemble the mated fragments yet (we'll wait until we get the rest of the fragments that
    //  could assemble together).

    for (uint32 fi = CG->nextFragByChunkLength(); fi > 0; fi=CG->nextFragByChunkLength()) {
      if ((Unitig::fragIn(fi) != 0) ||
          (mates.count(fi) > 0))
        //  Fragment already in a unitig, or is an additional mate that we don't want
        //  to seed from.
        continue;

      populateUnitig(unitigs, fi);
    }

    //  Report what was constructed

    if (unitigs.size() - numTigs > 1)
      writeLog("WARNING: mate extension split a unitig.\n");

    for (uint32 newTigs=numTigs; newTigs<unitigs.size(); newTigs++) {
      Unitig  *tig = unitigs[newTigs];

      if (tig == NULL)
        continue;

      placeContainsUsingBestOverlaps(tig, &frags);

      writeLog(" new tig "F_U32" with "F_SIZE_T" fragments\n", tig->id(), tig->ufpath.size());
    }

    //  Restore the global graphs.

    delete OG;
    delete CG;

    OG = OGsave;
    CG = CGsave;
  }
}
//  Write all unitigs to a tgStore at tigStorePath, and emit three text files based on
//  fileprefix: '.iidmap' (unitig id -> IUM id -> partition), '.partitioning' (one
//  'partition fragment' pair per read), and '.partitioningInfo' (per-partition summary).
//  Partitions are cut so each holds roughly frg_count_target fragments.  When isFinal,
//  unitig membership is sanity checked first, and tigs are renumbered densely (iumiid);
//  otherwise the original index ti is used.
//
void
writeUnitigsToStore(UnitigVector  &unitigs,
                    char          *fileprefix,
                    char          *tigStorePath,
                    uint32         frg_count_target,
                    bool           isFinal) {
  uint32  utg_count = 0;
  uint32  frg_count = 0;
  uint32  prt_count = 1;
  char    filename[FILENAME_MAX] = {0};

  uint32 *partmap = new uint32 [unitigs.size()];   //  tig id -> partition number (logged to iidmap)

  //  This code closely follows that in AS_CGB_unitigger.c::output_the_chunks()

  if (isFinal)
    checkUnitigMembership(unitigs);

  //  Open up the initial output file

  sprintf(filename, "%s.iidmap", fileprefix);
  FILE *iidm = fopen(filename, "w");
  assert(NULL != iidm);

  sprintf(filename, "%s.partitioning", fileprefix);
  FILE *part = fopen(filename, "w");
  assert(NULL != part);

  sprintf(filename, "%s.partitioningInfo", fileprefix);
  FILE *pari = fopen(filename, "w");
  assert(NULL != pari);

  //  Step through all the unitigs once to build the partition mapping and IID mapping.

  memset(partmap, 0xff, sizeof(uint32) * unitigs.size());

  for (uint32 iumiid=0, ti=0; ti<unitigs.size(); ti++) {
    Unitig  *utg = unitigs[ti];
    uint32   nf  = (utg) ? utg->getNumFrags() : 0;

    if ((utg == NULL) || (nf == 0))
      continue;

    assert(utg->getLength() > 0);
    assert(nf == utg->ufpath.size());

    //  Close out the current partition once it has at least one fragment and
    //  adding this tig would reach the target size.

    if ((frg_count + nf >= frg_count_target) &&
        (frg_count > 0)) {
      fprintf(pari, "Partition %d has %d unitigs and %d fragments.\n",
              prt_count, utg_count, frg_count);

      prt_count++;
      utg_count = 0;
      frg_count = 0;
    }

    uint32 tigid = (isFinal) ? iumiid : ti;

    assert(tigid < unitigs.size());
    partmap[tigid] = prt_count;

    fprintf(iidm, "Unitig "F_U32" == IUM "F_U32" (in partition "F_U32" with "F_U32" frags)\n",
            utg->id(), (tigid), partmap[(tigid)], nf);

    for (uint32 fragIdx=0; fragIdx<nf; fragIdx++) {
      ufNode  *f = &utg->ufpath[fragIdx];

      fprintf(part, "%d\t%d\n", prt_count, f->ident);
    }

    utg_count += 1;
    frg_count += nf;

    iumiid++;
  }

  //  Flush the final (possibly short) partition summary.

  fprintf(pari, "Partition %d has %d unitigs and %d fragments.\n",
          prt_count, utg_count, frg_count);

  fclose(pari);
  fclose(part);
  fclose(iidm);

  //  Step through all the unitigs a second time, converting each to a tgTig and
  //  inserting it into the tigStore.

  tgStore  *tigStore = new tgStore(tigStorePath);
  tgTig    *tig      = new tgTig;

  for (uint32 iumiid=0, ti=0; ti<unitigs.size(); ti++) {
    Unitig  *utg = unitigs[ti];
    uint32   nf  = (utg) ? utg->getNumFrags() : 0;

    if ((utg == NULL) || (nf == 0))
      continue;

    unitigToTig(tig, (isFinal) ? iumiid : ti, utg);

    tigStore->insertTig(tig, false);

    iumiid++;
  }

  delete tig;
  delete tigStore;

  delete [] partmap;
}
void breakUnitigs(UnitigVector &unitigs, char *output_prefix, bool enableIntersectionBreaking) { writeLog("==> BREAKING UNITIGS.\n"); intersectionList *ilist = new intersectionList(unitigs); // Stop when we've seen all current unitigs. Replace tiMax // in the for loop below with unitigs.size() to recursively // split unitigs. uint32 tiMax = unitigs.size(); for (uint32 ti=0; ti<tiMax; ti++) { Unitig *tig = unitigs[ti]; if (tig == NULL) continue; vector<breakPoint> breaks; for (uint32 fi=0; fi<tig->ufpath.size(); fi++) { ufNode *frg = &tig->ufpath[fi]; intersectionPoint *isect = ilist->getIntersection(frg->ident, 0); if (isect == NULL) continue; for (; isect->isectFrg == frg->ident; isect++) { assert(tig->id() == Unitig::fragIn(isect->isectFrg)); // Grab the invading unitig Unitig *inv = unitigs[Unitig::fragIn(isect->invadFrg)]; assert(inv->id() == Unitig::fragIn(isect->invadFrg)); // Grab the best edges off the invading fragment. BestEdgeOverlap *best5 = OG->getBestEdgeOverlap(isect->invadFrg, false); BestEdgeOverlap *best3 = OG->getBestEdgeOverlap(isect->invadFrg, true); // Check if the incoming tig is a spur, and we should just ignore it immediately if ((inv->ufpath.size() == 1) && ((best5->fragId() == 0) || (best3->fragId() == 0))) { if (logFileFlagSet(LOG_INTERSECTION_BREAKING)) writeLog("unitig %d frag %d end %c' into unitig %d frag %d end %c' -- IS A SPUR, skip it\n", inv->id(), isect->invadFrg, isect->invad3p ? '3' : '5', tig->id(), isect->isectFrg, isect->isect3p ? '3' : '5'); continue; } // Keep only significant intersections if ((inv->getLength() > MIN_BREAK_LENGTH) && (inv->ufpath.size() > MIN_BREAK_FRAGS)) { if (logFileFlagSet(LOG_INTERSECTION_BREAKING)) writeLog("unitig %d frag %d end %c' into unitig %d frag %d end %c'\n", inv->id(), isect->invadFrg, isect->invad3p ? '3' : '5', tig->id(), isect->isectFrg, isect->isect3p ? 
'3' : '5'); breaks.push_back(breakPoint(isect->isectFrg, isect->isect3p, true, false)); } } // Over all incoming fragments // If this is the last fragment, terminate the break point list with a 'fakeEnd' (in AS_BAT_Breaking.cc) break point // at the end of the unitig. if ((fi+1 == tig->ufpath.size()) && (breaks.size() > 0)) { breaks.push_back(breakPoint(frg->ident, (frg->position.bgn < frg->position.end), true, false)); } } // Over all fragments in the unitig if (breaks.size() == 0) continue; // Report where breaks occur. 'breaks' is a list, not a vector. #if 0 // We've lost the fields in breaks[i] -- but the reports above aren't updated yet. if (logFileFlagSet(LOG_INTERSECTION_BREAKING) || logFileFlagSet(LOG_MATE_SPLIT_COVERAGE_PLOT)) for (uint32 i=0; i<breaks.size(); i++) writeLog("BREAK unitig %d at position %d,%d from inSize %d inFrags %d.\n", tig->id(), breaks[i].fragPos.bgn, breaks[i].fragPos.end, breaks[i].inSize, breaks[i].inFrags); #endif // Actually do the breaking. if (enableIntersectionBreaking) breakUnitigAt(unitigs, tig, breaks, true); breaks.clear(); } // Over all unitigs }
//  Find 'zombie' fragments -- reads that are alive (not deleted) but ended up in no
//  living unitig -- and resurrect each one as its own new singleton unitig.
//
//  A proper fix would reload overlaps and find a new container for each zombie;
//  that is not implemented, so singletons it is.
//
void
placeZombies(UnitigVector &unitigs, double erate) {

  writeLog("==> SEARCHING FOR ZOMBIES\n");

  uint32  nReads    = FI->numFragments() + 1;
  uint32 *tigOfRead = new uint32 [nReads];
  int     nZombies  = 0;

  //  Presume every read is dead, then mark the ones found in a living unitig.

  for (uint32 fid=0; fid<nReads; fid++)
    tigOfRead[fid] = noUnitig;

  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig *utg = unitigs[ti];

    if (utg == NULL)
      continue;

    for (uint32 fi=0; fi<utg->ufpath.size(); fi++)
      tigOfRead[utg->ufpath[fi].ident] = utg->id();
  }

  //  Any live read still unmarked is a zombie; give it a fresh singleton unitig
  //  spanning the whole read.

  for (uint32 fid=0; fid<nReads; fid++) {
    if (FI->fragmentLength(fid) == 0)   //  Deleted fragment
      continue;

    if (tigOfRead[fid] != noUnitig)     //  Valid fragment in a unitig
      continue;

    Unitig *utg = unitigs.newUnitig(false);
    ufNode  node;

    node.ident             = fid;
    node.contained         = 0;
    node.parent            = 0;
    node.ahang             = 0;
    node.bhang             = 0;
    node.position.bgn      = 0;
    node.position.end      = FI->fragmentLength(fid);
    node.containment_depth = 0;

    utg->addFrag(node, 0, false);

    writeLog("placeZombies()-- unitig %d created from zombie fragment %d\n", utg->id(), fid);

    nZombies++;
  }

  writeLog("RESURRECTED %d ZOMBIE FRAGMENT%s.\n", nZombies, (nZombies != 1) ? "s" : "");

  delete [] tigOfRead;
}
//  Exploratory search for 'mate bubbles': small unitigs whose external mates mostly
//  point at one other unitig.  Only logs candidates; nothing is modified.
//
//  BUGFIXES over the original:
//    - if a read's mate was not placed in any unitig, fragIn() returned 0 and the
//      NULL unitigs[0] was dereferenced; such mates are now skipped.
//    - 'lkg' was leaked on the early 'no external mates' continue.
//    - three never-used tally counters (nBubblePopped, nBubbleTooBig,
//      nBubbleConflict) were removed.
//
void
popMateBubbles(UnitigVector &unitigs) {

  writeLog("==> SEARCHING FOR MATE BUBBLES\n");

  //  For each unitig, if all (or most) of the external mates are to a single other unitig (not
  //  counting singletons), then this is a potential bubble popping unitig.
  //
  //  At present, this is exploratory only.

  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig  *tig = unitigs[ti];

    if ((tig == NULL) ||
        (tig->ufpath.size() == 0))
      //  No tig here.
      continue;

    if ((tig->getLength() > 1000) ||
        (tig->ufpath.size() >= 3000))
      //  Tig too big.
      continue;

    //if ((tig->getLength() < 150) ||
    //    (tig->ufpath.size() < 5))
    //  //  Tig too small.
    //  continue;

    uint32  *lkg    = new uint32 [tig->ufpath.size()];   //  Tig IDs holding our external mates.
    uint32   lkgLen = 0;
    uint32   lkgExt = 0;                                 //  Number of external mates seen.

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode  *frg    = &tig->ufpath[fi];
      int32    frgID  = frg->ident;
      int32    matID  = FI->mateIID(frgID);
      uint32   mtigID = 0;
      Unitig  *mtig   = 0L;

      if (matID == 0)
        //  No mate.
        continue;

      mtigID = tig->fragIn(matID);
      mtig   = unitigs[mtigID];

      if ((mtigID == 0) || (mtig == NULL))
        //  Mate is not placed in any unitig; nothing to vote for.  (The original
        //  dereferenced a NULL mtig below in this case.)
        continue;

      if (mtigID == tig->id())
        //  Mate is not external.
        continue;

      lkgExt++;

      if (mtig->ufpath.size() < 2)
        //  Mate is in singleton.
        continue;

      lkg[lkgLen++] = mtigID;
    }

    if (lkgLen == 0) {
      //  No external mates.
      delete [] lkg;   //  Was leaked here in the original.
      continue;
    }

    //  Sort the target tig IDs, then scan runs of identical IDs; any tig holding
    //  more than three of our external mates is reported as a potential bubble home.

    sort(lkg, lkg+lkgLen);

    uint32  last = lkg[0];
    uint32  lcnt = 1;

    for (uint32 i=1; i<lkgLen; i++) {
      if (last != lkg[i]) {
        if ((lcnt > 3))
          writeLog("popMateBubble()-- tig %d len %d might pop bubble in tig %u (%u mates in there out of %d external mates)\n",
                   tig->id(), tig->getLength(), last, lcnt, lkgExt);
        last = lkg[i];
        lcnt = 0;
      }
      lcnt++;
    }

    //  Report the final run too; the loop above only reports when the ID changes.

    if ((lcnt > 3))
      writeLog("popMateBubble()-- tig %d len %d might pop bubble in tig %u (%u mates in there out of %d external mates)\n",
               tig->id(), tig->getLength(), last, lcnt, lkgExt);

    delete [] lkg;
  }
}
void markRepeatReads(UnitigVector &unitigs, double deviationRepeat, uint32 confusedAbsolute, double confusedPercent) { uint32 tiLimit = unitigs.size(); uint32 numThreads = omp_get_max_threads(); uint32 blockSize = (tiLimit < 100000 * numThreads) ? numThreads : tiLimit / 99999; writeLog("repeatDetect()-- working on "F_U32" unitigs, with "F_U32" threads.\n", tiLimit, numThreads); vector<olapDat> repeatOlaps; // Overlaps to reads promoted to tig coords intervalList<int32> tigMarksR; // Marked repeats based on reads, filtered by spanning reads intervalList<int32> tigMarksU; // Non-repeat invervals, just the inversion of tigMarksR for (uint32 ti=0; ti<tiLimit; ti++) { Unitig *tig = unitigs[ti]; if (tig == NULL) continue; if (tig->ufpath.size() == 1) continue; vector<olapDat> repeats; writeLog("Annotating repeats in reads for tig %u/%u.\n", ti, tiLimit); // Clear out all the existing marks. They're not for this tig. // Analyze overlaps for each read. For each overlap to a read not in this tig, or not // overlapping in this tig, and of acceptable error rate, add the overlap to repeatOlaps. repeatOlaps.clear(); uint32 fiLimit = tig->ufpath.size(); uint32 numThreads = omp_get_max_threads(); uint32 blockSize = (fiLimit < 100 * numThreads) ? numThreads : fiLimit / 99; #pragma omp parallel for if(fiLimit > 100) schedule(dynamic, blockSize) for (uint32 fi=0; fi<fiLimit; fi++) annotateRepeatsOnRead(unitigs, tig, &tig->ufpath[fi], deviationRepeat, repeatOlaps); writeLog("Annotated with %lu overlaps.\n", repeatOlaps.size()); // Merge marks for the same read into the largest possible. 
sort(repeatOlaps.begin(), repeatOlaps.end(), olapDatByEviRid); #ifdef SHOW_ANNOTATE for (uint32 ii=0; ii<repeatOlaps.size(); ii++) if (repeatOlaps[ii].tigbgn < 1000000) writeLog("repeatOlaps[%u] %u-%u from tig %u read %u RAW\n", ii, repeatOlaps[ii].tigbgn, repeatOlaps[ii].tigend, repeatOlaps[ii].eviTid, repeatOlaps[ii].eviRid); flushLog(); #endif for (uint32 dd=0, ss=1; ss<repeatOlaps.size(); ss++) { assert(repeatOlaps[dd].eviRid <= repeatOlaps[ss].eviRid); // If different evidence reads, close the destination olap, set up // for a new destination. if (repeatOlaps[dd].eviRid != repeatOlaps[ss].eviRid) { dd = ss; continue; } // If the destination ends before the source begins, there is no overlap between the // two regions. Close dd, set up for a new dd. if (repeatOlaps[dd].tigend <= repeatOlaps[ss].tigbgn) { dd = ss; continue; } // Otherwise, there must be an overlap. Extend the destination region, erase the source // region. repeatOlaps[dd].tigbgn = min(repeatOlaps[ss].tigbgn, repeatOlaps[dd].tigbgn); repeatOlaps[dd].tigend = max(repeatOlaps[ss].tigend, repeatOlaps[dd].tigend); repeatOlaps[ss].tigbgn = UINT32_MAX; repeatOlaps[ss].tigend = UINT32_MAX; repeatOlaps[ss].eviTid = UINT32_MAX; repeatOlaps[ss].eviRid = UINT32_MAX; } // Sort overlaps again. This pushes all those 'erased' regions to the end of the list, which // we can then just pop off. sort(repeatOlaps.begin(), repeatOlaps.end(), olapDatByEviRid); for (uint32 ii=repeatOlaps.size(); ii--; ) if (repeatOlaps[ii].eviTid == UINT32_MAX) repeatOlaps.pop_back(); // For logging, sort by coordinate sort(repeatOlaps.begin(), repeatOlaps.end()); #ifdef SHOW_ANNOTATE for (uint32 ii=0; ii<repeatOlaps.size(); ii++) writeLog("repeatOlaps[%d] %u-%u from tig %u read %u MERGED\n", ii, repeatOlaps[ii].tigbgn, repeatOlaps[ii].tigend, repeatOlaps[ii].eviTid, repeatOlaps[ii].eviRid); #endif // Make a new set of intervals based on all the detected repeats. 
tigMarksR.clear(); for (uint32 bb=0, ii=0; ii<repeatOlaps.size(); ii++) tigMarksR.add(repeatOlaps[ii].tigbgn, repeatOlaps[ii].tigend - repeatOlaps[ii].tigbgn); // Collapse these markings Collapse all the read markings to intervals on the unitig, merging those that overlap // significantly. writeLog("Merge marks.\n"); tigMarksR.merge(REPEAT_OVERLAP_MIN); // Scan reads, discard any mark that is contained in a read // // We don't need to filterShort() after every one is removed, but it's simpler to do it Right Now than // to track if it is needed. writeLog("Scan reads to discard spanned repeats.\n"); for (uint32 fi=0; fi<tig->ufpath.size(); fi++) { ufNode *frg = &tig->ufpath[fi]; bool frgfwd = (frg->position.bgn < frg->position.end); int32 frglo = (frgfwd) ? frg->position.bgn : frg->position.end; int32 frghi = (frgfwd) ? frg->position.end : frg->position.bgn; bool discarded = false; for (uint32 ri=0; ri<tigMarksR.numberOfIntervals(); ri++) { bool spanLo = false; bool spanHi = false; // The decision of 'spanned by a read' is broken into two pieces: does the read span the // lower (higher) boundary of the region. To be spanned, the boundary needs to be spanned // by at least MIN_ANCHOR_HANG additional bases (to anchor the read to non-repeat // sequence). // // This is a problem at the start/end of the tig, beacuse no read will extend past the // start/end of the tig. Instead, if the repeat is contained within the first (last) read // with no extension at the respective end, it is spanned. 
if ((frglo == 0) && // Read at start of tig, spans off the high end (tigMarksR.hi(ri) + MIN_ANCHOR_HANG <= frghi)) spanLo = spanHi = true; if ((frghi == tig->getLength()) && // Read at end of tig, spans off the low end (frglo + MIN_ANCHOR_HANG <= tigMarksR.lo(ri))) spanLo = spanHi = true; if (frglo + MIN_ANCHOR_HANG <= tigMarksR.lo(ri)) // Read spanned off the low end spanLo = true; if (tigMarksR.hi(ri) + MIN_ANCHOR_HANG <= frghi) // Read spanned off the high end spanHi = true; if (spanLo && spanHi) { writeLog("discard region %8d:%-8d - contained in read %6u %8d-%8d\n", tigMarksR.lo(ri), tigMarksR.hi(ri), frg->ident, frglo, frghi); tigMarksR.lo(ri) = 0; tigMarksR.hi(ri) = 0; discarded = true; } } if (discarded) tigMarksR.filterShort(1); } // Run through again, looking for the thickest overlap(s) to the remaining regions. // This isn't caring about the end effect noted above. #if 1 writeLog("thickest edges to the repeat regions:\n"); for (uint32 ri=0; ri<tigMarksR.numberOfIntervals(); ri++) { uint32 t5 = UINT32_MAX, l5 = 0, t5bgn, t5end; uint32 t3 = UINT32_MAX, l3 = 0, t3bgn, t3end; for (uint32 fi=0; fi<tig->ufpath.size(); fi++) { ufNode *frg = &tig->ufpath[fi]; bool frgfwd = (frg->position.bgn < frg->position.end); int32 frglo = (frgfwd) ? frg->position.bgn : frg->position.end; int32 frghi = (frgfwd) ? frg->position.end : frg->position.bgn; bool discarded = false; // Overlap off the 5' end of the region. if (frglo <= tigMarksR.lo(ri) && (tigMarksR.lo(ri) <= frghi)) { uint32 olap = frghi - tigMarksR.lo(ri); if (l5 < olap) { l5 = olap; t5 = fi; t5bgn = frglo; // Easier than recomputing it later on... t5end = frghi; } } // Overlap off the 3' end of the region. 
if (frglo <= tigMarksR.hi(ri) && (tigMarksR.hi(ri) <= frghi)) { uint32 olap = tigMarksR.hi(ri) - frglo; if (l3 < olap) { l3 = olap; t3 = fi; t3bgn = frglo; t3end = frghi; } } if (frglo <= tigMarksR.lo(ri) && (tigMarksR.hi(ri) <= frghi)) { writeLog("saved region %8d:%-8d - closest read %6u (%+6d) %8d:%-8d (%+6d) (contained)\n", tigMarksR.lo(ri), tigMarksR.hi(ri), frg->ident, tigMarksR.lo(ri) - frglo, frglo, frghi, frghi - tigMarksR.hi(ri)); } } if (t5 != UINT32_MAX) writeLog("saved region %8d:%-8d - closest 5' read %6u (%+6d) %8d:%-8d (%+6d)\n", tigMarksR.lo(ri), tigMarksR.hi(ri), tig->ufpath[t5].ident, tigMarksR.lo(ri) - t5bgn, t5bgn, t5end, t5end - tigMarksR.hi(ri)); if (t3 != UINT32_MAX) writeLog("saved region %8d:%-8d - closest 3' read %6u (%+6d) %8d:%-8d (%+6d)\n", tigMarksR.lo(ri), tigMarksR.hi(ri), tig->ufpath[t3].ident, tigMarksR.lo(ri) - t3bgn, t3bgn, t3end, t3end - tigMarksR.hi(ri)); } #endif // Scan reads. If a read intersects a repeat interval, and the best edge for that read // is entirely in the repeat region, decide if there is a near-best edge to something // not in this tig. // // A region with no such near-best edges is _probably_ correct. writeLog("search for confused edges:\n"); uint32 *isConfused = new uint32 [tigMarksR.numberOfIntervals()]; memset(isConfused, 0, sizeof(uint32) * tigMarksR.numberOfIntervals()); for (uint32 fi=0; fi<tig->ufpath.size(); fi++) { ufNode *rdA = &tig->ufpath[fi]; uint32 rdAid = rdA->ident; bool rdAfwd = (rdA->position.bgn < rdA->position.end); int32 rdAlo = (rdAfwd) ? rdA->position.bgn : rdA->position.end; int32 rdAhi = (rdAfwd) ? 
rdA->position.end : rdA->position.bgn; double sc = (rdAhi - rdAlo) / (double)FI->fragmentLength(rdAid); if ((OG->isContained(rdAid) == true) || (OG->isSuspicious(rdAid) == true)) continue; for (uint32 ri=0; ri<tigMarksR.numberOfIntervals(); ri++) { uint32 rMin = tigMarksR.lo(ri); uint32 rMax = tigMarksR.hi(ri); if ((rdAhi < rMin) || // Read ends before the region (rMax < rdAlo)) // Read starts after the region continue; // -> don't care about this read! // Compute the position (in the tig) of the best overlaps. int32 tig5bgn=0, tig5end=0; int32 tig3bgn=0, tig3end=0; // Instead of using the best edge - which might not be the edge used in the unitig - // we need to scan the layout to return the previous/next dovetail // Put this in a function - what to return if no best overlap? BestEdgeOverlap *b5 = OG->getBestEdgeOverlap(rdAid, false); BestEdgeOverlap *b3 = OG->getBestEdgeOverlap(rdAid, true); // If the best edge is to a read not in this tig, there is nothing to compare against. // Is this confused by default? Possibly. The unitig was constructed somehow, and that // must then be the edge coming into us. We'll pick it up later. bool b5use = true; bool b3use = true; if (b5->fragId() == 0) b5use = false; if (b3->fragId() == 0) b3use = false; if ((b5use) && (Unitig::fragIn(b5->fragId()) != tig->id())) b5use = false; if ((b3use) && (Unitig::fragIn(b3->fragId()) != tig->id())) b3use = false; // The best edge read is in this tig. If they don't overlap, again, nothing to compare // against. if (b5use) { ufNode *rdB = &tig->ufpath[Unitig::pathPosition(b5->fragId())]; uint32 rdBid = rdB->ident; bool rdBfwd = (rdB->position.bgn < rdB->position.end); int32 rdBlo = (rdBfwd) ? rdB->position.bgn : rdB->position.end; int32 rdBhi = (rdBfwd) ? 
rdB->position.end : rdB->position.bgn; if ((rdAhi < rdBlo) || (rdBhi < rdAlo)) b5use = false; } if (b3use) { ufNode *rdB = &tig->ufpath[Unitig::pathPosition(b3->fragId())]; uint32 rdBid = rdB->ident; bool rdBfwd = (rdB->position.bgn < rdB->position.end); int32 rdBlo = (rdBfwd) ? rdB->position.bgn : rdB->position.end; int32 rdBhi = (rdBfwd) ? rdB->position.end : rdB->position.bgn; if ((rdAhi < rdBlo) || (rdBhi < rdAlo)) b3use = false; } // If we can use this edge, compute the placement of the overlap on the unitig. // Call #1; if (b5use) { int32 bgn=0, end=0; olapToReadCoords(rdA, b5->ahang(), b5->bhang(), bgn, end); tig5bgn = (rdAfwd) ? (rdAlo + sc * bgn) : (rdAhi - sc * end); tig5end = (rdAfwd) ? (rdAlo + sc * end) : (rdAhi - sc * bgn); assert(tig5bgn < tig5end); if (tig5bgn < 0) tig5bgn = 0; if (tig5end > tig->getLength()) tig5end = tig->getLength(); } // Call #2 if (b3use) { int32 bgn=0, end=0; olapToReadCoords(rdA, b3->ahang(), b3->bhang(), bgn, end); tig3bgn = (rdAfwd) ? (rdAlo + sc * bgn) : (rdAhi - sc * end); tig3end = (rdAfwd) ? (rdAlo + sc * end) : (rdAhi - sc * bgn); assert(tig3bgn < tig3end); if (tig3bgn < 0) tig3bgn = 0; if (tig3end > tig->getLength()) tig3end = tig->getLength(); } // If either of the 5' or 3' overlaps (or both!) are in the repeat region, we need to check for // close overlaps on that end. uint32 len5 = 0; uint32 len3 = 0; if ((rMin < tig5bgn) && (tig5end < rMax) && (b5use)) len5 = FI->overlapLength(rdAid, b5->fragId(), b5->ahang(), b5->bhang()); else b5use = false; if ((rMin < tig3bgn) && (tig3end < rMax) && (b3use)) len3 = FI->overlapLength(rdAid, b3->fragId(), b3->ahang(), b3->bhang()); else b3use = false; double score5 = len5 * (1 - b5->erate()); double score3 = len3 * (1 - b3->erate()); // Neither of the best edges are in the repeat region; move to the next region and/or read. if (len5 + len3 == 0) continue; // At least one of the best edge overlaps is in the repeat region. 
Scan for other edges // that are of comparable length and quality. uint32 ovlLen = 0; BAToverlap *ovl = OC->getOverlaps(rdAid, AS_MAX_ERATE, ovlLen); for (uint32 oo=0; oo<ovlLen; oo++) { uint32 rdBid = ovl[oo].b_iid; uint32 tgBid = Unitig::fragIn(rdBid); // If the read is in a singleton, skip. These are unassembled crud. if ((tgBid == 0) || (unitigs[tgBid] == NULL) || (unitigs[tgBid]->ufpath.size() == 1)) continue; // If the read is in an annotated bubble, skip. if (unitigs[tgBid]->_isBubble) continue; // Skip if this overlap is the best we're trying to match. if ((rdBid == b5->fragId()) || (rdBid == b3->fragId())) continue; // Skip if this overlap is crappy quality if (OG->isOverlapBadQuality(ovl[oo])) continue; // Skip if the read is contained or suspicious. if ((OG->isContained(rdBid) == true) || (OG->isSuspicious(rdBid) == true)) continue; // Skip if the overlap isn't dovetail. bool ovl5 = ovl[oo].AEndIs5prime(); bool ovl3 = ovl[oo].AEndIs3prime(); if ((ovl5 == false) && (ovl3 == false)) continue; // Skip if we're not using this overlap if ((ovl5 == true) && (b5use == false)) continue; if ((ovl3 == true) && (b3use == false)) continue; uint32 rdBpos = unitigs[tgBid]->pathPosition(rdBid); ufNode *rdB = &unitigs[tgBid]->ufpath[rdBpos]; bool rdBfwd = (rdB->position.bgn < rdB->position.end); int32 rdBlo = (rdBfwd) ? rdB->position.bgn : rdB->position.end; int32 rdBhi = (rdBfwd) ? rdB->position.end : rdB->position.bgn; // If the overlap is to a read in a different tig, or // the overlap is to a read in the same tig, but we don't overlap in the tig, check lengths. // Otherwise, the overlap is present in the tig, and can't be confused. if ((tgBid == tig->id()) && (rdBlo <= rdAhi) && (rdAlo <= rdBhi)) continue; uint32 len = FI->overlapLength(rdAid, ovl[oo].b_iid, ovl[oo].a_hang, ovl[oo].b_hang); double score = len * (1 - ovl[oo].erate); // Compute percent difference. 
double ad5 = fabs(score - score5); double ad3 = fabs(score - score3); double pd5 = 200 * ad5 / (score + score5); double pd3 = 200 * ad3 / (score + score3); // Skip if this overlap is vastly worse than the best. if ((ovl5 == true) && ((ad5 >= confusedAbsolute) || (pd3 > confusedPercent))) { writeLog("tig %7u read %8u pos %7u-%-7u NOT confused by 5' edge to read %8u - best edge read %8u len %6u erate %.4f score %8.2f - alt edge len %6u erate %.4f score %8.2f - absdiff %8.2f percdiff %8.4f\n", tig->id(), rdAid, rdAlo, rdAhi, rdBid, b5->fragId(), len5, b5->erate(), score5, len, ovl[oo].erate, score, ad5, pd5); continue; } if ((ovl3 == true) && ((ad3 >= confusedAbsolute) || (pd3 > confusedPercent))) { writeLog("tig %7u read %8u pos %7u-%-7u NOT confused by 3' edge to read %8u - best edge read %8u len %6u erate %.4f score %8.2f - alt edge len %6u erate %.4f score %8.2f - absdiff %8.2f percdiff %8.4f\n", tig->id(), rdAid, rdAlo, rdAhi, rdBid, b3->fragId(), len3, b3->erate(), score3, len, ovl[oo].erate, score, ad3, pd3); continue; } // Potential confusion! if (ovl5 == true) writeLog("tig %7u read %8u pos %7u-%-7u IS confused by 5' edge to read %8u - best edge read %8u len %6u erate %.4f score %8.2f - alt edge len %6u erate %.4f score %8.2f - absdiff %8.2f percdiff %8.4f\n", tig->id(), rdAid, rdAlo, rdAhi, rdBid, b5->fragId(), len5, b5->erate(), score5, len, ovl[oo].erate, score, ad5, pd5); if (ovl3 == true) writeLog("tig %7u read %8u pos %7u-%-7u IS confused by 3' edge to read %8u - best edge read %8u len %6u erate %.4f score %8.2f - alt edge len %6u erate %.4f score %8.2f - absdiff %8.2f percdiff %8.4f\n", tig->id(), rdAid, rdAlo, rdAhi, rdBid, b3->fragId(), len3, b3->erate(), score3, len, ovl[oo].erate, score, ad3, pd3); isConfused[ri]++; } } // Over all marks (ri) } // Over all reads (fi) // Scan all the regions, and delete any that have no confusion. 
{ bool discarded = false; for (uint32 ri=0; ri<tigMarksR.numberOfIntervals(); ri++) { if (isConfused[ri] == 0) { writeLog("discard region %8d:%-8d - no confusion in best edges\n", tigMarksR.lo(ri), tigMarksR.hi(ri)); tigMarksR.lo(ri) = 0; tigMarksR.hi(ri) = 0; discarded = true; } else { writeLog("saved region %8d:%-8d - %u best edges are potentially confused\n", tigMarksR.lo(ri), tigMarksR.hi(ri), isConfused[ri]); } } if (discarded) tigMarksR.filterShort(1); } delete [] isConfused; // Scan reads, join any marks that have their junctions spanned by a sufficiently large amount. // // If the read spans this junction be the usual amount, merge the intervals. // // The intervals can be overlapping (by up to REPEAT_OVERLAP_MIN (x2?) bases. For this junction // to be spanned, the read must span from min-ROM to max+ROM, not just hi(ri-1) to lo(ri). // // We DO need to filterShort() after every merge, otherwise, we'd have an empty bogus interval // in the middle of our list, which could be preventing some other merge. OK, we could // // Anything that gets merged is now no longer a true repeat. It's unique, just bordered by repeats. // We can't track this through the indices (because we delete things). We track it with a set of // begin coordinates. set<int32> nonRepeatIntervals; writeLog("Scan reads to merge repeat regions.\n"); for (uint32 fi=0; fi<tig->ufpath.size(); fi++) { ufNode *frg = &tig->ufpath[fi]; bool frgfwd = (frg->position.bgn < frg->position.end); int32 frglo = (frgfwd) ? frg->position.bgn : frg->position.end; int32 frghi = (frgfwd) ? 
frg->position.end : frg->position.bgn; bool merged = false; for (uint32 ri=1; ri<tigMarksR.numberOfIntervals(); ri++) { uint32 rMin = min(tigMarksR.hi(ri-1), tigMarksR.lo(ri)); uint32 rMax = max(tigMarksR.hi(ri-1), tigMarksR.lo(ri)); if ((frglo + MIN_ANCHOR_HANG <= rMin) && (rMax + MIN_ANCHOR_HANG <= frghi)) { writeLog("merge regions %8d:%-8d and %8d:%-8d - junction contained in read %6u %5d-%5d\n", tigMarksR.lo(ri-1), tigMarksR.hi(ri-1), tigMarksR.lo(ri), tigMarksR.hi(ri), frg->ident, frglo, frghi); tigMarksR.lo(ri) = tigMarksR.lo(ri-1); tigMarksR.lo(ri-1) = 0; // CRITICAL to delete this interval (and not ri) because the next tigMarksR.hi(ri-1) = 0; // iteration will be using ri-1 (== ri here) and ri (== ri+1). merged = true; nonRepeatIntervals.insert(tigMarksR.lo(ri)); } } if (merged) tigMarksR.filterShort(1); } // Extend the regions by MIN_ANCHOR_HANG. This makes checking for reads that span and are // anchored in the next region easier. It also solved a quirk when the first/last repeat // region doesn't extend to the end of the sequence: // 0-183 unique (created from inversion below, but useless and incorrect) // 183-9942 repeat for (uint32 ii=0; ii<tigMarksR.numberOfIntervals(); ii++) { tigMarksR.lo(ii) = max<int32>(tigMarksR.lo(ii) - MIN_ANCHOR_HANG, 0); tigMarksR.hi(ii) = min<int32>(tigMarksR.hi(ii) + MIN_ANCHOR_HANG, tig->getLength()); } // Find the non-repeat intervals. tigMarksU = tigMarksR; tigMarksU.invert(0, tig->getLength()); // Create the list of intervals we'll use to make new unitigs. // // The repeat intervals are extended by MIN_ANCHOR_HANG, and then any read fully contained in one of // these is moved here. // // The non-repeat intervals are shortened by the same amount, and any read that intersects one // is moved there. // // Does order matter? Not sure. The repeat intervals are first, then the formerly repeat // merged intervals, then the unique intervals. Splitting might depend on the repeats being // first. 
writeLog("Make breakpoints.\n"); vector<breakPointCoords> BP; for (uint32 ii=0; ii<tigMarksR.numberOfIntervals(); ii++) if (nonRepeatIntervals.count(tigMarksR.lo(ii)) == 0) BP.push_back(breakPointCoords(ti, tigMarksR.lo(ii), tigMarksR.hi(ii), true)); for (uint32 ii=0; ii<tigMarksR.numberOfIntervals(); ii++) if (nonRepeatIntervals.count(tigMarksR.lo(ii)) != 0) BP.push_back(breakPointCoords(ti, tigMarksR.lo(ii), tigMarksR.hi(ii), true)); for (uint32 ii=0; ii<tigMarksU.numberOfIntervals(); ii++) { BP.push_back(breakPointCoords(ti, tigMarksU.lo(ii), tigMarksU.hi(ii), false)); } // If only one region, the whole unitig was declared repeat. Nothing to do. if (BP.size() == 1) continue; sort(BP.begin(), BP.end()); // Report. writeLog("break tig %u into up to %u pieces:\n", ti, BP.size()); for (uint32 ii=0; ii<BP.size(); ii++) writeLog(" %8d %8d %s (length %d)\n", BP[ii]._bgn, BP[ii]._end, BP[ii]._isRepeat ? "repeat" : "unique", BP[ii]._end - BP[ii]._bgn); // Scan the reads, counting the number of reads that would be placed in each new tig. This is done // because there are a few 'splits' that don't move any reads around. Unitig **newTigs = new Unitig * [BP.size()]; int32 *lowCoord = new int32 [BP.size()]; uint32 *nRepeat = new uint32 [BP.size()]; uint32 *nUnique = new uint32 [BP.size()]; // First call, count the number of tigs we would create if we let it create them. uint32 nTigs = splitUnitigs(unitigs, tig, BP, newTigs, lowCoord, nRepeat, nUnique, false); // Second call, actually create the tigs, if anything would change. if (nTigs > 1) splitUnitigs(unitigs, tig, BP, newTigs, lowCoord, nRepeat, nUnique, true); // Report the tigs created. for (uint32 ii=0; ii<BP.size(); ii++) { int32 rgnbgn = BP[ii]._bgn; int32 rgnend = BP[ii]._end; bool repeat = BP[ii]._isRepeat; if (nRepeat[ii] + nUnique[ii] == 0) writeLog("For tig %5u %s region %8d %8d - %6u/%6u repeat/unique reads - no new unitig created.\n", ti, (repeat == true) ? 
"repeat" : "unique", rgnbgn, rgnend, nRepeat[ii], nUnique[ii]); else if (nTigs > 1) writeLog("For tig %5u %s region %8d %8d - %6u/%6u reads repeat/unique - unitig %5u created.\n", ti, (repeat == true) ? "repeat" : "unique", rgnbgn, rgnend, nRepeat[ii], nUnique[ii], newTigs[ii]->id()); else writeLog("For tig %5u %s region %8d %8d - %6u/%6u repeat/unique reads - unitig %5u remains unchanged.\n", ti, (repeat == true) ? "repeat" : "unique", rgnbgn, rgnend, nRepeat[ii], nUnique[ii], tig->id()); } // Cleanup. delete [] newTigs; delete [] lowCoord; delete [] nRepeat; delete [] nUnique; // Remove the old unitig....if we made new ones. if (nTigs > 1) { delete tig; unitigs[ti] = NULL; } } }