Example #1
void
reportUnitigs(UnitigVector &unitigs, const char *prefix, const char *name) {

  if (logFileFlagSet(LOG_INTERMEDIATE_UNITIGS) == 0)
    return;

  uint32  numFragsT  = 0;
  uint32  numFragsP  = 0;
  uint64  utgLen     = 0;

  //  Compute average frags per partition.
  for (uint32  ti=0; ti<unitigs.size(); ti++) {
    Unitig  *utg = unitigs[ti];

    if (utg == NULL)
      continue;

    numFragsT += utg->ufpath.size();

    if (utg->ufpath.size() > 2)
      utgLen    += utg->getLength();
  }

  if      (utgLen < 16 * 1024 * 1024)
    numFragsP = numFragsT / 7;
  else if (utgLen < 64 * 1024 * 1024)
    numFragsP = numFragsT / 63;
  else
    numFragsP = numFragsT / 127;

  char tigStorePath[FILENAME_MAX];
  sprintf(tigStorePath, "%s.%03u.%s.tigStore", prefix, logFileOrder, name);

  //  Failing to do this results in consensus running about 40 times slower.  Three hours instead of
  //  five minutes.
  setParentAndHang(unitigs);

  writeUnitigsToStore(unitigs, tigStorePath, tigStorePath, numFragsP, false);
}
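
The partition sizing above is only a threshold on the summed unitig length: a small assembly gets a few large partitions, a large assembly gets many small ones. As a standalone sketch (not part of bogart), the same heuristic isolated so the thresholds are easy to test:

#include <cstdint>
#include <cstdio>

//  Fragments-per-partition target, given the total fragment count and the
//  summed length of unitigs with more than two reads (as computed above).
static uint32_t
fragsPerPartition(uint32_t numFragsT, uint64_t utgLen) {
  if      (utgLen < UINT64_C(16) * 1024 * 1024)
    return(numFragsT / 7);      //  Small assembly: few, large partitions.
  else if (utgLen < UINT64_C(64) * 1024 * 1024)
    return(numFragsT / 63);
  else
    return(numFragsT / 127);    //  Large assembly: many, small partitions.
}

int
main(void) {
  printf("%u\n", fragsPerPartition(700000, UINT64_C(10)  * 1024 * 1024));   //  100000
  printf("%u\n", fragsPerPartition(700000, UINT64_C(100) * 1024 * 1024));   //  5511
  return(0);
}
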
Example #2
void
popMateBubbles(UnitigVector &unitigs) {
  uint32      nBubblePopped   = 0;
  uint32      nBubbleTooBig   = 0;
  uint32      nBubbleConflict = 0;

  writeLog("==> SEARCHING FOR MATE BUBBLES\n");

  //  For each unitig, if all (or most) of the external mates are to a single other unitig (not
  //  counting singletons), then this is a potential bubble popping unitig.
  //
  //  At present, this is exploratory only.

  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig        *tig = unitigs[ti];

    if ((tig == NULL) ||
        (tig->ufpath.size() == 0))
      //   No tig here.
      continue;

    if ((tig->getLength() > 1000) ||
        (tig->ufpath.size() >= 3000))
      //  Tig too big.
      continue;

    //if ((tig->getLength() < 150) ||
    //    (tig->ufpath.size() < 5))
    //  //  Tig too small.
    //  continue;

    uint32        *lkg    = new uint32 [tig->ufpath.size()];
    uint32         lkgLen = 0;
    uint32         lkgExt = 0;

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode *frg = &tig->ufpath[fi];
      int32         frgID = frg->ident;
      int32         matID = FI->mateIID(frgID);

      uint32        mtigID = 0;
      Unitig       *mtig   = 0L;

      if (matID == 0)
        //  No mate.
        continue;

      mtigID = tig->fragIn(matID);
      mtig   = unitigs[mtigID];

      if (mtigID == tig->id())
        //  Mate is not external.
        continue;

      lkgExt++;

      if (mtig->ufpath.size() < 2)
        //  Mate is in singleton.
        continue;

      lkg[lkgLen++] = mtigID;
    }

    if (lkgLen == 0)
      //  No external mates.
      continue;

    sort(lkg, lkg+lkgLen);

    uint32  last = lkg[0];
    uint32  lcnt = 1;

    for (uint32 i=1; i<lkgLen; i++) {
      if (last != lkg[i]) {
        if ((lcnt > 3))
          writeLog("popMateBubble()-- tig %d len %d might pop bubble in tig %u (%u mates in there out of %d external mates)\n",
                  tig->id(), tig->getLength(), last, lcnt, lkgExt);
        last = lkg[i];
        lcnt = 0;
      }

      lcnt++;
    }

    if ((lcnt > 3))
      writeLog("popMateBubble()-- tig %d len %d might pop bubble in tig %u (%u mates in there out of %d external mates)\n",
              tig->id(), tig->getLength(), last, lcnt, lkgExt);

    delete [] lkg;
  }
}
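
The reporting at the end of popMateBubbles() uses a small idiom: sort the collected tig IDs, walk the sorted array counting run lengths, and emit a run whenever the ID changes, plus once more after the loop for the final run. A standalone sketch of that idiom on made-up IDs (not bogart code):

#include <algorithm>
#include <cstdint>
#include <cstdio>

int
main(void) {
  uint32_t  lkg[]  = { 7, 3, 7, 7, 9, 3, 7 };   //  External tig IDs collected for one tig.
  uint32_t  lkgLen = sizeof(lkg) / sizeof(lkg[0]);

  std::sort(lkg, lkg + lkgLen);

  uint32_t  last = lkg[0];
  uint32_t  lcnt = 1;

  for (uint32_t i=1; i<lkgLen; i++) {
    if (last != lkg[i]) {
      printf("tig %u has %u mates\n", last, lcnt);   //  Emit the finished run.
      last = lkg[i];
      lcnt = 0;
    }
    lcnt++;
  }

  printf("tig %u has %u mates\n", last, lcnt);       //  Don't forget the last run.

  return(0);
}
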
Example #3
void
breakUnitigs(UnitigVector &unitigs,
             char         *output_prefix,
             bool          enableIntersectionBreaking) {

  writeLog("==> BREAKING UNITIGS.\n");

  intersectionList  *ilist = new intersectionList(unitigs);

  //  Stop when we've seen all current unitigs.  Replace tiMax
  //  in the for loop below with unitigs.size() to recursively
  //  split unitigs.

  uint32 tiMax = unitigs.size();

  for (uint32 ti=0; ti<tiMax; ti++) {
    Unitig             *tig = unitigs[ti];

    if (tig == NULL)
      continue;

    vector<breakPoint>   breaks;

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode             *frg   = &tig->ufpath[fi];
      intersectionPoint  *isect = ilist->getIntersection(frg->ident, 0);

      if (isect == NULL)
        continue;

      for (; isect->isectFrg == frg->ident; isect++) {
        assert(tig->id() == Unitig::fragIn(isect->isectFrg));

        //  Grab the invading unitig

        Unitig *inv = unitigs[Unitig::fragIn(isect->invadFrg)];
        assert(inv->id() == Unitig::fragIn(isect->invadFrg));

        //  Grab the best edges off the invading fragment.

        BestEdgeOverlap  *best5 = OG->getBestEdgeOverlap(isect->invadFrg, false);
        BestEdgeOverlap  *best3 = OG->getBestEdgeOverlap(isect->invadFrg, true);

        //  Check if the incoming tig is a spur, and we should just ignore it immediately

        if ((inv->ufpath.size() == 1) &&
            ((best5->fragId() == 0) ||
             (best3->fragId() == 0))) {
          if (logFileFlagSet(LOG_INTERSECTION_BREAKING))
            writeLog("unitig %d frag %d end %c' into unitig %d frag %d end %c' -- IS A SPUR, skip it\n",
                    inv->id(), isect->invadFrg, isect->invad3p ? '3' : '5',
                    tig->id(), isect->isectFrg, isect->isect3p ? '3' : '5');
          continue;
        }

        //  Keep only significant intersections

        if ((inv->getLength()   > MIN_BREAK_LENGTH) &&
            (inv->ufpath.size() > MIN_BREAK_FRAGS)) {
          if (logFileFlagSet(LOG_INTERSECTION_BREAKING))
            writeLog("unitig %d frag %d end %c' into unitig %d frag %d end %c'\n",
                    inv->id(), isect->invadFrg, isect->invad3p ? '3' : '5',
                    tig->id(), isect->isectFrg, isect->isect3p ? '3' : '5');
          breaks.push_back(breakPoint(isect->isectFrg, isect->isect3p, true, false));
        }
      }  //  Over all incoming fragments

      //  If this is the last fragment, terminate the break point list with a 'fakeEnd' (in AS_BAT_Breaking.cc) break point
      //  at the end of the unitig.

      if ((fi+1 == tig->ufpath.size()) &&
          (breaks.size() > 0)) {
        breaks.push_back(breakPoint(frg->ident, (frg->position.bgn < frg->position.end), true, false));
      }
    }  //  Over all fragments in the unitig


    if (breaks.size() == 0)
      continue;

    //  Report where breaks occur.
#if 0
    //  We've lost the fields in breaks[i] -- but the reports above aren't updated yet.
    if (logFileFlagSet(LOG_INTERSECTION_BREAKING) ||
        logFileFlagSet(LOG_MATE_SPLIT_COVERAGE_PLOT))
      for (uint32 i=0; i<breaks.size(); i++)
        writeLog("BREAK unitig %d at position %d,%d from inSize %d inFrags %d.\n",
                tig->id(),
                breaks[i].fragPos.bgn,
                breaks[i].fragPos.end,
                breaks[i].inSize,
                breaks[i].inFrags);
#endif

    //  Actually do the breaking.
    if (enableIntersectionBreaking)
      breakUnitigAt(unitigs, tig, breaks, true);

    breaks.clear();
  }  //  Over all unitigs
}
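
Only significant intersections become break points: spurs are skipped outright, and the invading unitig must exceed both a length and a read-count threshold. A minimal sketch of that filter; the threshold values below are illustrative placeholders, the real MIN_BREAK_LENGTH and MIN_BREAK_FRAGS are defined elsewhere in bogart:

#include <cstdint>
#include <cstdio>

//  Illustrative thresholds only; not the values bogart uses.
#define MIN_BREAK_LENGTH  500
#define MIN_BREAK_FRAGS   3

//  True if the invading unitig is big enough to justify a break point.
static bool
isSignificantIntersection(uint32_t invLength, uint32_t invFrags, bool invIsSpur) {
  if (invIsSpur)                        //  Singleton spurs are ignored outright.
    return(false);

  return((invLength > MIN_BREAK_LENGTH) &&
         (invFrags  > MIN_BREAK_FRAGS));
}

int
main(void) {
  printf("%d\n", isSignificantIntersection(12000, 40, false));   //  1
  printf("%d\n", isSignificantIntersection(  300,  2, false));   //  0
  printf("%d\n", isSignificantIntersection(12000, 40, true));    //  0
  return(0);
}
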
Example #4
void
writeUnitigsToStore(UnitigVector  &unitigs,
                    char          *fileprefix,
                    char          *tigStorePath,
                    uint32         frg_count_target,
                    bool           isFinal) {
  uint32      utg_count              = 0;
  uint32      frg_count              = 0;
  uint32      prt_count              = 1;
  char        filename[FILENAME_MAX] = {0};
  uint32     *partmap                = new uint32 [unitigs.size()];

  //  This code closely follows that in AS_CGB_unitigger.c::output_the_chunks()

  if (isFinal)
    checkUnitigMembership(unitigs);

  // Open up the initial output file

  sprintf(filename, "%s.iidmap", fileprefix);
  FILE *iidm = fopen(filename, "w");
  assert(NULL != iidm);

  sprintf(filename, "%s.partitioning", fileprefix);
  FILE *part = fopen(filename, "w");
  assert(NULL != part);

  sprintf(filename, "%s.partitioningInfo", fileprefix);
  FILE *pari = fopen(filename, "w");
  assert(NULL != pari);

  //  Step through all the unitigs once to build the partition mapping and IID mapping.

  memset(partmap, 0xff, sizeof(uint32) * unitigs.size());

  for (uint32 iumiid=0, ti=0; ti<unitigs.size(); ti++) {
    Unitig  *utg = unitigs[ti];
    uint32   nf  = (utg) ? utg->getNumFrags() : 0;

    if ((utg == NULL) || (nf == 0))
      continue;

    assert(utg->getLength() > 0);
    assert(nf == utg->ufpath.size());

    if ((frg_count + nf >= frg_count_target) &&
        (frg_count      >  0)) {
      fprintf(pari, "Partition %d has %d unitigs and %d fragments.\n",
              prt_count, utg_count, frg_count);

      prt_count++;
      utg_count = 0;
      frg_count = 0;
    }

    uint32 tigid = (isFinal) ? iumiid : ti;

    assert(tigid < unitigs.size());
    partmap[tigid] = prt_count;

    fprintf(iidm, "Unitig "F_U32" == IUM "F_U32" (in partition "F_U32" with "F_U32" frags)\n",
            utg->id(),
            (tigid),
            partmap[(tigid)],
            nf);

    for (uint32 fragIdx=0; fragIdx<nf; fragIdx++) {
      ufNode  *f = &utg->ufpath[fragIdx];

      fprintf(part, "%d\t%d\n", prt_count, f->ident);
    }

    utg_count += 1;
    frg_count += nf;

    iumiid++;
  }

  fprintf(pari, "Partition %d has %d unitigs and %d fragments.\n",
          prt_count, utg_count, frg_count);

  fclose(pari);
  fclose(part);
  fclose(iidm);

  //  Step through all the unitigs again, converting each to a tgTig and inserting it into the tigStore.

  tgStore     *tigStore = new tgStore(tigStorePath);
  tgTig       *tig      = new tgTig;

  for (uint32 iumiid=0, ti=0; ti<unitigs.size(); ti++) {
    Unitig  *utg = unitigs[ti];
    uint32   nf  = (utg) ? utg->getNumFrags() : 0;

    if ((utg == NULL) || (nf == 0))
      continue;

    unitigToTig(tig, (isFinal) ? iumiid : ti, utg);

    tigStore->insertTig(tig, false);

    iumiid++;
  }

  delete    tig;
  delete    tigStore;
  delete [] partmap;
}
Example #5
void
writeUnitigsToStore(UnitigVector  &unitigs,
                    char          *fileprefix,
                    char          *tigStorePath,
                    uint32         frg_count_target,
                    bool           isFinal) {
  uint32      utg_count              = 0;
  uint32      frg_count              = 0;
  uint32      prt_count              = 1;
  char        filename[FILENAME_MAX] = {0};
  uint32     *partmap                = new uint32 [unitigs.size()];

  //  This code closely follows that in AS_CGB_unitigger.c::output_the_chunks()

  if (isFinal)
    checkUnitigMembership(unitigs);

  // Open up the initial output file

  sprintf(filename, "%s.iidmap", fileprefix);
  FILE *iidm = fopen(filename, "w");
  assert(NULL != iidm);

  sprintf(filename, "%s.partitioning", fileprefix);
  FILE *part = fopen(filename, "w");
  assert(NULL != part);

  sprintf(filename, "%s.partitioningInfo", fileprefix);
  FILE *pari = fopen(filename, "w");
  assert(NULL != pari);

  //  Step through all the unitigs once to build the partition mapping and IID mapping.

  tgStore     *tigStore = new tgStore(tigStorePath);
  tgTig       *tig      = new tgTig;

  for (uint32 tigID=0, ti=0; ti<unitigs.size(); ti++) {
    Unitig  *utg = unitigs[ti];

    if ((utg == NULL) || (utg->getNumFrags() == 0))
      continue;

    assert(utg->getLength() > 0);

    //  Convert the bogart tig to a tgTig and save to the store.

    unitigToTig(tig, (isFinal) ? tigID : ti, utg);
    tigID++;

    tigStore->insertTig(tig, false);

    //  Increment the partition if the current one is too large.

    if ((frg_count + utg->getNumFrags() >= frg_count_target) &&
        (frg_count                      >  0)) {
      fprintf(pari, "Partition %d has %d unitigs and %d fragments.\n",
              prt_count, utg_count, frg_count);

      prt_count++;
      utg_count = 0;
      frg_count = 0;
    }

    //  Note that the tig is included in this partition.

    utg_count += 1;
    frg_count += utg->getNumFrags();

    //  Map the tig to a partition, and log both the tig-to-partition map and the partition-to-read map.

    fprintf(iidm, "bogart "F_U32" -> tig "F_U32" (in partition "F_U32" with "F_U32" frags)\n",
            utg->id(),
            utg->tigID(),
            prt_count,
            utg->getNumFrags());

    for (uint32 fragIdx=0; fragIdx<utg->getNumFrags(); fragIdx++)
      fprintf(part, "%d\t%d\n", prt_count, utg->ufpath[fragIdx].ident);
  }

  fprintf(pari, "Partition %d has %d unitigs and %d fragments.\n",   //  Don't forget to log the last partition!
          prt_count, utg_count, frg_count);

  fclose(pari);
  fclose(part);
  fclose(iidm);

  delete    tig;
  delete    tigStore;
}
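
Both versions of writeUnitigsToStore() partition greedily: fragment counts accumulate into the current partition, and a new partition is opened whenever adding the next tig would reach the per-partition target, with the last partition reported after the loop. A standalone sketch of the rule on toy counts (not bogart code):

#include <cstdint>
#include <cstdio>
#include <vector>

int
main(void) {
  std::vector<uint32_t>  tigFrags = { 40, 10, 80, 30, 30, 90 };   //  Fragments per tig.
  uint32_t               target   = 100;                          //  frg_count_target.

  uint32_t  prt_count = 1;
  uint32_t  frg_count = 0;

  for (uint32_t nf : tigFrags) {
    if ((frg_count + nf >= target) && (frg_count > 0)) {   //  Close the current partition.
      printf("partition %u has %u fragments\n", prt_count, frg_count);
      prt_count++;
      frg_count = 0;
    }
    frg_count += nf;
  }

  printf("partition %u has %u fragments\n", prt_count, frg_count);   //  The last partition.

  return(0);
}
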
Example #6
//  Examine the first (few?) fragments of a unitig and evaluate if they indicate a join should be made.
static
bool
joinUnitigs_examineEnd(UnitigVector      &unitigs,
                       Unitig            *fr,
                       uint32             idx,
                       bool               frFirstEnd,
                       vector<joinEntry> &joins) {
  uint32           frgIdx  = (frFirstEnd) ? (idx) : (fr->ufpath.size() - 1 - idx);
  ufNode          *frg     = &fr->ufpath[frgIdx];
  bool             frgRev  = (frg->position.end < frg->position.bgn);

  //  Grab the best edge for this end frag.  The last arg requests the 3' end if true.
  //
  //  If we're looking at the first read, we want to get:
  //    5' - if the frag is forward
  //    3' - if the frag is reverse (frgRev == true)
  //
  //  If we're looking at the last read, we want to get:
  //    5' - if the frag is reverse
  //    3' - if the frag is forward  (frgRev == false)
  //
  BestEdgeOverlap *bestEdge    = OG->getBestEdgeOverlap(frg->ident, (frgRev == frFirstEnd));

  uint32      tgtId = bestEdge->fragId();
  bool        tgt3p = bestEdge->frag3p();

  if (tgtId == 0)
    //  No best edge?  Skip it.
    return(false);

  //  Grab the unitig for that best edge.

  uint32   toID  = fr->fragIn(tgtId);
  Unitig  *to    = unitigs[toID];

  if (to->ufpath.size() == 1)
    //  Joining to something teeny?  Don't bother checking further.
    return(false);

  if (to->id() == fr->id())
    //  Join to myself?  Nope.
    return(false);

  //  Grab the read we have an edge to, and compute the overlapping length and the leftover length.

  ufNode  *tgt    = &to->ufpath[to->pathPosition(tgtId)];
  bool     tgtRev = (tgt->position.end < tgt->position.bgn);

  //  If tgt3p (we overlap to the 3' end) is the same as tgtRev (read is reverse) then the unitig is oriented
  //  correctly.  Otherwise, positions need to be reverse-complemented.


  bool     toFlip = false;

  if ((frFirstEnd == true) && (tgt3p == false) && (tgtRev == false))
    //  source read is at the start, overlap to 5' and the read is forward, need to flip the target unitig
    toFlip = true;

  if ((frFirstEnd == true) && (tgt3p == true) && (tgtRev == true))
    //  source read is at the start, overlap to 3' and the read is reverse, need to flip the target unitig
    toFlip = true;


  if ((frFirstEnd == false) && (tgt3p == false) && (tgtRev == true))
    //  source read is at the end, overlap to 5' and the read is reverse, need to flip the target unitig
    toFlip = true;

  if ((frFirstEnd == false) && (tgt3p == true) && (tgtRev == false))
    //  source read is at the end, overlap to 3' and the read is forward, need to flip the target unitig
    toFlip = true;


  uint32   toMin = MIN(tgt->position.bgn, tgt->position.end);
  uint32   toMax = MAX(tgt->position.bgn, tgt->position.end);
  uint32   toLen = to->getLength();
  uint32   frLen = fr->getLength();

  if (toFlip) {
    toMin = toLen - MAX(tgt->position.bgn, tgt->position.end);
    toMax = toLen - MIN(tgt->position.bgn, tgt->position.end);
  }

  assert(toMin < toMax);

  //  Our two unitigs are of length frLen and toLen.  We are appending some portion of 'to' onto
  //  'fr', and 'discarding' the rest.  If the 'discarded' piece is larger than the 'fr' unitig, we
  //  don't want to do the join.
  //
  //  We err on the side of the discarded piece.

  uint32   joinLen = 0;
  uint32   discLen = 0;

  if (frFirstEnd == true) {
    joinLen = toMin + frLen;  //  Prepend the start of 'to' onto 'fr'.
    discLen = toLen - toMin;

  } else {
    joinLen = frLen + toLen - toMax;  //  Append the end of 'to' onto 'fr'.
    discLen = toMax;
  }

  //  If the discard is bigger than us, we do damage by joining.

  if (discLen > frLen)
    return(false);

  //  The joined should be much larger and the discarded much smaller.

  uint32    maxLen = MAX(frLen, toLen);
  uint32    minLen = MIN(frLen, toLen);

  double    joinChange = (double)joinLen / maxLen;
  double    discChange = (double)discLen / minLen;

  bool      isBad = false;

  if ((joinChange < 1.10) ||
      (0.75       < discChange))
    //  Bad if we didn't really change sizes.
    isBad = true;

  if ((1.0        < joinChange) &&
      (discChange < 0.5))
    //  But good if discard is tiny.  This occurs if we merge a small with a big.  The join change
    //  is somewhat small (1.05 say) yet most of the smaller unitig is used.
    isBad = false;

  if (isBad) {
    writeLog("joinUnitigs_examineEnd()-- join unitig %6u (%7ubp) frag %6u %s <-> unitig %6u (%7ubp) frag %6u %s <-> length %5.2f %7u and %5.2f %7u BAD\n",
             fr->id(), fr->getLength(), frg->ident, (frgRev) ? "rev" : "fwd",
             to->id(), to->getLength(), tgt->ident, (tgtRev) ? "rev" : "fwd",
             joinChange, joinLen,
             discChange, discLen);
    return(false);
  }

  //  OK, join.

  writeLog("joinUnitigs_examineEnd()-- join unitig %6u (%7ubp) frag %6u %s <-> unitig %6u (%7ubp) frag %6u %s <-> length %5.2f %7u and %5.2f %7u\n",
           fr->id(), fr->getLength(), frg->ident, (frgRev) ? "rev" : "fwd",
           to->id(), to->getLength(), tgt->ident, (tgtRev) ? "rev" : "fwd",
           joinChange, joinLen,
           discChange, discLen);

  joins.push_back(joinEntry(frg->ident, frFirstEnd, tgt->ident, toFlip, joinLen));

  return(true);
}
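
The accept/reject decision at the end of joinUnitigs_examineEnd() reduces to four numbers: the two unitig lengths, the length of the joined product, and the length of the discarded piece. A hedged standalone restatement of that test, using local names rather than the bogart API:

#include <algorithm>
#include <cstdint>
#include <cstdio>

static bool
joinLooksGood(uint32_t frLen, uint32_t toLen, uint32_t joinLen, uint32_t discLen) {
  if (discLen > frLen)                     //  Discarded piece bigger than us: join does damage.
    return(false);

  double  joinChange = (double)joinLen / std::max(frLen, toLen);
  double  discChange = (double)discLen / std::min(frLen, toLen);

  bool    isBad = false;

  if ((joinChange < 1.10) || (0.75 < discChange))   //  Barely grew, or discarded too much.
    isBad = true;

  if ((1.0 < joinChange) && (discChange < 0.5))     //  ...unless the discard is tiny.
    isBad = false;

  return(isBad == false);
}

int
main(void) {
  printf("%d\n", joinLooksGood(50000, 20000, 68000,  2000));   //  1: big join, small discard.
  printf("%d\n", joinLooksGood(50000, 48000, 52000, 40000));   //  0: barely grew, big discard.
  return(0);
}
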
void
reportUnitigs(UnitigVector &unitigs, const char *prefix, const char *name, uint64 genomeSize) {

  //  Generate n50.  Assumes unitigs have been 'classified' already.

  vector<uint32>   unassembledLength;
  vector<uint32>   bubbleLength;
  vector<uint32>   repeatLength;
  vector<uint32>   circularLength;
  vector<uint32>   contigLength;

  for (uint32  ti=0; ti<unitigs.size(); ti++) {
    Unitig  *utg = unitigs[ti];

    if (utg == NULL)
      continue;

    if (utg->_isUnassembled) {
      unassembledLength.push_back(utg->getLength());
    }

    else if (utg->_isBubble) {
      bubbleLength.push_back(utg->getLength());
    }

    else if (utg->_isRepeat) {
      repeatLength.push_back(utg->getLength());
    }

    else if (utg->_isCircular) {
      circularLength.push_back(utg->getLength());
    }

    else {
      contigLength.push_back(utg->getLength());
    }
  }

  char   N[FILENAME_MAX];

  sprintf(N, "%s.sizes", getLogFilePrefix());

  errno = 0;
  FILE *F = fopen(N, "w");
  if (errno == 0) {
    reportN50(F, unassembledLength, "UNASSEMBLED", genomeSize);
    reportN50(F, bubbleLength,      "BUBBLE",      genomeSize);
    reportN50(F, repeatLength,      "REPEAT",      genomeSize);
    reportN50(F, circularLength,    "CIRCULAR",    genomeSize);
    reportN50(F, contigLength,      "CONTIGS",     genomeSize);

    fclose(F);
  }

  if (logFileFlagSet(LOG_INTERMEDIATE_UNITIGS) == 0)
    return;

  //  Dump to an intermediate store.

  char tigStorePath[FILENAME_MAX];
  sprintf(tigStorePath, "%s.tigStore", getLogFilePrefix());

  fprintf(stderr, "Creating intermediate tigStore '%s'\n", tigStorePath);

  uint32  numFragsT  = 0;
  uint32  numFragsP  = 0;
  uint64  utgLen     = 0;

  //  Compute average frags per partition.

  for (uint32  ti=0; ti<unitigs.size(); ti++) {
    Unitig  *utg = unitigs[ti];

    if (utg == NULL)
      continue;

    numFragsT += utg->ufpath.size();

    if (utg->ufpath.size() > 2)
      utgLen    += utg->getLength();
  }

  if      (utgLen < 16 * 1024 * 1024)
    numFragsP = numFragsT / 7;
  else if (utgLen < 64 * 1024 * 1024)
    numFragsP = numFragsT / 63;
  else
    numFragsP = numFragsT / 127;

  //  Dump the unitigs to an intermediate store.

  setParentAndHang(unitigs);

  writeUnitigsToStore(unitigs, tigStorePath, tigStorePath, numFragsP, false);
}
//  Decides if a unitig is unassembled.  The other classifications (isBubble, isCircular, isRepeat)
//  are made when the type is processed (e.g., when bubbles are popped).
//
//  A unitig is unassembled if:
//    1) it has fewer than R reads (R=2)
//    2) it is shorter than S bases (S=1000)
//    3) a single read spans at least fraction F of the length (F=1.0)
//    4) at least fraction F of the unitig is below read depth D (F=1.0, D=2)
//
void
classifyUnitigsAsUnassembled(UnitigVector &unitigs,
                             uint32        fewReadsNumber,
                             uint32        tooShortLength,
                             double        spanFraction,
                             double        lowcovFraction,   uint32  lowcovDepth) {
  uint32  nTooFew   = 0;
  uint32  nShort    = 0;
  uint32  nSingle   = 0;
  uint32  nCoverage = 0;
  uint32  nContig   = 0;

  uint64  bTooFew   = 0;
  uint64  bShort    = 0;
  uint64  bSingle   = 0;
  uint64  bCoverage = 0;
  uint64  bContig   = 0;

  char   N[FILENAME_MAX];

  sprintf(N, "%s.unassembled", getLogFilePrefix());

  errno = 0;
  FILE *F = fopen(N, "w");
  if (errno)
    F = NULL;

  for (uint32  ti=0; ti<unitigs.size(); ti++) {
    Unitig  *utg = unitigs[ti];

    if (utg == NULL)
      continue;

    utg->_isUnassembled = false;

    //  Rule 1.  Too few reads.

    if (utg->ufpath.size() < fewReadsNumber) {
      fprintf(F, "unitig "F_U32" unassembled - too few reads ("F_U64" < "F_U32")\n", ti, utg->ufpath.size(), fewReadsNumber);
      utg->_isUnassembled = true;
      nTooFew += 1;
      bTooFew += utg->getLength();
      continue;
    }

    //  Rule 2.  Short.

    if (utg->getLength() < tooShortLength) {
      fprintf(F, "unitig "F_U32" unassembled - too short ("F_U32" < "F_U32")\n", ti, utg->getLength(), tooShortLength);
      utg->_isUnassembled = true;
      nShort += 1;
      bShort += utg->getLength();
      continue;
    }

    //  Rule 3.  Single read spans large fraction of tig.

    for (uint32 oi=0; oi<utg->ufpath.size(); oi++) {
      ufNode  *frg = &utg->ufpath[oi];

      int frgbgn = MIN(frg->position.bgn, frg->position.end);
      int frgend = MAX(frg->position.bgn, frg->position.end);

      if (frgend - frgbgn > utg->getLength() * spanFraction) {
        fprintf(F, "unitig "F_U32" unassembled - single read spans unitig (read "F_U32" "F_U32"-"F_U32" spans fraction %f > %f\n",
                 ti, frg->ident, frg->position.bgn, frg->position.end, (double)(frgend - frgbgn) / utg->getLength(), spanFraction);
        utg->_isUnassembled = true;
        nSingle += 1;
        bSingle += utg->getLength();
        break;
      }
    }
    if (utg->_isUnassembled)
      continue;

    //  Rule 4.  Low coverage.

    intervalList<int32>  IL;

    for (uint32 oi=0; oi<utg->ufpath.size(); oi++) {
      ufNode  *frg = &utg->ufpath[oi];

      int frgbgn = MIN(frg->position.bgn, frg->position.end);
      int frgend = MAX(frg->position.bgn, frg->position.end);

      IL.add(frgbgn, frgend - frgbgn);
    }

    intervalList<int32>  ID(IL);

    uint32  basesLow  = 0;
    uint32  basesHigh = 0;

    for (uint32 ii=0; ii<ID.numberOfIntervals(); ii++)
      if (ID.depth(ii) < lowcovDepth)
        basesLow  += ID.hi(ii) - ID.lo(ii) + 1;
      else
        basesHigh += ID.hi(ii) - ID.lo(ii) + 1;

    double  lowcov = (double)basesLow / (basesLow + basesHigh);

    if (lowcov >= lowcovFraction) {
      fprintf(F, "Unitig "F_U32" unassembled - low coverage (%.4f > %.4f at < "F_U32"x coverage)\n",
               ti, lowcov, lowcovFraction, lowcovDepth);
      utg->_isUnassembled = true;
      nCoverage += 1;
      bCoverage += utg->getLength();
      continue;
    }

    //  Otherwise, unitig is assembled!

    nContig += 1;
    bContig += utg->getLength();
  }

  writeStatus("classifyAsUnassembled()-- %6u tigs %11lu bases -- too few reads\n",        nTooFew,   bTooFew);
  writeStatus("classifyAsUnassembled()-- %6u tigs %11lu bases -- too short\n",            nShort,    bShort);
  writeStatus("classifyAsUnassembled()-- %6u tigs %11lu bases -- single spanning read\n", nSingle,   bSingle);
  writeStatus("classifyAsUnassembled()-- %6u tigs %11lu bases -- low coverage\n",         nCoverage, bCoverage);
  writeStatus("classifyAsUnassembled()-- %6u tigs %11lu bases -- acceptable contigs\n",   nContig,   bContig);
}
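
Rule 4 above measures what fraction of the covered bases lie below a depth threshold, using Canu's intervalList to get per-interval depths. A rough standalone equivalent using a plain coordinate sweep (illustrative only, not the intervalList API):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

//  Fraction of covered bases whose read depth is below lowcovDepth.
static double
lowCoverageFraction(const std::vector<std::pair<int32_t,int32_t> > &reads, uint32_t lowcovDepth) {
  std::vector<std::pair<int32_t,int32_t> >  events;   //  (position, +1 open / -1 close)

  for (size_t ii=0; ii<reads.size(); ii++) {
    events.push_back(std::make_pair(std::min(reads[ii].first, reads[ii].second), +1));
    events.push_back(std::make_pair(std::max(reads[ii].first, reads[ii].second), -1));
  }

  std::sort(events.begin(), events.end());

  int64_t  basesLow = 0, basesHigh = 0;
  int32_t  depth = 0, prev = 0;

  for (size_t ii=0; ii<events.size(); ii++) {
    if (depth > 0) {                      //  Charge the span since the last event to its depth class.
      if ((uint32_t)depth < lowcovDepth)
        basesLow  += events[ii].first - prev;
      else
        basesHigh += events[ii].first - prev;
    }
    depth += events[ii].second;
    prev   = events[ii].first;
  }

  return((basesLow + basesHigh == 0) ? 0.0 : (double)basesLow / (basesLow + basesHigh));
}

int
main(void) {
  //  Two 600bp reads overlapping from 400 to 600; only that 200bp region is at 2x.
  std::vector<std::pair<int32_t,int32_t> >  reads;
  reads.push_back(std::make_pair(0, 600));
  reads.push_back(std::make_pair(400, 1000));

  printf("%.4f\n", lowCoverageFraction(reads, 2));   //  0.8000
  return(0);
}
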
Example #9
void
findPotentialBubbles(UnitigVector    &unitigs,
                     BubTargetList   &potentialBubbles) {
  uint32  tiLimit      = unitigs.size();
  uint32  tiNumThreads = omp_get_max_threads();
  uint32  tiBlockSize  = (tiLimit < 100000 * tiNumThreads) ? tiNumThreads : tiLimit / 99999;

  writeStatus("\n");
  writeStatus("bubbleDetect()-- working on "F_U32" unitigs, with "F_U32" threads.\n", tiLimit, tiNumThreads);

  for (uint32 ti=0; ti<tiLimit; ti++) {
    Unitig  *tig = unitigs[ti];

    if ((tig == NULL) ||               //  Not a tig, ignore it.
        (tig->ufpath.size() == 1))     //  Singleton, handled elsewhere.
      continue;

    uint32  nonContainedReads = 0;
    bool    validBubble       = true;

    map<uint32,uint32>  tigOlapsTo;

    uint32  fiLimit      = tig->ufpath.size();
    uint32  fiNumThreads = omp_get_max_threads();
    uint32  fiBlockSize  = (fiLimit < 100 * fiNumThreads) ? fiNumThreads : fiLimit / 99;

    for (uint32 fi=0; (validBubble == true) && (fi<fiLimit); fi++) {
      uint32      rid      = tig->ufpath[fi].ident;

      if (OG->isContained(rid) == true)  //  Don't need to check contained reads.  If their container
        continue;                        //  passes the tests below, the contained read will too.

      nonContainedReads++;

      uint32      ovlLen   = 0;
      BAToverlap *ovl      = OC->getOverlaps(rid, AS_MAX_ERATE, ovlLen);

      set<uint32>  readOlapsTo;

      for (uint32 oi=0; oi<ovlLen; oi++) {
        uint32  ovlTigID = Unitig::fragIn(ovl[oi].b_iid);
        Unitig *ovlTig   = unitigs[ovlTigID];

        //  Skip this overlap if it is to an unplaced read, to a singleton tig, to ourself,
        //  or to a unitig that is shorter than us.  We can not pop this tig as a bubble
        //  in any of those cases.

        if ((ovlTigID == 0) ||
            (ovlTig == NULL) ||
            (ovlTig->ufpath.size() == 1) ||
            (ovlTig->id() == tig->id()) ||
            (ovlTig->getLength() < tig->getLength()))
          continue;

        //  Otherwise, remember that we had an overlap to ovlTig.

        //writeLog("tig %u read %u overlap to tig %u read %u\n",
        //         tig->id(), rid, ovlTigID, ovl[oi].b_iid);

        readOlapsTo.insert(ovlTigID);
      }

      //writeLog("tig %8u read %8u has %u olaps\n", tig->id(), rid, readOlapsTo.size());

      //  Transfer the per-read counts to the per-unitig counts:  add one to the counter for each tig
      //  that we have overlaps to.

      for (set<uint32>::iterator it=readOlapsTo.begin(); it != readOlapsTo.end(); ++it)
        tigOlapsTo[*it]++;

      //  Decide if we're a valid potential bubble.  If tig id (in it->first) has overlaps to every
      //  read we've seen so far (nonContainedReads), we're still a valid bubble.
      //
      //  To _attempt_ to have differences in the bubble, we'll accept it if 3/4 of the reads
      //  have overlaps.

      validBubble = false;

      for (map<uint32,uint32>::iterator it=tigOlapsTo.begin(); it != tigOlapsTo.end(); ++it)
        if (it->second >= BUBBLE_READ_FRACTION * nonContainedReads)
          validBubble = true;

      //  If we've not seen that many reads, pretend it's a valid bubble.  It'll get screened out later.

      if (nonContainedReads < 16)
        validBubble = true;
    }

    //  If not validBubble, report.

#if 0
    if (validBubble == false) {
      writeLog("notValidBubble tig %8d expects %6u reads\n", tig->id(), nonContainedReads);

      for (map<uint32,uint32>::iterator it=tigOlapsTo.begin(); it != tigOlapsTo.end(); ++it)
        writeLog("  to tig %8u overlaps %6u\n", it->first, it->second);
    }
#endif

    //  If validBubble, then there is a tig that every dovetail read has at least one overlap to.
    //  Save those tigs in potentialBubbles.

    uint32  nTigs = 0;

    if (validBubble) {
      for (map<uint32,uint32>::iterator it=tigOlapsTo.begin(); it != tigOlapsTo.end(); ++it)
        if (it->second >= BUBBLE_READ_FRACTION * nonContainedReads)
          nTigs++;
    }

    //  ALWAYS log potential bubbles.

    if (nTigs > 0) {
      writeLog("\n");
      writeLog("potential bubble tig %8u length %9u nReads %7u to %3u tigs:\n",
               tig->id(), tig->getLength(), tig->ufpath.size(), nTigs);

      for (map<uint32,uint32>::iterator it=tigOlapsTo.begin(); it != tigOlapsTo.end(); ++it) {
        if (it->second >= BUBBLE_READ_FRACTION * nonContainedReads) {
          Unitig  *dest = unitigs[it->first];

          writeLog("                 tig %8u length %9u nReads %7u\n", dest->id(), dest->getLength(), dest->ufpath.size());

          potentialBubbles[ti].push_back(dest->id());
        }
      }
    }
  }

  flushLog();
}
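
The acceptance test in findPotentialBubbles() keeps a tig as a potential bubble if some other tig collects overlaps from at least BUBBLE_READ_FRACTION of its non-contained reads. A standalone sketch of just that counting step; the fraction used below is an assumed placeholder, the real constant is defined elsewhere in bogart:

#include <cstdint>
#include <cstdio>
#include <map>

//  Illustrative value only; not necessarily what bogart uses.
#define BUBBLE_READ_FRACTION  0.75

int
main(void) {
  //  tig id -> number of non-contained reads with at least one overlap to it.
  std::map<uint32_t,uint32_t>  tigOlapsTo;

  tigOlapsTo[12] = 9;
  tigOlapsTo[31] = 2;

  uint32_t  nonContainedReads = 10;

  for (std::map<uint32_t,uint32_t>::iterator it=tigOlapsTo.begin(); it != tigOlapsTo.end(); ++it)
    if (it->second >= BUBBLE_READ_FRACTION * nonContainedReads)
      printf("tig %u is a potential bubble target (%u of %u reads)\n",
             it->first, it->second, nonContainedReads);

  return(0);
}
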
Example #10
void
popBubbles(UnitigVector &unitigs,
           double deviationBubble) {

  BubTargetList   potentialBubbles;

  findPotentialBubbles(unitigs, potentialBubbles);

  writeStatus("popBubbles()-- Found "F_SIZE_T" potential bubbles.\n", potentialBubbles.size());

  //if (potentialBubbles.size() == 0)
  //  return;

  writeLog("\n");
  writeLog("Found "F_SIZE_T" potential bubbles.\n", potentialBubbles.size());
  writeLog("\n");

  vector<overlapPlacement>   *placed = findBubbleReadPlacements(unitigs, potentialBubbles, deviationBubble);

  //  We now have, in 'placed', a list of all the places that each read could be placed.  Decide if there is a _single_
  //  place for each bubble to be popped.

  uint32  tiLimit      = unitigs.size();
  //uint32  tiNumThreads = omp_get_max_threads();
  //uint32  tiBlockSize  = (tiLimit < 100000 * tiNumThreads) ? tiNumThreads : tiLimit / 99999;

  //  Clear flags.
  for (uint32 ti=0; ti<tiLimit; ti++) {
    if (unitigs[ti]) {
      unitigs[ti]->_isBubble = false;
      unitigs[ti]->_isRepeat = false;
    }
  }

  //  In parallel, process the placements.

  for (uint32 ti=0; ti<tiLimit; ti++) {
    if (potentialBubbles.count(ti) == 0)   //  Not a potential bubble
      continue;

    //  Scan the bubble, decide if there are _ANY_ read placements.  Log appropriately.

    Unitig  *bubble = unitigs[ti];
    bool     hasPlacements = false;

    for (uint32 fi=0; fi<bubble->ufpath.size(); fi++) {
      uint32  readID  = bubble->ufpath[fi].ident;

      if (placed[readID].size() > 0)
        hasPlacements = true;
    }

    if (hasPlacements == false)
      writeLog("potential bubble %u had no valid placements (all were not contained in target tig)\n", ti);
    else
      writeLog("potential bubble %u\n", ti);

    //  Split the placements into piles for each target and build an interval list for each target.
    //  For each read in the tig, convert the vector of placements into interval lists, one list per target tig.

    map<uint32, intervalList<uint32> *>  targetIntervals;

    for (uint32 fi=0; fi<bubble->ufpath.size(); fi++) {
      uint32  readID  = bubble->ufpath[fi].ident;

      for (uint32 pp=0; pp<placed[readID].size(); pp++) {
        uint32  tid = placed[readID][pp].tigID;

        assert(placed[readID][pp].frgID > 0);

        uint32  bgn = (placed[readID][pp].position.bgn < placed[readID][pp].position.end) ? placed[readID][pp].position.bgn : placed[readID][pp].position.end;
        uint32  end = (placed[readID][pp].position.bgn < placed[readID][pp].position.end) ? placed[readID][pp].position.end : placed[readID][pp].position.bgn;

        if (targetIntervals[tid] == NULL)
          targetIntervals[tid] = new intervalList<uint32>;

        //writeLog("read %u -> tig %u intervals %u-%u\n", readID, tid, bgn, end);

        targetIntervals[tid]->add(bgn, end-bgn);
      }
    }

    vector<candidatePop *>    targets;

    //  Squish the intervals.  Create new candidatePops for each interval that isn't too big or
    //  small.  Assign each overlapPlacements to the correct candidatePop.

    for (map<uint32, intervalList<uint32> *>::iterator it=targetIntervals.begin(); it != targetIntervals.end(); ++it) {
      uint32                 targetID = it->first;
      intervalList<uint32>  *IL       = it->second;

      IL->merge();

      //  Discard intervals that are significantly too small or large.  Save the ones that are
      //  nicely sized.  Logging here isn't terribly useful, it's just repeated (out of order) later
      //  when we try to make sense of the read alignments.

      for (uint32 ii=0; ii<IL->numberOfIntervals(); ii++) {
        if ((IL->hi(ii) - IL->lo(ii) < 0.75 * bubble->getLength()) ||   //  Too small!
            (1.25 * bubble->getLength() < IL->hi(ii) - IL->lo(ii))) {   //  Too big!
          writeLog("tig %8u length %9u -> target %8u piece %2u position %9u-%9u length %8u - size mismatch, discarded\n",
                   bubble->id(), bubble->getLength(),
                   targetID, ii, IL->lo(ii), IL->hi(ii), IL->hi(ii) - IL->lo(ii));
          continue;
        }

        writeLog("tig %8u length %9u -> target %8u piece %2u position %9u-%9u length %8u\n",
                 bubble->id(), bubble->getLength(),
                 targetID, ii, IL->lo(ii), IL->hi(ii), IL->hi(ii) - IL->lo(ii));

        targets.push_back(new candidatePop(bubble, unitigs[targetID], IL->lo(ii), IL->hi(ii)));
      }

      delete IL;
    }

    targetIntervals.clear();

    //  If no targets, nothing to do.

    if (targets.size() == 0)
      continue;

    //  Run through the placements again, and assign them to the correct target.
    //
    //  For each read:
    //  For each acceptable placement:
    //  For each target location:
    //  If the placement is for this target, save it.

    for (uint32 fi=0; fi<bubble->ufpath.size(); fi++) {
      uint32  readID  = bubble->ufpath[fi].ident;

      for (uint32 pp=0; pp<placed[readID].size(); pp++) {
        uint32  tid = placed[readID][pp].tigID;

        uint32  bgn = (placed[readID][pp].position.bgn < placed[readID][pp].position.end) ? placed[readID][pp].position.bgn : placed[readID][pp].position.end;
        uint32  end = (placed[readID][pp].position.bgn < placed[readID][pp].position.end) ? placed[readID][pp].position.end : placed[readID][pp].position.bgn;

        for (uint32 tt=0; tt<targets.size(); tt++)
          if ((targets[tt]->target->id() == tid) &&
              (targets[tt]->bgn < end) && (bgn < targets[tt]->end))
            targets[tt]->placed.push_back(placed[readID][pp]);
      }
    }

    //  Count the number of targets that have all the reads (later: in the correct order, etc, etc).  Remove those
    //  that don't.

    uint32  nTargets = 0;

    set<uint32>  tigReads;  //  Reads in the bubble tig.
    set<uint32>  tgtReads;  //  Reads in the bubble that have a placement in the target.

    //  Remove duplicate placements from each target.

    for (uint32 tt=0; tt<targets.size(); tt++) {
      candidatePop *t = targets[tt];

      //  Detect duplicates, keep the one with lower error.  There are a lot of duplicate
      //  placements, logging isn't terribly useful.

      for (uint32 aa=0; aa<t->placed.size(); aa++) {
        for (uint32 bb=0; bb<t->placed.size(); bb++) {
          if ((aa == bb) ||
              (t->placed[aa].frgID != t->placed[bb].frgID) ||
              (t->placed[aa].frgID == 0) ||
              (t->placed[bb].frgID == 0))
            continue;

          if (t->placed[aa].errors / t->placed[aa].aligned < t->placed[bb].errors / t->placed[bb].aligned) {
#ifdef SHOW_MULTIPLE_PLACEMENTS
            writeLog("duplicate read alignment for tig %u read %u - better %u-%u %.4f - worse %u-%u %.4f\n",
                     t->placed[aa].tigID, t->placed[aa].frgID,
                     t->placed[aa].position.bgn, t->placed[aa].position.end, t->placed[aa].errors / t->placed[aa].aligned,
                     t->placed[bb].position.bgn, t->placed[bb].position.end, t->placed[bb].errors / t->placed[bb].aligned);
#endif
            t->placed[bb] = overlapPlacement();
          } else {
#ifdef SHOW_MULTIPLE_PLACEMENTS
            writeLog("duplicate read alignment for tig %u read %u - better %u-%u %.4f - worse %u-%u %.4f\n",
                     t->placed[aa].tigID, t->placed[aa].frgID,
                     t->placed[bb].position.bgn, t->placed[bb].position.end, t->placed[bb].errors / t->placed[bb].aligned,
                     t->placed[aa].position.bgn, t->placed[aa].position.end, t->placed[aa].errors / t->placed[aa].aligned);
#endif
            t->placed[aa] = overlapPlacement();
          }
        }
      }

      //  Get rid of any now-empty entries.

      for (uint32 aa=t->placed.size(); aa--; ) {
        if (t->placed[aa].frgID == 0) {
          t->placed[aa] = t->placed.back();
          t->placed.pop_back();
        }
      }
    }

    //  Make a set of the reads in the bubble.  We'll compare each target against this to decide if all reads are placed.

    for (uint32 fi=0; fi<bubble->ufpath.size(); fi++)
      tigReads.insert(bubble->ufpath[fi].ident);

    uint32   nOrphan      = 0;   //  Full coverage; bubble can be popped.
    uint32   orphanTarget = 0;

    uint32   nBubble      = 0;   //  Partial coverage, bubble cannot be popped.
    uint32   bubbleTarget = 0;

    for (uint32 tt=0; tt<targets.size(); tt++) {
      tgtReads.clear();

      for (uint32 op=0; op<targets[tt]->placed.size(); op++) {
        if (logFileFlagSet(LOG_BUBBLE_DETAIL))
          writeLog("tig %8u length %9u -> target %8u piece %2u position %9u-%9u length %8u - read %7u at %9u-%9u\n",
                   bubble->id(), bubble->getLength(),
                   targets[tt]->target->id(), tt, targets[tt]->bgn, targets[tt]->end, targets[tt]->end - targets[tt]->bgn,
                   targets[tt]->placed[op].frgID,
                   targets[tt]->placed[op].position.bgn, targets[tt]->placed[op].position.end);

        assert(targets[tt]->placed[op].frgID > 0);
        tgtReads.insert(targets[tt]->placed[op].frgID);
      }

      //  Count the number of consecutive reads from the 5' or 3' end of the bubble that are placed
      //  in the target.
      //
      //  Also, count the number of reads in the bubble that are placed in the target.  Likely the
      //  same as n5 + n3.

      uint32  n5 = 0;
      uint32  n3 = 0;
      uint32  nt = 0;

      for (uint32 fi=0; fi<bubble->ufpath.size(); fi++)
        if (tgtReads.count(bubble->ufpath[fi].ident) > 0)
          n5++;
        else
          break;

      for (uint32 fi=bubble->ufpath.size(); fi-->0; )
        if (tgtReads.count(bubble->ufpath[fi].ident) > 0)
          n3++;
        else
          break;


      for (uint32 fi=0; fi<bubble->ufpath.size(); fi++)
        if (tgtReads.count(bubble->ufpath[fi].ident) > 0)
          nt++;


      //  Report now, before we nuke targets[tt] for being not a bubble!

      if ((nt == bubble->ufpath.size()) ||
          ((n5 > 0) && (n3 > 0)))
        writeLog("tig %8u length %9u -> target %8u piece %2u position %9u-%9u length %8u - expected %3"F_SIZE_TP" reads, had %3"F_SIZE_TP" reads.  n5=%3u n3=%3u nt=%3u\n",
                 bubble->id(), bubble->getLength(),
                 targets[tt]->target->id(), tt, targets[tt]->bgn, targets[tt]->end, targets[tt]->end - targets[tt]->bgn,
                 tigReads.size(),
                 tgtReads.size(), n5, n3, nt);

      //  Decide if this is a bubble, orphan from construction, or repeat.

      if (nt == bubble->ufpath.size()) {
        nOrphan++;
        orphanTarget = tt;
      }

      else if ((n5 > 0) && (n3 > 0)) {
        nBubble++;
        bubbleTarget = tt;
      }
    }

    //  If no placements, pbbbt.

    if (nOrphan + nBubble == 0) {
      //writeLog("tig %8u length %8u reads %6u had no bubble or orphan placements.\n", bubble->id(), bubble->getLength(), bubble->ufpath.size());
      continue;
    }

    //  If multiple orphan and/or bubble placements, it's a repeat.

    if (nOrphan + nBubble > 1) {
      writeLog("tig %8u length %8u reads %6u - repeat - %u orphan %u bubble placements.\n",
               bubble->id(), bubble->getLength(), bubble->ufpath.size(),
               nOrphan, nBubble);
      writeLog("\n");
      bubble->_isRepeat = true;
      continue;
    }

    //  If a bubble placement, mark it as a bubble so it can be skipped during repeat detection.

    if (nBubble > 0) {
      writeLog("tig %8u length %8u reads %6u - bubble\n",
               bubble->id(), bubble->getLength(), bubble->ufpath.size());
      writeLog("\n");
      bubble->_isBubble = true;
      continue;
    }

    //  Otherwise, it's an orphan, move the reads to the proper place.

    writeLog("tig %8u length %8u reads %6u - orphan\n", bubble->id(), bubble->getLength(), bubble->ufpath.size());

    for (uint32 op=0, tt=orphanTarget; op<targets[tt]->placed.size(); op++) {
      ufNode  frg;

      frg.ident        = targets[tt]->placed[op].frgID;
      frg.contained    = 0;
      frg.parent       = 0;
      frg.ahang        = 0;
      frg.bhang        = 0;
      frg.position.bgn = targets[tt]->placed[op].position.bgn;
      frg.position.end = targets[tt]->placed[op].position.end;

      writeLog("move read %u from tig %u to tig %u %u-%u\n",
               frg.ident,
               bubble->id(),
               targets[tt]->target->id(), frg.position.bgn, frg.position.end);

      targets[tt]->target->addFrag(frg, 0, false);
    }

    writeLog("\n");

    unitigs[bubble->id()] = NULL;
    delete bubble;
  }  //  Over all bubbles

  writeLog("\n");   //  Needed if no bubbles are popped.

  delete [] placed;

  //  Sort reads in all the tigs.  Overkill, but correct.

  for (uint32 ti=0; ti<tiLimit; ti++) {
    Unitig  *tig = unitigs[ti];

    if ((tig == NULL) ||               //  Not a tig, ignore it.
        (tig->ufpath.size() == 1))     //  Singleton, already sorted.
      continue;

    tig->sort();
  }
}
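
The duplicate-placement cleanup in popBubbles() blanks out the worse of two placements (frgID set to 0) and then removes the blanks with a backwards swap-with-back loop; iterating from the end guarantees every element swapped in from the back has already been examined. The same idiom on a toy vector (not bogart code):

#include <cstdint>
#include <cstdio>
#include <vector>

struct placement {
  uint32_t frgID;
};

int
main(void) {
  std::vector<placement>  placed = { {5}, {0}, {7}, {0}, {9} };

  for (uint32_t aa=placed.size(); aa--; )   //  Walk backwards; indices above aa are already checked,
    if (placed[aa].frgID == 0) {            //  so the element swapped in from the back is known good.
      placed[aa] = placed.back();
      placed.pop_back();
    }

  for (size_t aa=0; aa<placed.size(); aa++)
    printf("%u ", placed[aa].frgID);        //  5 9 7
  printf("\n");

  return(0);
}
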
Example #11
void
markRepeatReads(UnitigVector &unitigs,
                double        deviationRepeat,
                uint32        confusedAbsolute,
                double        confusedPercent) {
  uint32  tiLimit = unitigs.size();
  uint32  numThreads = omp_get_max_threads();
  uint32  blockSize = (tiLimit < 100000 * numThreads) ? numThreads : tiLimit / 99999;

  writeLog("repeatDetect()-- working on "F_U32" unitigs, with "F_U32" threads.\n", tiLimit, numThreads);

  vector<olapDat>      repeatOlaps;   //  Overlaps to reads promoted to tig coords

  intervalList<int32>  tigMarksR;     //  Marked repeats based on reads, filtered by spanning reads
  intervalList<int32>  tigMarksU;     //  Non-repeat intervals, just the inversion of tigMarksR


  for (uint32 ti=0; ti<tiLimit; ti++) {
    Unitig  *tig = unitigs[ti];

    if (tig == NULL)
      continue;

    if (tig->ufpath.size() == 1)
      continue;

    vector<olapDat>   repeats;

    writeLog("Annotating repeats in reads for tig %u/%u.\n", ti, tiLimit);

    //  Clear out all the existing marks.  They're not for this tig.


    //  Analyze overlaps for each read.  For each overlap to a read not in this tig, or not
    //  overlapping in this tig, and of acceptable error rate, add the overlap to repeatOlaps.

    repeatOlaps.clear();

    uint32  fiLimit    = tig->ufpath.size();
    uint32  numThreads = omp_get_max_threads();
    uint32  blockSize  = (fiLimit < 100 * numThreads) ? numThreads : fiLimit / 99;

#pragma omp parallel for if(fiLimit > 100) schedule(dynamic, blockSize)
    for (uint32 fi=0; fi<fiLimit; fi++)
      annotateRepeatsOnRead(unitigs, tig, &tig->ufpath[fi], deviationRepeat, repeatOlaps);

    writeLog("Annotated with %lu overlaps.\n", repeatOlaps.size());

    //  Merge marks for the same read into the largest possible.

    sort(repeatOlaps.begin(), repeatOlaps.end(), olapDatByEviRid);

#ifdef SHOW_ANNOTATE
    for (uint32 ii=0; ii<repeatOlaps.size(); ii++)
      if (repeatOlaps[ii].tigbgn < 1000000)
        writeLog("repeatOlaps[%u] %u-%u from tig %u read %u RAW\n",
                 ii,
                 repeatOlaps[ii].tigbgn, repeatOlaps[ii].tigend,
                 repeatOlaps[ii].eviTid, repeatOlaps[ii].eviRid);

    flushLog();
#endif

    for (uint32 dd=0, ss=1; ss<repeatOlaps.size(); ss++) {
      assert(repeatOlaps[dd].eviRid <= repeatOlaps[ss].eviRid);

      //  If different evidence reads, close the destination olap, set up
      //  for a new destination.

      if (repeatOlaps[dd].eviRid != repeatOlaps[ss].eviRid) {
        dd = ss;
        continue;
      }

      //  If the destination ends before the source begins, there is no overlap between the
      //  two regions.  Close dd, set up for a new dd.

      if (repeatOlaps[dd].tigend <= repeatOlaps[ss].tigbgn) {
        dd = ss;
        continue;
      }

      //  Otherwise, there must be an overlap.  Extend the destination region, erase the source
      //  region.

      repeatOlaps[dd].tigbgn = min(repeatOlaps[ss].tigbgn, repeatOlaps[dd].tigbgn);
      repeatOlaps[dd].tigend = max(repeatOlaps[ss].tigend, repeatOlaps[dd].tigend);

      repeatOlaps[ss].tigbgn = UINT32_MAX;
      repeatOlaps[ss].tigend = UINT32_MAX;
      repeatOlaps[ss].eviTid = UINT32_MAX;
      repeatOlaps[ss].eviRid = UINT32_MAX;
    }

    //  Sort overlaps again.  This pushes all those 'erased' regions to the end of the list, which
    //  we can then just pop off.

    sort(repeatOlaps.begin(), repeatOlaps.end(), olapDatByEviRid);

    for (uint32 ii=repeatOlaps.size(); ii--; )
      if (repeatOlaps[ii].eviTid == UINT32_MAX)
        repeatOlaps.pop_back();

    //  For logging, sort by coordinate

    sort(repeatOlaps.begin(), repeatOlaps.end());

#ifdef SHOW_ANNOTATE
    for (uint32 ii=0; ii<repeatOlaps.size(); ii++)
      writeLog("repeatOlaps[%d] %u-%u from tig %u read %u MERGED\n",
               ii,
               repeatOlaps[ii].tigbgn, repeatOlaps[ii].tigend,
               repeatOlaps[ii].eviTid, repeatOlaps[ii].eviRid);
#endif

    //  Make a new set of intervals based on all the detected repeats.

    tigMarksR.clear();

    for (uint32 bb=0, ii=0; ii<repeatOlaps.size(); ii++)
      tigMarksR.add(repeatOlaps[ii].tigbgn, repeatOlaps[ii].tigend - repeatOlaps[ii].tigbgn);

    //  Collapse all the read markings to intervals on the unitig, merging those that overlap
    //  significantly.

    writeLog("Merge marks.\n");

    tigMarksR.merge(REPEAT_OVERLAP_MIN);

    //  Scan reads, discard any mark that is contained in a read
    //
    //  We don't need to filterShort() after every one is removed, but it's simpler to do it Right Now than
    //  to track if it is needed.

    writeLog("Scan reads to discard spanned repeats.\n");

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode     *frg       = &tig->ufpath[fi];
      bool        frgfwd    = (frg->position.bgn < frg->position.end);
      int32       frglo     = (frgfwd) ? frg->position.bgn : frg->position.end;
      int32       frghi     = (frgfwd) ? frg->position.end : frg->position.bgn;
      bool        discarded = false;

      for (uint32 ri=0; ri<tigMarksR.numberOfIntervals(); ri++) {
        bool   spanLo = false;
        bool   spanHi = false;

        //  The decision of 'spanned by a read' is broken into two pieces: does the read span the
        //  lower (higher) boundary of the region.  To be spanned, the boundary needs to be spanned
        //  by at least MIN_ANCHOR_HANG additional bases (to anchor the read to non-repeat
        //  sequence).
        //
        //  This is a problem at the start/end of the tig, because no read will extend past the
        //  start/end of the tig.  Instead, if the repeat is contained within the first (last) read
        //  with no extension at the respective end, it is spanned.

        if ((frglo == 0) &&                                   //  Read at start of tig, spans off the high end
            (tigMarksR.hi(ri) + MIN_ANCHOR_HANG <= frghi))
          spanLo = spanHi = true;

        if ((frghi == tig->getLength()) &&                    //  Read at end of tig, spans off the low end
            (frglo + MIN_ANCHOR_HANG <= tigMarksR.lo(ri)))
          spanLo = spanHi = true;

        if (frglo + MIN_ANCHOR_HANG <= tigMarksR.lo(ri))      //  Read spanned off the low end
          spanLo = true;

        if (tigMarksR.hi(ri) + MIN_ANCHOR_HANG <= frghi)      //  Read spanned off the high end
          spanHi = true;

        if (spanLo && spanHi) {
          writeLog("discard region %8d:%-8d - contained in read %6u %8d-%8d\n",
                   tigMarksR.lo(ri), tigMarksR.hi(ri), frg->ident, frglo, frghi);

          tigMarksR.lo(ri) = 0;
          tigMarksR.hi(ri) = 0;

          discarded = true;
        }
      }


      if (discarded)
        tigMarksR.filterShort(1);
    }

    //  Run through again, looking for the thickest overlap(s) to the remaining regions.
    //  This isn't caring about the end effect noted above.

#if 1
    writeLog("thickest edges to the repeat regions:\n");

    for (uint32 ri=0; ri<tigMarksR.numberOfIntervals(); ri++) {
      uint32   t5 = UINT32_MAX, l5 = 0, t5bgn, t5end;
      uint32   t3 = UINT32_MAX, l3 = 0, t3bgn, t3end;

      for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
        ufNode     *frg       = &tig->ufpath[fi];
        bool        frgfwd    = (frg->position.bgn < frg->position.end);
        int32       frglo     = (frgfwd) ? frg->position.bgn : frg->position.end;
        int32       frghi     = (frgfwd) ? frg->position.end : frg->position.bgn;
        bool        discarded = false;

        //  Overlap off the 5' end of the region.
        if (frglo <= tigMarksR.lo(ri) && (tigMarksR.lo(ri) <= frghi)) {
          uint32 olap = frghi - tigMarksR.lo(ri);
          if (l5 < olap) {
            l5    = olap;
            t5    = fi;
            t5bgn = frglo;  //  Easier than recomputing it later on...
            t5end = frghi;
          }
        }

        //  Overlap off the 3' end of the region.
        if (frglo <= tigMarksR.hi(ri) && (tigMarksR.hi(ri) <= frghi)) {
          uint32 olap = tigMarksR.hi(ri) - frglo;
          if (l3 < olap) {
            l3    = olap;
            t3    = fi;
            t3bgn = frglo;
            t3end = frghi;
          }
        }

        if (frglo <= tigMarksR.lo(ri) && (tigMarksR.hi(ri) <= frghi)) {
          writeLog("saved   region %8d:%-8d - closest    read %6u (%+6d) %8d:%-8d (%+6d) (contained)\n",
                   tigMarksR.lo(ri), tigMarksR.hi(ri),
                   frg->ident,
                   tigMarksR.lo(ri) - frglo, frglo,
                   frghi, frghi - tigMarksR.hi(ri));
        }
      }

      if (t5 != UINT32_MAX)
        writeLog("saved   region %8d:%-8d - closest 5' read %6u (%+6d) %8d:%-8d (%+6d)\n",
                 tigMarksR.lo(ri), tigMarksR.hi(ri),
                 tig->ufpath[t5].ident,
                 tigMarksR.lo(ri) - t5bgn, t5bgn,
                 t5end, t5end - tigMarksR.hi(ri));

      if (t3 != UINT32_MAX)
        writeLog("saved   region %8d:%-8d - closest 3' read %6u (%+6d) %8d:%-8d (%+6d)\n",
                 tigMarksR.lo(ri), tigMarksR.hi(ri),
                 tig->ufpath[t3].ident,
                 tigMarksR.lo(ri) - t3bgn, t3bgn,
                 t3end, t3end - tigMarksR.hi(ri));
    }
#endif


    //  Scan reads.  If a read intersects a repeat interval, and the best edge for that read
    //  is entirely in the repeat region, decide if there is a near-best edge to something
    //  not in this tig.
    //
    //  A region with no such near-best edges is _probably_ correct.

    writeLog("search for confused edges:\n");

    uint32  *isConfused  = new uint32 [tigMarksR.numberOfIntervals()];

    memset(isConfused, 0, sizeof(uint32) * tigMarksR.numberOfIntervals());

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode     *rdA       = &tig->ufpath[fi];
      uint32      rdAid     = rdA->ident;
      bool        rdAfwd    = (rdA->position.bgn < rdA->position.end);
      int32       rdAlo     = (rdAfwd) ? rdA->position.bgn : rdA->position.end;
      int32       rdAhi     = (rdAfwd) ? rdA->position.end : rdA->position.bgn;

      double      sc        = (rdAhi - rdAlo) / (double)FI->fragmentLength(rdAid);

      if ((OG->isContained(rdAid)  == true) ||
          (OG->isSuspicious(rdAid) == true))
        continue;

      for (uint32 ri=0; ri<tigMarksR.numberOfIntervals(); ri++) {
        uint32  rMin = tigMarksR.lo(ri);
        uint32  rMax = tigMarksR.hi(ri);

        if ((rdAhi < rMin) ||   //  Read ends before the region
            (rMax  < rdAlo))    //  Read starts after the region
          continue;              //   -> don't care about this read!

        //  Compute the position (in the tig) of the best overlaps.

        int32  tig5bgn=0, tig5end=0;
        int32  tig3bgn=0, tig3end=0;

        //  Instead of using the best edge - which might not be the edge used in the unitig -
        //  we need to scan the layout to return the previous/next dovetail

        //  Put this in a function - what to return if no best overlap?

        BestEdgeOverlap   *b5 = OG->getBestEdgeOverlap(rdAid, false);
        BestEdgeOverlap   *b3 = OG->getBestEdgeOverlap(rdAid, true);

        //  If the best edge is to a read not in this tig, there is nothing to compare against.
        //  Is this confused by default?  Possibly.  The unitig was constructed somehow, and that
        //  must then be the edge coming into us.  We'll pick it up later.

        bool b5use = true;
        bool b3use = true;

        if (b5->fragId() == 0)
          b5use = false;
        if (b3->fragId() == 0)
          b3use = false;

        if ((b5use) && (Unitig::fragIn(b5->fragId()) != tig->id()))
          b5use = false;
        if ((b3use) && (Unitig::fragIn(b3->fragId()) != tig->id()))
          b3use = false;

        //  The best edge read is in this tig.  If they don't overlap, again, nothing to compare
        //  against.

        if (b5use) {
          ufNode     *rdB       = &tig->ufpath[Unitig::pathPosition(b5->fragId())];
          uint32      rdBid     = rdB->ident;
          bool        rdBfwd    = (rdB->position.bgn < rdB->position.end);
          int32       rdBlo     = (rdBfwd) ? rdB->position.bgn : rdB->position.end;
          int32       rdBhi     = (rdBfwd) ? rdB->position.end : rdB->position.bgn;

          if ((rdAhi < rdBlo) ||
              (rdBhi < rdAlo))
            b5use = false;
        }

        if (b3use) {
          ufNode     *rdB       = &tig->ufpath[Unitig::pathPosition(b3->fragId())];
          uint32      rdBid     = rdB->ident;
          bool        rdBfwd    = (rdB->position.bgn < rdB->position.end);
          int32       rdBlo     = (rdBfwd) ? rdB->position.bgn : rdB->position.end;
          int32       rdBhi     = (rdBfwd) ? rdB->position.end : rdB->position.bgn;

          if ((rdAhi < rdBlo) ||
              (rdBhi < rdAlo))
            b3use = false;
        }

        //  If we can use this edge, compute the placement of the overlap on the unitig.

        //  Call #1;

        if (b5use) {
          int32   bgn=0, end=0;

          olapToReadCoords(rdA,
                           b5->ahang(), b5->bhang(),
                           bgn, end);

          tig5bgn = (rdAfwd) ? (rdAlo + sc * bgn) : (rdAhi - sc * end);
          tig5end = (rdAfwd) ? (rdAlo + sc * end) : (rdAhi - sc * bgn);
        
          assert(tig5bgn < tig5end);

          if (tig5bgn < 0)                  tig5bgn = 0;
          if (tig5end > tig->getLength())   tig5end = tig->getLength();
        }

        //  Call #2

        if (b3use) {
          int32   bgn=0, end=0;

          olapToReadCoords(rdA,
                           b3->ahang(), b3->bhang(),
                           bgn, end);

          tig3bgn = (rdAfwd) ? (rdAlo + sc * bgn) : (rdAhi - sc * end);
          tig3end = (rdAfwd) ? (rdAlo + sc * end) : (rdAhi - sc * bgn);

          assert(tig3bgn < tig3end);

          if (tig3bgn < 0)                  tig3bgn = 0;
          if (tig3end > tig->getLength())   tig3end = tig->getLength();
        }

        //  If either of the 5' or 3' overlaps (or both!) are in the repeat region, we need to check for
        //  close overlaps on that end.

        uint32  len5 = 0;
        uint32  len3 = 0;

        if ((rMin    < tig5bgn) &&
            (tig5end < rMax) &&
            (b5use))
          len5 = FI->overlapLength(rdAid, b5->fragId(), b5->ahang(), b5->bhang());
        else
          b5use = false;

        if ((rMin    < tig3bgn) &&
            (tig3end < rMax) &&
            (b3use))
          len3 = FI->overlapLength(rdAid, b3->fragId(), b3->ahang(), b3->bhang());
        else
          b3use = false;

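        //  Score each usable best edge by its expected number of identical bases: the overlap
        //  length weighted by one minus the error rate.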
        double score5 = len5 * (1 - b5->erate());
        double score3 = len3 * (1 - b3->erate());

        //  Neither of the best edges is in the repeat region; move to the next region and/or read.
        if (len5 + len3 == 0)
          continue;

        //  At least one of the best edge overlaps is in the repeat region.  Scan for other edges
        //  that are of comparable length and quality.

        uint32        ovlLen   = 0;
        BAToverlap   *ovl      = OC->getOverlaps(rdAid, AS_MAX_ERATE, ovlLen);

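          //  Walk every overlap for read A, discarding any that cannot cause confusion; whatever
          //  survives the filters below is scored against the best edge off the same end.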
        for (uint32 oo=0; oo<ovlLen; oo++) {
          uint32   rdBid    = ovl[oo].b_iid;
          uint32   tgBid    = Unitig::fragIn(rdBid);

          //  If the read is in a singleton, skip.  These are unassembled crud.
          if ((tgBid                         == 0) ||
              (unitigs[tgBid]                == NULL) ||
              (unitigs[tgBid]->ufpath.size() == 1))
            continue;

          //  If the read is in an annotated bubble, skip.
          if (unitigs[tgBid]->_isBubble)
            continue;

          //  Skip if this overlap is the best we're trying to match.
          if ((rdBid == b5->fragId()) ||
              (rdBid == b3->fragId()))
            continue;

          //  Skip if this overlap is crappy quality
          if (OG->isOverlapBadQuality(ovl[oo]))
            continue;

          //  Skip if the read is contained or suspicious.
          if ((OG->isContained(rdBid)  == true) ||
              (OG->isSuspicious(rdBid) == true))
            continue;

          //  Skip if the overlap isn't dovetail.
          bool  ovl5 = ovl[oo].AEndIs5prime();
          bool  ovl3 = ovl[oo].AEndIs3prime();

          if ((ovl5 == false) &&
              (ovl3 == false))
            continue;

          //  Skip if we're not using this overlap
          if ((ovl5 == true) && (b5use == false))
            continue;

          if ((ovl3 == true) && (b3use == false))
            continue;


          uint32   rdBpos   =  unitigs[tgBid]->pathPosition(rdBid);
          ufNode  *rdB      = &unitigs[tgBid]->ufpath[rdBpos];

          bool     rdBfwd   = (rdB->position.bgn < rdB->position.end);
          int32    rdBlo    = (rdBfwd) ? rdB->position.bgn : rdB->position.end;
          int32    rdBhi    = (rdBfwd) ? rdB->position.end : rdB->position.bgn;

          //  If the overlap is to a read in a different tig, or to a read in the same tig that does
          //  not overlap read A in the layout, it is a potential source of confusion, so compare
          //  scores.  Otherwise, the overlap is already represented in the tig and can't be confused.
          if ((tgBid == tig->id()) &&
              (rdBlo <= rdAhi) &&
              (rdAlo <= rdBhi))
            continue;

          uint32  len   = FI->overlapLength(rdAid, ovl[oo].b_iid, ovl[oo].a_hang, ovl[oo].b_hang);
          double  score = len * (1 - ovl[oo].erate);

          //  Compute percent difference.
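          //  pd = 200 * |a - b| / (a + b), the absolute difference expressed as a percentage of
          //  the average of the two scores.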

          double  ad5 = fabs(score - score5);
          double  ad3 = fabs(score - score3);

          double  pd5 = 200 * ad5 / (score + score5);
          double  pd3 = 200 * ad3 / (score + score3);

          //  Skip if this overlap is vastly worse than the best.

          if ((ovl5 == true) && ((ad5 >= confusedAbsolute) || (pd5 > confusedPercent))) {
            writeLog("tig %7u read %8u pos %7u-%-7u NOT confused by 5' edge to read %8u - best edge read %8u len %6u erate %.4f score %8.2f - alt edge len %6u erate %.4f score %8.2f - absdiff %8.2f percdiff %8.4f\n",
                     tig->id(), rdAid, rdAlo, rdAhi,
                     rdBid,
                     b5->fragId(), len5, b5->erate(), score5,
                     len, ovl[oo].erate, score,
                     ad5, pd5);
            continue;
          }

          if ((ovl3 == true) && ((ad3 >= confusedAbsolute) || (pd3 > confusedPercent))) {
            writeLog("tig %7u read %8u pos %7u-%-7u NOT confused by 3' edge to read %8u - best edge read %8u len %6u erate %.4f score %8.2f - alt edge len %6u erate %.4f score %8.2f - absdiff %8.2f percdiff %8.4f\n",
                     tig->id(), rdAid, rdAlo, rdAhi,
                     rdBid,
                     b3->fragId(), len3, b3->erate(), score3,
                     len, ovl[oo].erate, score,
                     ad3, pd3);
            continue;
          }

          //  Potential confusion!

          if (ovl5 == true)
            writeLog("tig %7u read %8u pos %7u-%-7u IS confused by 5' edge to read %8u - best edge read %8u len %6u erate %.4f score %8.2f - alt edge len %6u erate %.4f score %8.2f - absdiff %8.2f percdiff %8.4f\n",
                     tig->id(), rdAid, rdAlo, rdAhi,
                     rdBid,
                     b5->fragId(), len5, b5->erate(), score5,
                     len, ovl[oo].erate, score,
                     ad5, pd5);

          if (ovl3 == true)
            writeLog("tig %7u read %8u pos %7u-%-7u IS confused by 3' edge to read %8u - best edge read %8u len %6u erate %.4f score %8.2f - alt edge len %6u erate %.4f score %8.2f - absdiff %8.2f percdiff %8.4f\n",
                     tig->id(), rdAid, rdAlo, rdAhi,
                     rdBid,
                     b3->fragId(), len3, b3->erate(), score3,
                     len, ovl[oo].erate, score,
                     ad3, pd3);

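          //  Record the confusion against the current repeat region.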
          isConfused[ri]++;
        }
      }  //  Over all marks (ri)
    }  //  Over all reads (fi)


    //  Scan all the regions, and delete any that have no confusion.

    {
      bool  discarded = false;

      for (uint32 ri=0; ri<tigMarksR.numberOfIntervals(); ri++) {
        if (isConfused[ri] == 0) {
          writeLog("discard region %8d:%-8d - no confusion in best edges\n",
                   tigMarksR.lo(ri), tigMarksR.hi(ri));

          tigMarksR.lo(ri) = 0;
          tigMarksR.hi(ri) = 0;

          discarded = true;
        }

        else {
          writeLog("saved   region %8d:%-8d - %u best edges are potentially confused\n",
                   tigMarksR.lo(ri), tigMarksR.hi(ri), isConfused[ri]);
        }
      }

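      //  The discarded intervals above were set to zero length; filterShort(1) then drops those
      //  zero-length placeholders from the list.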
      if (discarded)
        tigMarksR.filterShort(1);
    }

    delete [] isConfused;





    //  Scan reads, and join any marks whose junction is spanned by a sufficiently large amount.
    //
    //  If the read spans this junction by the usual amount, merge the intervals.
    //
    //  The intervals can be overlapping (by up to REPEAT_OVERLAP_MIN (x2?) bases).  For this junction
    //  to be spanned, the read must span from min-ROM to max+ROM, not just hi(ri-1) to lo(ri).
    //
    //  We DO need to filterShort() after every merge; otherwise we'd have an empty bogus interval
    //  in the middle of our list, which could be preventing some other merge.
    //
    //  Anything that gets merged is no longer a true repeat.  It's unique, just bordered by repeats.
    //  We can't track this through the indices (because we delete things), so we track it with a set
    //  of begin coordinates.

    set<int32>  nonRepeatIntervals;

    writeLog("Scan reads to merge repeat regions.\n");

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode     *frg       = &tig->ufpath[fi];
      bool        frgfwd    = (frg->position.bgn < frg->position.end);
      int32       frglo     = (frgfwd) ? frg->position.bgn : frg->position.end;
      int32       frghi     = (frgfwd) ? frg->position.end : frg->position.bgn;
      bool        merged    = false;

      for (uint32 ri=1; ri<tigMarksR.numberOfIntervals(); ri++) {
        uint32  rMin = min(tigMarksR.hi(ri-1), tigMarksR.lo(ri));
        uint32  rMax = max(tigMarksR.hi(ri-1), tigMarksR.lo(ri));
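        //  rMin/rMax bound the junction between regions ri-1 and ri; adjacent repeat regions can
        //  overlap slightly, so take the min and max of the facing endpoints.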

        if ((frglo + MIN_ANCHOR_HANG <= rMin) && (rMax + MIN_ANCHOR_HANG <= frghi)) {
          writeLog("merge regions %8d:%-8d and %8d:%-8d - junction contained in read %6u %5d-%5d\n",
                   tigMarksR.lo(ri-1), tigMarksR.hi(ri-1),
                   tigMarksR.lo(ri), tigMarksR.hi(ri),
                   frg->ident, frglo, frghi);

          tigMarksR.lo(ri) = tigMarksR.lo(ri-1);

          tigMarksR.lo(ri-1) = 0;   //  CRITICAL to delete this interval (and not ri) because the next
          tigMarksR.hi(ri-1) = 0;   //  iteration will be using ri-1 (== ri here) and ri (== ri+1).

          merged = true;

          nonRepeatIntervals.insert(tigMarksR.lo(ri));
        }
      }

      if (merged)
        tigMarksR.filterShort(1);
    }

    //  Extend the regions by MIN_ANCHOR_HANG.  This makes checking for reads that span and are
    //  anchored in the next region easier.  It also solved a quirk when the first/last repeat
    //  region doesn't extend to the end of the sequence:
    //    0-183     unique  (created from inversion below, but useless and incorrect)
    //    183-9942  repeat

    for (uint32 ii=0; ii<tigMarksR.numberOfIntervals(); ii++) {
      tigMarksR.lo(ii) = max<int32>(tigMarksR.lo(ii) - MIN_ANCHOR_HANG, 0);
      tigMarksR.hi(ii) = min<int32>(tigMarksR.hi(ii) + MIN_ANCHOR_HANG, tig->getLength());
    }

    //  Find the non-repeat intervals.

    tigMarksU = tigMarksR;
    tigMarksU.invert(0, tig->getLength());
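    //  invert() replaces the interval list with its complement over [0, tig length), so tigMarksU
    //  now holds the unique (non-repeat) regions.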

    //  Create the list of intervals we'll use to make new unitigs.
    //
    //  The repeat intervals are extended by MIN_ANCHOR_HANG, and then any read fully contained in one of
    //  these is moved here.
    //
    //  The non-repeat intervals are shortened by the same amount, and any read that intersects one
    //  is moved there.
    //
    //  Does order matter?  Not sure.  The repeat intervals are first, then the formerly repeat
    //  merged intervals, then the unique intervals.  Splitting might depend on the repeats being
    //  first.

    writeLog("Make breakpoints.\n");

    vector<breakPointCoords>   BP;

    for (uint32 ii=0; ii<tigMarksR.numberOfIntervals(); ii++)
      if (nonRepeatIntervals.count(tigMarksR.lo(ii)) == 0)
        BP.push_back(breakPointCoords(ti, tigMarksR.lo(ii), tigMarksR.hi(ii), true));

    for (uint32 ii=0; ii<tigMarksR.numberOfIntervals(); ii++)
      if (nonRepeatIntervals.count(tigMarksR.lo(ii)) != 0)
        BP.push_back(breakPointCoords(ti, tigMarksR.lo(ii), tigMarksR.hi(ii), true));

    for (uint32 ii=0; ii<tigMarksU.numberOfIntervals(); ii++) {
      BP.push_back(breakPointCoords(ti, tigMarksU.lo(ii), tigMarksU.hi(ii), false));
    }

    //  If only one region, the whole unitig was declared repeat.  Nothing to do.

    if (BP.size() == 1)
      continue;

    sort(BP.begin(), BP.end());
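    //  breakPointCoords presumably orders by begin coordinate, so the regions are laid out left to
    //  right along the tig.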

    //  Report.

    writeLog("break tig %u into up to %u pieces:\n", ti, BP.size());
    for (uint32 ii=0; ii<BP.size(); ii++)
      writeLog("  %8d %8d %s (length %d)\n",
               BP[ii]._bgn, BP[ii]._end,
               BP[ii]._isRepeat ? "repeat" : "unique",
               BP[ii]._end - BP[ii]._bgn);

    //  Scan the reads, counting the number of reads that would be placed in each new tig.  This is done
    //  because there are a few 'splits' that don't move any reads around.

    Unitig **newTigs   = new Unitig * [BP.size()];
    int32   *lowCoord  = new int32    [BP.size()];
    uint32  *nRepeat   = new uint32   [BP.size()];
    uint32  *nUnique   = new uint32   [BP.size()];
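    //  Per-region outputs from splitUnitigs(): the new tig for the region (if any), presumably the
    //  lowest layout coordinate of any read placed in it, and counts of repeat/unique reads placed.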

    //  First call, count the number of tigs we would create if we let it create them.

    uint32  nTigs = splitUnitigs(unitigs, tig, BP, newTigs, lowCoord, nRepeat, nUnique, false);

    //  Second call, actually create the tigs, if anything would change.

    if (nTigs > 1)
      splitUnitigs(unitigs, tig, BP, newTigs, lowCoord, nRepeat, nUnique, true);

    //  Report the tigs created.

    for (uint32 ii=0; ii<BP.size(); ii++) {
      int32   rgnbgn = BP[ii]._bgn;
      int32   rgnend = BP[ii]._end;
      bool    repeat = BP[ii]._isRepeat;

      if      (nRepeat[ii] + nUnique[ii] == 0)
        writeLog("For tig %5u %s region %8d %8d - %6u/%6u repeat/unique reads - no new unitig created.\n",
                 ti, (repeat == true) ? "repeat" : "unique", rgnbgn, rgnend, nRepeat[ii], nUnique[ii]);

      else if (nTigs > 1)
        writeLog("For tig %5u %s region %8d %8d - %6u/%6u reads repeat/unique - unitig %5u created.\n",
                 ti, (repeat == true) ? "repeat" : "unique", rgnbgn, rgnend, nRepeat[ii], nUnique[ii], newTigs[ii]->id());

      else
        writeLog("For tig %5u %s region %8d %8d - %6u/%6u repeat/unique reads - unitig %5u remains unchanged.\n",
                 ti, (repeat == true) ? "repeat" : "unique", rgnbgn, rgnend, nRepeat[ii], nUnique[ii], tig->id());
    }

    //  Cleanup.

    delete [] newTigs;
    delete [] lowCoord;
    delete [] nRepeat;
    delete [] nUnique;

    //  Remove the old unitig....if we made new ones.

    if (nTigs > 1) {
      delete tig;
      unitigs[ti] = NULL;
    }
  }
}