//  Dump the current unitigs to an intermediate tigStore named
//  '<prefix>.<logFileOrder>.<name>.tigStore'.  Does nothing unless the
//  LOG_INTERMEDIATE_UNITIGS log flag is set.
//
//    unitigs - the unitigs to dump; NULL entries are ignored when sizing partitions.
//    prefix  - first component of the store name.
//    name    - label for this dump, third component of the store name.
//
//  The same path is passed to writeUnitigsToStore() both as the store path and as the prefix for
//  its partitioning files.
//
void
reportUnitigs(UnitigVector &unitigs, const char *prefix, const char *name) {

  if (logFileFlagSet(LOG_INTERMEDIATE_UNITIGS) == 0)
    return;

  uint32  numFragsT = 0;   //  Total fragments over all unitigs.
  uint32  numFragsP = 0;   //  Target number of fragments per partition.
  uint64  utgLen    = 0;   //  Summed length of unitigs with more than two fragments.

  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig  *utg = unitigs[ti];

    if (utg == NULL)
      continue;

    numFragsT += utg->ufpath.size();

    if (utg->ufpath.size() > 2)
      utgLen += utg->getLength();
  }

  //  Compute average frags per partition; more total sequence means more partitions.

  if      (utgLen < 16 * 1024 * 1024)
    numFragsP = numFragsT / 7;
  else if (utgLen < 64 * 1024 * 1024)
    numFragsP = numFragsT / 63;
  else
    numFragsP = numFragsT / 127;

  //  Bounded formatting: prefix and name are caller supplied, so never write past the buffer.

  char  tigStorePath[FILENAME_MAX];
  snprintf(tigStorePath, sizeof(tigStorePath), "%s.%03u.%s.tigStore", prefix, logFileOrder, name);

  //  Failing to do this results in consensus running about 40 times slower.  Three hours instead
  //  of five minutes.
  setParentAndHang(unitigs);

  writeUnitigsToStore(unitigs, tigStorePath, tigStorePath, numFragsP, false);
}
//  Exploratory search for 'mate bubbles': small unitigs whose external mates mostly land in one
//  other (non-singleton) unitig.  Nothing is modified; candidates are only written to the log.
//
//    unitigs - all unitigs; NULL and empty entries are skipped.
//
void
popMateBubbles(UnitigVector &unitigs) {

  writeLog("==> SEARCHING FOR MATE BUBBLES\n");

  //  For each unitig, if all (or most) of the external mates are to a single other unitig (not
  //  counting singletons), then this is a potential bubble popping unitig.
  //
  //  At present, this is exploratory only.

  for (uint32 ti=0; ti<unitigs.size(); ti++) {
    Unitig  *tig = unitigs[ti];

    if ((tig == NULL) ||
        (tig->ufpath.size() == 0))
      //  No tig here.
      continue;

    if ((tig->getLength() > 1000) ||
        (tig->ufpath.size() >= 3000))
      //  Tig too big.
      continue;

    //  (A 'tig too small' filter -- length < 150 or fewer than 5 reads -- is intentionally
    //  disabled.)

    uint32  *lkg    = new uint32 [tig->ufpath.size()];   //  Unitig id of each usable external mate.
    uint32   lkgLen = 0;
    uint32   lkgExt = 0;                                 //  All external mates, usable or not.

    for (uint32 fi=0; fi<tig->ufpath.size(); fi++) {
      ufNode  *frg   = &tig->ufpath[fi];
      int32    frgID = frg->ident;
      int32    matID = FI->mateIID(frgID);

      if (matID == 0)
        //  No mate.
        continue;

      uint32   mtigID = tig->fragIn(matID);
      Unitig  *mtig   = unitigs[mtigID];

      if (mtigID == tig->id())
        //  Mate is not external.
        continue;

      lkgExt++;

      //  A mate with no unitig at all is treated like a mate in a singleton, instead of being
      //  dereferenced.
      if ((mtig == NULL) ||
          (mtig->ufpath.size() < 2))
        continue;

      lkg[lkgLen++] = mtigID;
    }

    //  Sort the target ids and report every run of more than three mates to the same unitig.
    //  When there are no usable external mates we fall through, so the array is always released.

    if (lkgLen > 0) {
      sort(lkg, lkg+lkgLen);

      uint32  last = lkg[0];
      uint32  lcnt = 1;

      for (uint32 i=1; i<lkgLen; i++) {
        if (last != lkg[i]) {
          if (lcnt > 3)
            writeLog("popMateBubble()-- tig %d len %d might pop bubble in tig %u (%u mates in there out of %d external mates)\n",
                     tig->id(), tig->getLength(), last, lcnt, lkgExt);
          last = lkg[i];
          lcnt = 0;
        }

        lcnt++;
      }

      if (lcnt > 3)
        writeLog("popMateBubble()-- tig %d len %d might pop bubble in tig %u (%u mates in there out of %d external mates)\n",
                 tig->id(), tig->getLength(), last, lcnt, lkgExt);
    }

    delete [] lkg;
  }
}
void breakUnitigs(UnitigVector &unitigs, char *output_prefix, bool enableIntersectionBreaking) { writeLog("==> BREAKING UNITIGS.\n"); intersectionList *ilist = new intersectionList(unitigs); // Stop when we've seen all current unitigs. Replace tiMax // in the for loop below with unitigs.size() to recursively // split unitigs. uint32 tiMax = unitigs.size(); for (uint32 ti=0; ti<tiMax; ti++) { Unitig *tig = unitigs[ti]; if (tig == NULL) continue; vector<breakPoint> breaks; for (uint32 fi=0; fi<tig->ufpath.size(); fi++) { ufNode *frg = &tig->ufpath[fi]; intersectionPoint *isect = ilist->getIntersection(frg->ident, 0); if (isect == NULL) continue; for (; isect->isectFrg == frg->ident; isect++) { assert(tig->id() == Unitig::fragIn(isect->isectFrg)); // Grab the invading unitig Unitig *inv = unitigs[Unitig::fragIn(isect->invadFrg)]; assert(inv->id() == Unitig::fragIn(isect->invadFrg)); // Grab the best edges off the invading fragment. BestEdgeOverlap *best5 = OG->getBestEdgeOverlap(isect->invadFrg, false); BestEdgeOverlap *best3 = OG->getBestEdgeOverlap(isect->invadFrg, true); // Check if the incoming tig is a spur, and we should just ignore it immediately if ((inv->ufpath.size() == 1) && ((best5->fragId() == 0) || (best3->fragId() == 0))) { if (logFileFlagSet(LOG_INTERSECTION_BREAKING)) writeLog("unitig %d frag %d end %c' into unitig %d frag %d end %c' -- IS A SPUR, skip it\n", inv->id(), isect->invadFrg, isect->invad3p ? '3' : '5', tig->id(), isect->isectFrg, isect->isect3p ? '3' : '5'); continue; } // Keep only significant intersections if ((inv->getLength() > MIN_BREAK_LENGTH) && (inv->ufpath.size() > MIN_BREAK_FRAGS)) { if (logFileFlagSet(LOG_INTERSECTION_BREAKING)) writeLog("unitig %d frag %d end %c' into unitig %d frag %d end %c'\n", inv->id(), isect->invadFrg, isect->invad3p ? '3' : '5', tig->id(), isect->isectFrg, isect->isect3p ? 
'3' : '5'); breaks.push_back(breakPoint(isect->isectFrg, isect->isect3p, true, false)); } } // Over all incoming fragments // If this is the last fragment, terminate the break point list with a 'fakeEnd' (in AS_BAT_Breaking.cc) break point // at the end of the unitig. if ((fi+1 == tig->ufpath.size()) && (breaks.size() > 0)) { breaks.push_back(breakPoint(frg->ident, (frg->position.bgn < frg->position.end), true, false)); } } // Over all fragments in the unitig if (breaks.size() == 0) continue; // Report where breaks occur. 'breaks' is a list, not a vector. #if 0 // We've lost the fields in breaks[i] -- but the reports above aren't updated yet. if (logFileFlagSet(LOG_INTERSECTION_BREAKING) || logFileFlagSet(LOG_MATE_SPLIT_COVERAGE_PLOT)) for (uint32 i=0; i<breaks.size(); i++) writeLog("BREAK unitig %d at position %d,%d from inSize %d inFrags %d.\n", tig->id(), breaks[i].fragPos.bgn, breaks[i].fragPos.end, breaks[i].inSize, breaks[i].inFrags); #endif // Actually do the breaking. if (enableIntersectionBreaking) breakUnitigAt(unitigs, tig, breaks, true); breaks.clear(); } // Over all unitigs }
void writeUnitigsToStore(UnitigVector &unitigs, char *fileprefix, char *tigStorePath, uint32 frg_count_target, bool isFinal) { uint32 utg_count = 0; uint32 frg_count = 0; uint32 prt_count = 1; char filename[FILENAME_MAX] = {0}; uint32 *partmap = new uint32 [unitigs.size()]; // This code closely follows that in AS_CGB_unitigger.c::output_the_chunks() if (isFinal) checkUnitigMembership(unitigs); // Open up the initial output file sprintf(filename, "%s.iidmap", fileprefix); FILE *iidm = fopen(filename, "w"); assert(NULL != iidm); sprintf(filename, "%s.partitioning", fileprefix); FILE *part = fopen(filename, "w"); assert(NULL != part); sprintf(filename, "%s.partitioningInfo", fileprefix); FILE *pari = fopen(filename, "w"); assert(NULL != pari); // Step through all the unitigs once to build the partition mapping and IID mapping. memset(partmap, 0xff, sizeof(uint32) * unitigs.size()); for (uint32 iumiid=0, ti=0; ti<unitigs.size(); ti++) { Unitig *utg = unitigs[ti]; uint32 nf = (utg) ? utg->getNumFrags() : 0; if ((utg == NULL) || (nf == 0)) continue; assert(utg->getLength() > 0); assert(nf == utg->ufpath.size()); if ((frg_count + nf >= frg_count_target) && (frg_count > 0)) { fprintf(pari, "Partition %d has %d unitigs and %d fragments.\n", prt_count, utg_count, frg_count); prt_count++; utg_count = 0; frg_count = 0; } uint32 tigid = (isFinal) ? iumiid : ti; assert(tigid < unitigs.size()); partmap[tigid] = prt_count; fprintf(iidm, "Unitig "F_U32" == IUM "F_U32" (in partition "F_U32" with "F_U32" frags)\n", utg->id(), (tigid), partmap[(tigid)], nf); for (uint32 fragIdx=0; fragIdx<nf; fragIdx++) { ufNode *f = &utg->ufpath[fragIdx]; fprintf(part, "%d\t%d\n", prt_count, f->ident); } utg_count += 1; frg_count += nf; iumiid++; } fprintf(pari, "Partition %d has %d unitigs and %d fragments.\n", prt_count, utg_count, frg_count); fclose(pari); fclose(part); fclose(iidm); // Step through all the unitigs once to build the partition mapping and IID mapping. 
tgStore *tigStore = new tgStore(tigStorePath); tgTig *tig = new tgTig; for (uint32 iumiid=0, ti=0; ti<unitigs.size(); ti++) { Unitig *utg = unitigs[ti]; uint32 nf = (utg) ? utg->getNumFrags() : 0; if ((utg == NULL) || (nf == 0)) continue; unitigToTig(tig, (isFinal) ? iumiid : ti, utg); tigStore->insertTig(tig, false); iumiid++; } delete tig; delete tigStore; delete [] partmap; }
void writeUnitigsToStore(UnitigVector &unitigs, char *fileprefix, char *tigStorePath, uint32 frg_count_target, bool isFinal) { uint32 utg_count = 0; uint32 frg_count = 0; uint32 prt_count = 1; char filename[FILENAME_MAX] = {0}; uint32 *partmap = new uint32 [unitigs.size()]; // This code closely follows that in AS_CGB_unitigger.c::output_the_chunks() if (isFinal) checkUnitigMembership(unitigs); // Open up the initial output file sprintf(filename, "%s.iidmap", fileprefix); FILE *iidm = fopen(filename, "w"); assert(NULL != iidm); sprintf(filename, "%s.partitioning", fileprefix); FILE *part = fopen(filename, "w"); assert(NULL != part); sprintf(filename, "%s.partitioningInfo", fileprefix); FILE *pari = fopen(filename, "w"); assert(NULL != pari); // Step through all the unitigs once to build the partition mapping and IID mapping. tgStore *tigStore = new tgStore(tigStorePath); tgTig *tig = new tgTig; for (uint32 tigID=0, ti=0; ti<unitigs.size(); ti++) { Unitig *utg = unitigs[ti]; if ((utg == NULL) || (utg->getNumFrags() == 0)) continue; assert(utg->getLength() > 0); // Convert the bogart tig to a tgTig and save to the store. unitigToTig(tig, (isFinal) ? tigID : ti, utg); tigID++; tigStore->insertTig(tig, false); // Increment the partition if the current one is too large. if ((frg_count + utg->getNumFrags() >= frg_count_target) && (frg_count > 0)) { fprintf(pari, "Partition %d has %d unitigs and %d fragments.\n", prt_count, utg_count, frg_count); prt_count++; utg_count = 0; frg_count = 0; } // Note that the tig is included in this partition. utg_count += 1; frg_count += utg->getNumFrags(); // Map the tig to a partition, and log both the tig-to-partition map and the partition-to-read map. 
fprintf(iidm, "bogart "F_U32" -> tig "F_U32" (in partition "F_U32" with "F_U32" frags)\n", utg->id(), utg->tigID(), prt_count, utg->getNumFrags()); for (uint32 fragIdx=0; fragIdx<utg->getNumFrags(); fragIdx++) fprintf(part, "%d\t%d\n", prt_count, utg->ufpath[fragIdx].ident); } fprintf(pari, "Partition %d has %d unitigs and %d fragments.\n", // Don't forget to log the last partition! prt_count, utg_count, frg_count); fclose(pari); fclose(part); fclose(iidm); delete tig; delete tigStore; }
// Examine the first (few?) fragments of a unitig, evaluate if they indicate a join should be made. static bool joinUnitigs_examineEnd(UnitigVector &unitigs, Unitig *fr, uint32 idx, bool frFirstEnd, vector<joinEntry> &joins) { uint32 frgIdx = (frFirstEnd) ? (idx) : (fr->ufpath.size() - 1 - idx); ufNode *frg = &fr->ufpath[frgIdx]; bool frgRev = (frg->position.end < frg->position.bgn); // Grab the best edge for this end frag. The last arg requests the 3' end if true. // // If we're looking at the first read, we want to get: // 5' - if the frag is forward // 3' - if the frag is reverse (frgRev == true) // // If we're looking at the lat read, we want to get: // 5' - if the frag is reverse // 3' - if the frag is forward (frgRev == false) // BestEdgeOverlap *bestEdge = OG->getBestEdgeOverlap(frg->ident, (frgRev == frFirstEnd)); uint32 tgtId = bestEdge->fragId(); bool tgt3p = bestEdge->frag3p(); if (tgtId == 0) // No best edge? Skip it. return(false); // Grab the unitig for that best edge. uint32 toID = fr->fragIn(tgtId); Unitig *to = unitigs[toID]; if (to->ufpath.size() == 1) // Joining to something teeny? Don't bother checking further. return(false); if (to->id() == fr->id()) // Join to myself? Nope. return(false); // Grab the read we have an edge to, an compute the overlapping length and left over length. ufNode *tgt = &to->ufpath[to->pathPosition(tgtId)]; bool tgtRev = (tgt->position.end < tgt->position.bgn); // If tgt3p (we overlap to the 3' end) is the same as tgtRev (read is reverse) then the unitig is oriented // correctly. Otherwise, positions need to be reverse-complemented. 
bool toFlip = false; if ((frFirstEnd == true) && (tgt3p == false) && (tgtRev == false)) // source read is at the start, overlap to 5' and the read is forward, need to flip the target unitig toFlip = true; if ((frFirstEnd == true) && (tgt3p == true) && (tgtRev == true)) // source read is at the start, overlap to 3' and the read is reverse, need to flip the target unitig toFlip = true; if ((frFirstEnd == false) && (tgt3p == false) && (tgtRev == true)) // source read is at the end, overlap to 5' and the read is reverse, need to flip the target unitig toFlip = true; if ((frFirstEnd == false) && (tgt3p == true) && (tgtRev == false)) // source read is at the end, overlap to 3' and the read is forward, need to flip the target unitig toFlip = true; uint32 toMin = MIN(tgt->position.bgn, tgt->position.end); uint32 toMax = MAX(tgt->position.bgn, tgt->position.end); uint32 toLen = to->getLength(); uint32 frLen = fr->getLength(); if (toFlip) { toMin = toLen - MAX(tgt->position.bgn, tgt->position.end); toMax = toLen - MIN(tgt->position.bgn, tgt->position.end); } assert(toMin < toMax); // Our two unitigs are of length frLen and toLen. We are appending some portion of 'to' onto // 'fr', and 'discarding' the rest. If the 'discarded' piece is larger than the 'fr' unitig, we // don't want to do the join. // // We err on the side of the discarded piece. uint32 joinLen = 0; uint32 discLen = 0; if (frFirstEnd == true) { joinLen = toMin + frLen; // Prepend the start of 'to' onto 'fr'. discLen = toLen - toMin; } else { joinLen = frLen + toLen - toMax; // Append the end of 'to' onto 'fr'. discLen = toMax; } // If the discard is bigger than us, we do damage by joining. if (discLen > frLen) return(false); // The joined should be much larger and the discarded much smaller. 
uint32 maxLen = MAX(frLen, toLen); uint32 minLen = MIN(frLen, toLen); double joinChange = (double)joinLen / maxLen; double discChange = (double)discLen / minLen; bool isBad = false; if ((joinChange < 1.10) || (0.75 < discChange)) // Bad if we didn't really change sizes. isBad = true; if ((1.0 < joinChange) && (discChange < 0.5)) // But good if discard is tiny. This occurs if we merge a small with a big. The join change // is somewhat small (1.05 say) yet most of the smaller unitig is used. isBad = false; if (isBad) { writeLog("joinUnitigs_examineEnd()-- join unitig %6u (%7ubp) frag %6u %s <-> unitig %6u (%7ubp) frag %6u %s <-> length %5.2f %7u and %5.2f %7u BAD\n", fr->id(), fr->getLength(), frg->ident, (frgRev) ? "rev" : "fwd", to->id(), to->getLength(), tgt->ident, (tgtRev) ? "rev" : "fwd", joinChange, joinLen, discChange, discLen); return(false); } // OK, join. writeLog("joinUnitigs_examineEnd()-- join unitig %6u (%7ubp) frag %6u %s <-> unitig %6u (%7ubp) frag %6u %s <-> length %5.2f %7u and %5.2f %7u\n", fr->id(), fr->getLength(), frg->ident, (frgRev) ? "rev" : "fwd", to->id(), to->getLength(), tgt->ident, (tgtRev) ? "rev" : "fwd", joinChange, joinLen, discChange, discLen); joins.push_back(joinEntry(frg->ident, frFirstEnd, tgt->ident, toFlip, joinLen)); return(true); }
void reportUnitigs(UnitigVector &unitigs, const char *prefix, const char *name, uint64 genomeSize) { // Generate n50. Assumes unitigs have been 'classified' already. vector<uint32> unassembledLength; vector<uint32> bubbleLength; vector<uint32> repeatLength; vector<uint32> circularLength; vector<uint32> contigLength; for (uint32 ti=0; ti<unitigs.size(); ti++) { Unitig *utg = unitigs[ti]; if (utg == NULL) continue; if (utg->_isUnassembled) { unassembledLength.push_back(utg->getLength()); } else if (utg->_isBubble) { bubbleLength.push_back(utg->getLength()); } else if (utg->_isRepeat) { repeatLength.push_back(utg->getLength()); } else if (utg->_isCircular) { circularLength.push_back(utg->getLength()); } else { contigLength.push_back(utg->getLength()); } } char N[FILENAME_MAX]; sprintf(N, "%s.sizes", getLogFilePrefix()); errno = 0; FILE *F = fopen(N, "w"); if (errno == 0) { reportN50(F, unassembledLength, "UNASSEMBLED", genomeSize); reportN50(F, bubbleLength, "BUBBLE", genomeSize); reportN50(F, repeatLength, "REPEAT", genomeSize); reportN50(F, circularLength, "CIRCULAR", genomeSize); reportN50(F, contigLength, "CONTIGS", genomeSize); fclose(F); } if (logFileFlagSet(LOG_INTERMEDIATE_UNITIGS) == 0) return; // Dump to an intermediate store. char tigStorePath[FILENAME_MAX]; sprintf(tigStorePath, "%s.tigStore", getLogFilePrefix()); fprintf(stderr, "Creating intermediate tigStore '%s'\n", tigStorePath); uint32 numFragsT = 0; uint32 numFragsP = 0; uint64 utgLen = 0; // Compute average frags per partition. for (uint32 ti=0; ti<unitigs.size(); ti++) { Unitig *utg = unitigs[ti]; if (utg == NULL) continue; numFragsT += utg->ufpath.size(); if (utg->ufpath.size() > 2) utgLen += utg->getLength(); } if (utgLen < 16 * 1024 * 1024) numFragsP = numFragsT / 7; else if (utgLen < 64 * 1024 * 1024) numFragsP = numFragsT / 63; else numFragsP = numFragsT / 127; // Dump the unitigs to an intermediate store. 
setParentAndHang(unitigs); writeUnitigsToStore(unitigs, tigStorePath, tigStorePath, numFragsP, false); }
// Decides if a unitig is unassembled. The other classifications (isBubble, isCircular, isRepeat) // are made when the type is processed (e.g., when bubbles are popped). // // A unitig is unassembled if: // 1) it has fewer than R reads (R=2) // 2) it is shorter than S bases (S=1000) // 3) a single read spans at least fraction F of the lenth (F=1.0) // 4) at least fraction F of the unitig is below read depth D (F=1.0, D=2) // void classifyUnitigsAsUnassembled(UnitigVector &unitigs, uint32 fewReadsNumber, uint32 tooShortLength, double spanFraction, double lowcovFraction, uint32 lowcovDepth) { uint32 nTooFew = 0; uint32 nShort = 0; uint32 nSingle = 0; uint32 nCoverage = 0; uint32 nContig = 0; uint64 bTooFew = 0; uint64 bShort = 0; uint64 bSingle = 0; uint64 bCoverage = 0; uint64 bContig = 0; char N[FILENAME_MAX]; sprintf(N, "%s.unassembled", getLogFilePrefix()); errno = 0; FILE *F = fopen(N, "w"); if (errno) F = NULL; for (uint32 ti=0; ti<unitigs.size(); ti++) { Unitig *utg = unitigs[ti]; if (utg == NULL) continue; utg->_isUnassembled = false; // Rule 1. Too few reads. if (utg->ufpath.size() < fewReadsNumber) { fprintf(F, "unitig "F_U32" unassembled - too few reads ("F_U64" < "F_U32")\n", ti, utg->ufpath.size(), fewReadsNumber); utg->_isUnassembled = true; nTooFew += 1; bTooFew += utg->getLength(); continue; } // Rule 2. Short. if (utg->getLength() < tooShortLength) { fprintf(F, "unitig "F_U32" unassembled - too short ("F_U32" < "F_U32")\n", ti, utg->getLength(), tooShortLength); utg->_isUnassembled = true; nShort += 1; bShort += utg->getLength(); continue; } // Rule 3. Single read spans large fraction of tig. 
for (uint32 oi=0; oi<utg->ufpath.size(); oi++) { ufNode *frg = &utg->ufpath[oi]; int frgbgn = MIN(frg->position.bgn, frg->position.end); int frgend = MAX(frg->position.bgn, frg->position.end); if (frgend - frgbgn > utg->getLength() * spanFraction) { fprintf(F, "unitig "F_U32" unassembled - single read spans unitig (read "F_U32" "F_U32"-"F_U32" spans fraction %f > %f\n", ti, frg->ident, frg->position.bgn, frg->position.end, (double)(frgend - frgbgn) / utg->getLength(), spanFraction); utg->_isUnassembled = true; nSingle += 1; bSingle += utg->getLength(); break; } } if (utg->_isUnassembled) continue; // Rule 4. Low coverage. intervalList<int32> IL; for (uint32 oi=0; oi<utg->ufpath.size(); oi++) { ufNode *frg = &utg->ufpath[oi]; int frgbgn = MIN(frg->position.bgn, frg->position.end); int frgend = MAX(frg->position.bgn, frg->position.end); IL.add(frgbgn, frgend - frgbgn); } intervalList<int32> ID(IL); uint32 basesLow = 0; uint32 basesHigh = 0; for (uint32 ii=0; ii<ID.numberOfIntervals(); ii++) if (ID.depth(ii) < lowcovDepth) basesLow += ID.hi(ii) - ID.lo(ii) + 1; else basesHigh += ID.hi(ii) - ID.lo(ii) + 1; double lowcov = (double)basesLow / (basesLow + basesHigh); if (lowcov >= lowcovFraction) { fprintf(F, "Unitig "F_U32" unassembled - low coverage (%.4f > %.4f at < "F_U32"x coverage)\n", ti, lowcov, lowcovFraction, lowcovDepth); utg->_isUnassembled = true; nCoverage += 1; bCoverage += utg->getLength(); continue; } // Otherwise, unitig is assembled! 
nContig += 1; bContig += utg->getLength(); } writeStatus("classifyAsUnassembled()-- %6u tigs %11lu bases -- too few reads\n", nTooFew, bTooFew); writeStatus("classifyAsUnassembled()-- %6u tigs %11lu bases -- too short\n", nShort, bShort); writeStatus("classifyAsUnassembled()-- %6u tigs %11lu bases -- single spanning read\n", nSingle, bSingle); writeStatus("classifyAsUnassembled()-- %6u tigs %11lu bases -- low coverage\n", nCoverage, bCoverage); writeStatus("classifyAsUnassembled()-- %6u tigs %11lu bases -- acceptable contigs\n", nContig, bContig); }
//  Find unitigs that might be bubbles: those where, for some other unitig, at least
//  BUBBLE_READ_FRACTION of the non-contained reads have an overlap to that other unitig.
//
//    unitigs          - all unitigs; NULL entries and singletons are skipped.
//    potentialBubbles - output; for each candidate (keyed by its index in 'unitigs') the ids of
//                       the unitigs it might be a bubble of are appended.
//
//  Every candidate is written to the log.
//
void
findPotentialBubbles(UnitigVector &unitigs, BubTargetList &potentialBubbles) {
  uint32  tiLimit      = unitigs.size();
  uint32  tiNumThreads = omp_get_max_threads();

  writeStatus("\n");
  writeStatus("bubbleDetect()-- working on " F_U32 " unitigs, with " F_U32 " threads.\n", tiLimit, tiNumThreads);

  for (uint32 ti=0; ti<tiLimit; ti++) {
    Unitig  *tig = unitigs[ti];

    if ((tig == NULL) ||               //  Not a tig, ignore it.
        (tig->ufpath.size() == 1))     //  Singleton, handled elsewhere.
      continue;

    uint32               nonContainedReads = 0;
    bool                 validBubble       = true;
    map<uint32,uint32>   tigOlapsTo;       //  Target tig id -> number of reads overlapping it.

    uint32               fiLimit = tig->ufpath.size();

    //  Stop scanning as soon as no target has overlaps from enough of the reads seen so far.

    for (uint32 fi=0; (validBubble == true) && (fi<fiLimit); fi++) {
      uint32  rid = tig->ufpath[fi].ident;

      if (OG->isContained(rid) == true)  //  Don't need to check contained reads.  If their container
        continue;                        //  passes the tests below, the contained read will too.

      nonContainedReads++;

      uint32       ovlLen = 0;
      BAToverlap  *ovl    = OC->getOverlaps(rid, AS_MAX_ERATE, ovlLen);

      set<uint32>  readOlapsTo;

      for (uint32 oi=0; oi<ovlLen; oi++) {
        uint32   ovlTigID = Unitig::fragIn(ovl[oi].b_iid);
        Unitig  *ovlTig   = unitigs[ovlTigID];

        //  Skip this overlap if it is to an unplaced read, to a singleton tig, to ourself,
        //  or to a unitig that is shorter than us.  We can not pop this tig as a bubble
        //  in any of those cases.

        if ((ovlTigID == 0) ||
            (ovlTig == NULL) ||
            (ovlTig->ufpath.size() == 1) ||
            (ovlTig->id() == tig->id()) ||
            (ovlTig->getLength() < tig->getLength()))
          continue;

        //  Otherwise, remember that we had an overlap to ovlTig.

        readOlapsTo.insert(ovlTigID);
      }

      //  Transfer the per-read counts to the per-unitig counts:  add one to the counter for each
      //  tig that we have overlaps to.

      for (set<uint32>::iterator it=readOlapsTo.begin(); it != readOlapsTo.end(); ++it)
        tigOlapsTo[*it]++;

      //  Decide if we're a valid potential bubble.  We are still valid if some target tig has
      //  overlaps from at least BUBBLE_READ_FRACTION of the reads seen so far
      //  (nonContainedReads); a fraction is used, instead of requiring every read, to
      //  _attempt_ to allow differences in the bubble.

      validBubble = false;

      for (map<uint32,uint32>::iterator it=tigOlapsTo.begin(); it != tigOlapsTo.end(); ++it)
        if (it->second >= BUBBLE_READ_FRACTION * nonContainedReads)
          validBubble = true;

      //  If we've not seen that many reads, pretend it's a valid bubble.  It'll get screened
      //  out later.

      if (nonContainedReads < 16)
        validBubble = true;
    }

    //  If not validBubble, report.

#if 0
    if (validBubble == false) {
      writeLog("notValidBubble tig %8d expects %6u reads\n", tig->id(), nonContainedReads);

      for (map<uint32,uint32>::iterator it=tigOlapsTo.begin(); it != tigOlapsTo.end(); ++it)
        writeLog(" to tig %8u overlaps %6u\n", it->first, it->second);
    }
#endif

    //  If validBubble, count the target tigs that enough reads overlap.

    uint32  nTigs = 0;

    if (validBubble) {
      for (map<uint32,uint32>::iterator it=tigOlapsTo.begin(); it != tigOlapsTo.end(); ++it)
        if (it->second >= BUBBLE_READ_FRACTION * nonContainedReads)
          nTigs++;
    }

    //  ALWAYS log potential bubbles, and save their targets in potentialBubbles.  Read counts
    //  are cast to uint32 to match the %u conversions.

    if (nTigs > 0) {
      writeLog("\n");
      writeLog("potential bubble tig %8u length %9u nReads %7u to %3u tigs:\n",
               tig->id(), tig->getLength(), (uint32)tig->ufpath.size(), nTigs);

      for (map<uint32,uint32>::iterator it=tigOlapsTo.begin(); it != tigOlapsTo.end(); ++it) {
        if (it->second >= BUBBLE_READ_FRACTION * nonContainedReads) {
          Unitig  *dest = unitigs[it->first];

          writeLog(" tig %8u length %9u nReads %7u\n",
                   dest->id(), dest->getLength(), (uint32)dest->ufpath.size());

          potentialBubbles[ti].push_back(dest->id());
        }
      }
    }
  }

  flushLog();
}
//  Find potential bubbles, place their reads in the candidate target unitigs, then classify each
//  potential bubble from those placements:
//
//    no target holds the reads              - left alone
//    more than one orphan/bubble placement  - marked _isRepeat
//    one placement covering both ends only  - marked _isBubble
//    one placement holding every read       - an orphan: its reads are moved into the target and
//                                             the unitig is deleted (its slot is set to NULL)
//
//    unitigs         - all unitigs; modified as described above, then every non-singleton
//                      unitig is re-sorted.
//    deviationBubble - passed through to findBubbleReadPlacements().
//
void
popBubbles(UnitigVector &unitigs, double deviationBubble) {
  BubTargetList   potentialBubbles;

  findPotentialBubbles(unitigs, potentialBubbles);

  writeStatus("popBubbles()-- Found " F_SIZE_T " potential bubbles.\n", potentialBubbles.size());

  writeLog("\n");
  writeLog("Found " F_SIZE_T " potential bubbles.\n", potentialBubbles.size());
  writeLog("\n");

  vector<overlapPlacement>   *placed = findBubbleReadPlacements(unitigs, potentialBubbles, deviationBubble);

  //  We now have, in 'placed', a list of all the places that each read could be placed.  Decide
  //  if there is a _single_ place for each bubble to be popped.

  uint32  tiLimit = unitigs.size();

  //  Clear flags.

  for (uint32 ti=0; ti<tiLimit; ti++) {
    if (unitigs[ti]) {
      unitigs[ti]->_isBubble = false;
      unitigs[ti]->_isRepeat = false;
    }
  }

  //  Process the placements, one potential bubble at a time.

  for (uint32 ti=0; ti<tiLimit; ti++) {
    if (potentialBubbles.count(ti) == 0)   //  Not a potential bubble
      continue;

    //  Scan the bubble, decide if there are _ANY_ read placements.  Log appropriately.

    Unitig  *bubble        = unitigs[ti];
    bool     hasPlacements = false;

    for (uint32 fi=0; fi<bubble->ufpath.size(); fi++) {
      uint32  readID = bubble->ufpath[fi].ident;

      if (placed[readID].size() > 0)
        hasPlacements = true;
    }

    if (hasPlacements == false)
      writeLog("potential bubble %u had no valid placements (all were not contained in target tig)\n", ti);
    else
      writeLog("potential bubble %u\n", ti);

    //  Split the placements into piles for each target and build an interval list for each
    //  target.  For each read in the tig, convert the vector of placements into interval lists,
    //  one list per target tig.

    map<uint32, intervalList<uint32> *>  targetIntervals;

    for (uint32 fi=0; fi<bubble->ufpath.size(); fi++) {
      uint32  readID = bubble->ufpath[fi].ident;

      for (uint32 pp=0; pp<placed[readID].size(); pp++) {
        uint32  tid = placed[readID][pp].tigID;

        assert(placed[readID][pp].frgID > 0);

        uint32  bgn = (placed[readID][pp].position.bgn < placed[readID][pp].position.end) ? placed[readID][pp].position.bgn : placed[readID][pp].position.end;
        uint32  end = (placed[readID][pp].position.bgn < placed[readID][pp].position.end) ? placed[readID][pp].position.end : placed[readID][pp].position.bgn;

        if (targetIntervals[tid] == NULL)
          targetIntervals[tid] = new intervalList<uint32>;

        targetIntervals[tid]->add(bgn, end-bgn);
      }
    }

    vector<candidatePop *>  targets;

    //  Squish the intervals.  Create new candidatePops for each interval that isn't too big or
    //  small.  Assign each overlapPlacements to the correct candidatePop.

    for (map<uint32, intervalList<uint32> *>::iterator it=targetIntervals.begin(); it != targetIntervals.end(); ++it) {
      uint32                 targetID = it->first;
      intervalList<uint32>  *IL       = it->second;

      IL->merge();

      //  Discard intervals that are significantly too small or large.  Save the ones that are
      //  nicely sized.  Logging here isn't terribly useful, it's just repeated (out of order)
      //  later when we try to make sense of the read alignments.

      for (uint32 ii=0; ii<IL->numberOfIntervals(); ii++) {
        if ((IL->hi(ii) - IL->lo(ii) < 0.75 * bubble->getLength()) ||   //  Too small!
            (1.25 * bubble->getLength() < IL->hi(ii) - IL->lo(ii))) {   //  Too big!
          writeLog("tig %8u length %9u -> target %8u piece %2u position %9u-%9u length %8u - size mismatch, discarded\n",
                   bubble->id(), bubble->getLength(),
                   targetID, ii, IL->lo(ii), IL->hi(ii), IL->hi(ii) - IL->lo(ii));
          continue;
        }

        writeLog("tig %8u length %9u -> target %8u piece %2u position %9u-%9u length %8u\n",
                 bubble->id(), bubble->getLength(),
                 targetID, ii, IL->lo(ii), IL->hi(ii), IL->hi(ii) - IL->lo(ii));

        targets.push_back(new candidatePop(bubble, unitigs[targetID], IL->lo(ii), IL->hi(ii)));
      }

      delete IL;
    }

    targetIntervals.clear();

    //  If no targets, nothing to do (and nothing to release).

    if (targets.size() == 0)
      continue;

    //  Run through the placements again, and assign them to the correct target.
    //
    //  For each read:
    //  For each acceptable placement:
    //  For each target location:
    //  If the placement is for this target, save it.

    for (uint32 fi=0; fi<bubble->ufpath.size(); fi++) {
      uint32  readID = bubble->ufpath[fi].ident;

      for (uint32 pp=0; pp<placed[readID].size(); pp++) {
        uint32  tid = placed[readID][pp].tigID;

        uint32  bgn = (placed[readID][pp].position.bgn < placed[readID][pp].position.end) ? placed[readID][pp].position.bgn : placed[readID][pp].position.end;
        uint32  end = (placed[readID][pp].position.bgn < placed[readID][pp].position.end) ? placed[readID][pp].position.end : placed[readID][pp].position.bgn;

        for (uint32 tt=0; tt<targets.size(); tt++)
          if ((targets[tt]->target->id() == tid) &&
              (targets[tt]->bgn < end) && (bgn < targets[tt]->end))
            targets[tt]->placed.push_back(placed[readID][pp]);
      }
    }

    //  Count the number of targets that have all the reads (later: in the correct order, etc,
    //  etc).

    set<uint32>  tigReads;  //  Reads in the bubble tig.
    set<uint32>  tgtReads;  //  Reads in the bubble that have a placement in the target.

    //  Remove duplicate placements from each target.

    for (uint32 tt=0; tt<targets.size(); tt++) {
      candidatePop  *t = targets[tt];

      //  Detect duplicates, keep the one with lower error.  There are a lot of duplicate
      //  placements, logging isn't terribly useful.  A discarded placement is overwritten with
      //  a default overlapPlacement; the frgID == 0 tests below treat such entries as empty.

      for (uint32 aa=0; aa<t->placed.size(); aa++) {
        for (uint32 bb=0; bb<t->placed.size(); bb++) {
          if ((aa == bb) ||
              (t->placed[aa].frgID != t->placed[bb].frgID) ||
              (t->placed[aa].frgID == 0) ||
              (t->placed[bb].frgID == 0))
            continue;

          if (t->placed[aa].errors / t->placed[aa].aligned < t->placed[bb].errors / t->placed[bb].aligned) {
#ifdef SHOW_MULTIPLE_PLACEMENTS
            writeLog("duplicate read alignment for tig %u read %u - better %u-%u %.4f - worse %u-%u %.4f\n",
                     t->placed[aa].tigID, t->placed[aa].frgID,
                     t->placed[aa].position.bgn, t->placed[aa].position.end, t->placed[aa].errors / t->placed[aa].aligned,
                     t->placed[bb].position.bgn, t->placed[bb].position.end, t->placed[bb].errors / t->placed[bb].aligned);
#endif
            t->placed[bb] = overlapPlacement();
          } else {
#ifdef SHOW_MULTIPLE_PLACEMENTS
            writeLog("duplicate read alignment for tig %u read %u - better %u-%u %.4f - worse %u-%u %.4f\n",
                     t->placed[aa].tigID, t->placed[aa].frgID,
                     t->placed[bb].position.bgn, t->placed[bb].position.end, t->placed[bb].errors / t->placed[bb].aligned,
                     t->placed[aa].position.bgn, t->placed[aa].position.end, t->placed[aa].errors / t->placed[aa].aligned);
#endif
            t->placed[aa] = overlapPlacement();
          }
        }
      }

      //  Get rid of any now-empty entries.  Scanning from the back means the element swapped
      //  in from the end has already been checked.

      for (uint32 aa=t->placed.size(); aa--; ) {
        if (t->placed[aa].frgID == 0) {
          t->placed[aa] = t->placed.back();
          t->placed.pop_back();
        }
      }
    }

    //  Make a set of the reads in the bubble.  It is only used for the expected read count in
    //  the log message below.

    for (uint32 fi=0; fi<bubble->ufpath.size(); fi++)
      tigReads.insert(bubble->ufpath[fi].ident);

    uint32  nOrphan      = 0;   //  Targets holding every read of the bubble; it can be popped.
    uint32  orphanTarget = 0;

    uint32  nBubble      = 0;   //  Targets holding reads from both ends only; cannot be popped.

    for (uint32 tt=0; tt<targets.size(); tt++) {
      tgtReads.clear();

      for (uint32 op=0; op<targets[tt]->placed.size(); op++) {
        if (logFileFlagSet(LOG_BUBBLE_DETAIL))
          writeLog("tig %8u length %9u -> target %8u piece %2u position %9u-%9u length %8u - read %7u at %9u-%9u\n",
                   bubble->id(), bubble->getLength(),
                   targets[tt]->target->id(), tt, targets[tt]->bgn, targets[tt]->end, targets[tt]->end - targets[tt]->bgn,
                   targets[tt]->placed[op].frgID,
                   targets[tt]->placed[op].position.bgn, targets[tt]->placed[op].position.end);

        assert(targets[tt]->placed[op].frgID > 0);

        tgtReads.insert(targets[tt]->placed[op].frgID);
      }

      //  Count the number of consecutive reads from the 5' or 3' end of the bubble that are
      //  placed in the target.
      //
      //  Also, count the number of reads in the bubble that are placed in the target.  Likely
      //  the same as n5 + n3.

      uint32  n5 = 0;
      uint32  n3 = 0;
      uint32  nt = 0;

      for (uint32 fi=0; fi<bubble->ufpath.size(); fi++)
        if (tgtReads.count(bubble->ufpath[fi].ident) > 0)
          n5++;
        else
          break;

      for (uint32 fi=bubble->ufpath.size(); fi-->0; )
        if (tgtReads.count(bubble->ufpath[fi].ident) > 0)
          n3++;
        else
          break;

      for (uint32 fi=0; fi<bubble->ufpath.size(); fi++)
        if (tgtReads.count(bubble->ufpath[fi].ident) > 0)
          nt++;

      //  Report every target that will be counted as an orphan or bubble placement below.

      if ((nt == bubble->ufpath.size()) ||
          ((n5 > 0) && (n3 > 0)))
        writeLog("tig %8u length %9u -> target %8u piece %2u position %9u-%9u length %8u - expected %3" F_SIZE_TP " reads, had %3" F_SIZE_TP " reads. n5=%3u n3=%3u nt=%3u\n",
                 bubble->id(), bubble->getLength(),
                 targets[tt]->target->id(), tt, targets[tt]->bgn, targets[tt]->end, targets[tt]->end - targets[tt]->bgn,
                 tigReads.size(),
                 tgtReads.size(), n5, n3, nt);

      //  Decide if this is a bubble, orphan from construction, or repeat.

      if (nt == bubble->ufpath.size()) {
        nOrphan++;
        orphanTarget = tt;
      }

      else if ((n5 > 0) && (n3 > 0)) {
        nBubble++;
      }
    }

    //  Act on the classification.  This is one if/else chain, instead of a series of early
    //  'continue's, so the candidatePops are released below on every path.  Read counts are
    //  cast to uint32 to match the %u conversions.

    if (nOrphan + nBubble == 0) {
      //  No placements, nothing to do.
    }

    else if (nOrphan + nBubble > 1) {
      //  Multiple orphan and/or bubble placements, it's a repeat.
      writeLog("tig %8u length %8u reads %6u - repeat - %u orphan %u bubble placements.\n",
               bubble->id(), bubble->getLength(), (uint32)bubble->ufpath.size(),
               nOrphan, nBubble);
      writeLog("\n");
      bubble->_isRepeat = true;
    }

    else if (nBubble > 0) {
      //  A bubble placement, mark it as a bubble so it can be skipped during repeat detection.
      writeLog("tig %8u length %8u reads %6u - bubble\n",
               bubble->id(), bubble->getLength(), (uint32)bubble->ufpath.size());
      writeLog("\n");
      bubble->_isBubble = true;
    }

    else {
      //  Otherwise, it's an orphan, move the reads to the proper place.
      writeLog("tig %8u length %8u reads %6u - orphan\n",
               bubble->id(), bubble->getLength(), (uint32)bubble->ufpath.size());

      for (uint32 op=0, tt=orphanTarget; op<targets[tt]->placed.size(); op++) {
        ufNode  frg;

        frg.ident        = targets[tt]->placed[op].frgID;
        frg.contained    = 0;
        frg.parent       = 0;
        frg.ahang        = 0;
        frg.bhang        = 0;
        frg.position.bgn = targets[tt]->placed[op].position.bgn;
        frg.position.end = targets[tt]->placed[op].position.end;

        writeLog("move read %u from tig %u to tig %u %u-%u\n",
                 frg.ident,
                 bubble->id(),
                 targets[tt]->target->id(), frg.position.bgn, frg.position.end);

        targets[tt]->target->addFrag(frg, 0, false);
      }

      writeLog("\n");

      unitigs[bubble->id()] = NULL;
      delete bubble;
    }

    //  Clean up the targets list.

    for (uint32 tt=0; tt<targets.size(); tt++)
      delete targets[tt];

    targets.clear();
  }  //  Over all bubbles

  writeLog("\n");   //  Needed if no bubbles are popped.

  delete [] placed;

  //  Sort reads in all the tigs.  Overkill, but correct.

  for (uint32 ti=0; ti<tiLimit; ti++) {
    Unitig  *tig = unitigs[ti];

    if ((tig == NULL) ||               //  Not a tig, ignore it.
        (tig->ufpath.size() == 1))     //  Singleton, already sorted.
      continue;

    tig->sort();
  }
}
void markRepeatReads(UnitigVector &unitigs, double deviationRepeat, uint32 confusedAbsolute, double confusedPercent) { uint32 tiLimit = unitigs.size(); uint32 numThreads = omp_get_max_threads(); uint32 blockSize = (tiLimit < 100000 * numThreads) ? numThreads : tiLimit / 99999; writeLog("repeatDetect()-- working on "F_U32" unitigs, with "F_U32" threads.\n", tiLimit, numThreads); vector<olapDat> repeatOlaps; // Overlaps to reads promoted to tig coords intervalList<int32> tigMarksR; // Marked repeats based on reads, filtered by spanning reads intervalList<int32> tigMarksU; // Non-repeat invervals, just the inversion of tigMarksR for (uint32 ti=0; ti<tiLimit; ti++) { Unitig *tig = unitigs[ti]; if (tig == NULL) continue; if (tig->ufpath.size() == 1) continue; vector<olapDat> repeats; writeLog("Annotating repeats in reads for tig %u/%u.\n", ti, tiLimit); // Clear out all the existing marks. They're not for this tig. // Analyze overlaps for each read. For each overlap to a read not in this tig, or not // overlapping in this tig, and of acceptable error rate, add the overlap to repeatOlaps. repeatOlaps.clear(); uint32 fiLimit = tig->ufpath.size(); uint32 numThreads = omp_get_max_threads(); uint32 blockSize = (fiLimit < 100 * numThreads) ? numThreads : fiLimit / 99; #pragma omp parallel for if(fiLimit > 100) schedule(dynamic, blockSize) for (uint32 fi=0; fi<fiLimit; fi++) annotateRepeatsOnRead(unitigs, tig, &tig->ufpath[fi], deviationRepeat, repeatOlaps); writeLog("Annotated with %lu overlaps.\n", repeatOlaps.size()); // Merge marks for the same read into the largest possible. 
sort(repeatOlaps.begin(), repeatOlaps.end(), olapDatByEviRid); #ifdef SHOW_ANNOTATE for (uint32 ii=0; ii<repeatOlaps.size(); ii++) if (repeatOlaps[ii].tigbgn < 1000000) writeLog("repeatOlaps[%u] %u-%u from tig %u read %u RAW\n", ii, repeatOlaps[ii].tigbgn, repeatOlaps[ii].tigend, repeatOlaps[ii].eviTid, repeatOlaps[ii].eviRid); flushLog(); #endif for (uint32 dd=0, ss=1; ss<repeatOlaps.size(); ss++) { assert(repeatOlaps[dd].eviRid <= repeatOlaps[ss].eviRid); // If different evidence reads, close the destination olap, set up // for a new destination. if (repeatOlaps[dd].eviRid != repeatOlaps[ss].eviRid) { dd = ss; continue; } // If the destination ends before the source begins, there is no overlap between the // two regions. Close dd, set up for a new dd. if (repeatOlaps[dd].tigend <= repeatOlaps[ss].tigbgn) { dd = ss; continue; } // Otherwise, there must be an overlap. Extend the destination region, erase the source // region. repeatOlaps[dd].tigbgn = min(repeatOlaps[ss].tigbgn, repeatOlaps[dd].tigbgn); repeatOlaps[dd].tigend = max(repeatOlaps[ss].tigend, repeatOlaps[dd].tigend); repeatOlaps[ss].tigbgn = UINT32_MAX; repeatOlaps[ss].tigend = UINT32_MAX; repeatOlaps[ss].eviTid = UINT32_MAX; repeatOlaps[ss].eviRid = UINT32_MAX; } // Sort overlaps again. This pushes all those 'erased' regions to the end of the list, which // we can then just pop off. sort(repeatOlaps.begin(), repeatOlaps.end(), olapDatByEviRid); for (uint32 ii=repeatOlaps.size(); ii--; ) if (repeatOlaps[ii].eviTid == UINT32_MAX) repeatOlaps.pop_back(); // For logging, sort by coordinate sort(repeatOlaps.begin(), repeatOlaps.end()); #ifdef SHOW_ANNOTATE for (uint32 ii=0; ii<repeatOlaps.size(); ii++) writeLog("repeatOlaps[%d] %u-%u from tig %u read %u MERGED\n", ii, repeatOlaps[ii].tigbgn, repeatOlaps[ii].tigend, repeatOlaps[ii].eviTid, repeatOlaps[ii].eviRid); #endif // Make a new set of intervals based on all the detected repeats. 
tigMarksR.clear(); for (uint32 bb=0, ii=0; ii<repeatOlaps.size(); ii++) tigMarksR.add(repeatOlaps[ii].tigbgn, repeatOlaps[ii].tigend - repeatOlaps[ii].tigbgn); // Collapse these markings Collapse all the read markings to intervals on the unitig, merging those that overlap // significantly. writeLog("Merge marks.\n"); tigMarksR.merge(REPEAT_OVERLAP_MIN); // Scan reads, discard any mark that is contained in a read // // We don't need to filterShort() after every one is removed, but it's simpler to do it Right Now than // to track if it is needed. writeLog("Scan reads to discard spanned repeats.\n"); for (uint32 fi=0; fi<tig->ufpath.size(); fi++) { ufNode *frg = &tig->ufpath[fi]; bool frgfwd = (frg->position.bgn < frg->position.end); int32 frglo = (frgfwd) ? frg->position.bgn : frg->position.end; int32 frghi = (frgfwd) ? frg->position.end : frg->position.bgn; bool discarded = false; for (uint32 ri=0; ri<tigMarksR.numberOfIntervals(); ri++) { bool spanLo = false; bool spanHi = false; // The decision of 'spanned by a read' is broken into two pieces: does the read span the // lower (higher) boundary of the region. To be spanned, the boundary needs to be spanned // by at least MIN_ANCHOR_HANG additional bases (to anchor the read to non-repeat // sequence). // // This is a problem at the start/end of the tig, beacuse no read will extend past the // start/end of the tig. Instead, if the repeat is contained within the first (last) read // with no extension at the respective end, it is spanned. 
if ((frglo == 0) && // Read at start of tig, spans off the high end (tigMarksR.hi(ri) + MIN_ANCHOR_HANG <= frghi)) spanLo = spanHi = true; if ((frghi == tig->getLength()) && // Read at end of tig, spans off the low end (frglo + MIN_ANCHOR_HANG <= tigMarksR.lo(ri))) spanLo = spanHi = true; if (frglo + MIN_ANCHOR_HANG <= tigMarksR.lo(ri)) // Read spanned off the low end spanLo = true; if (tigMarksR.hi(ri) + MIN_ANCHOR_HANG <= frghi) // Read spanned off the high end spanHi = true; if (spanLo && spanHi) { writeLog("discard region %8d:%-8d - contained in read %6u %8d-%8d\n", tigMarksR.lo(ri), tigMarksR.hi(ri), frg->ident, frglo, frghi); tigMarksR.lo(ri) = 0; tigMarksR.hi(ri) = 0; discarded = true; } } if (discarded) tigMarksR.filterShort(1); } // Run through again, looking for the thickest overlap(s) to the remaining regions. // This isn't caring about the end effect noted above. #if 1 writeLog("thickest edges to the repeat regions:\n"); for (uint32 ri=0; ri<tigMarksR.numberOfIntervals(); ri++) { uint32 t5 = UINT32_MAX, l5 = 0, t5bgn, t5end; uint32 t3 = UINT32_MAX, l3 = 0, t3bgn, t3end; for (uint32 fi=0; fi<tig->ufpath.size(); fi++) { ufNode *frg = &tig->ufpath[fi]; bool frgfwd = (frg->position.bgn < frg->position.end); int32 frglo = (frgfwd) ? frg->position.bgn : frg->position.end; int32 frghi = (frgfwd) ? frg->position.end : frg->position.bgn; bool discarded = false; // Overlap off the 5' end of the region. if (frglo <= tigMarksR.lo(ri) && (tigMarksR.lo(ri) <= frghi)) { uint32 olap = frghi - tigMarksR.lo(ri); if (l5 < olap) { l5 = olap; t5 = fi; t5bgn = frglo; // Easier than recomputing it later on... t5end = frghi; } } // Overlap off the 3' end of the region. 
if (frglo <= tigMarksR.hi(ri) && (tigMarksR.hi(ri) <= frghi)) { uint32 olap = tigMarksR.hi(ri) - frglo; if (l3 < olap) { l3 = olap; t3 = fi; t3bgn = frglo; t3end = frghi; } } if (frglo <= tigMarksR.lo(ri) && (tigMarksR.hi(ri) <= frghi)) { writeLog("saved region %8d:%-8d - closest read %6u (%+6d) %8d:%-8d (%+6d) (contained)\n", tigMarksR.lo(ri), tigMarksR.hi(ri), frg->ident, tigMarksR.lo(ri) - frglo, frglo, frghi, frghi - tigMarksR.hi(ri)); } } if (t5 != UINT32_MAX) writeLog("saved region %8d:%-8d - closest 5' read %6u (%+6d) %8d:%-8d (%+6d)\n", tigMarksR.lo(ri), tigMarksR.hi(ri), tig->ufpath[t5].ident, tigMarksR.lo(ri) - t5bgn, t5bgn, t5end, t5end - tigMarksR.hi(ri)); if (t3 != UINT32_MAX) writeLog("saved region %8d:%-8d - closest 3' read %6u (%+6d) %8d:%-8d (%+6d)\n", tigMarksR.lo(ri), tigMarksR.hi(ri), tig->ufpath[t3].ident, tigMarksR.lo(ri) - t3bgn, t3bgn, t3end, t3end - tigMarksR.hi(ri)); } #endif // Scan reads. If a read intersects a repeat interval, and the best edge for that read // is entirely in the repeat region, decide if there is a near-best edge to something // not in this tig. // // A region with no such near-best edges is _probably_ correct. writeLog("search for confused edges:\n"); uint32 *isConfused = new uint32 [tigMarksR.numberOfIntervals()]; memset(isConfused, 0, sizeof(uint32) * tigMarksR.numberOfIntervals()); for (uint32 fi=0; fi<tig->ufpath.size(); fi++) { ufNode *rdA = &tig->ufpath[fi]; uint32 rdAid = rdA->ident; bool rdAfwd = (rdA->position.bgn < rdA->position.end); int32 rdAlo = (rdAfwd) ? rdA->position.bgn : rdA->position.end; int32 rdAhi = (rdAfwd) ? 
rdA->position.end : rdA->position.bgn; double sc = (rdAhi - rdAlo) / (double)FI->fragmentLength(rdAid); if ((OG->isContained(rdAid) == true) || (OG->isSuspicious(rdAid) == true)) continue; for (uint32 ri=0; ri<tigMarksR.numberOfIntervals(); ri++) { uint32 rMin = tigMarksR.lo(ri); uint32 rMax = tigMarksR.hi(ri); if ((rdAhi < rMin) || // Read ends before the region (rMax < rdAlo)) // Read starts after the region continue; // -> don't care about this read! // Compute the position (in the tig) of the best overlaps. int32 tig5bgn=0, tig5end=0; int32 tig3bgn=0, tig3end=0; // Instead of using the best edge - which might not be the edge used in the unitig - // we need to scan the layout to return the previous/next dovetail // Put this in a function - what to return if no best overlap? BestEdgeOverlap *b5 = OG->getBestEdgeOverlap(rdAid, false); BestEdgeOverlap *b3 = OG->getBestEdgeOverlap(rdAid, true); // If the best edge is to a read not in this tig, there is nothing to compare against. // Is this confused by default? Possibly. The unitig was constructed somehow, and that // must then be the edge coming into us. We'll pick it up later. bool b5use = true; bool b3use = true; if (b5->fragId() == 0) b5use = false; if (b3->fragId() == 0) b3use = false; if ((b5use) && (Unitig::fragIn(b5->fragId()) != tig->id())) b5use = false; if ((b3use) && (Unitig::fragIn(b3->fragId()) != tig->id())) b3use = false; // The best edge read is in this tig. If they don't overlap, again, nothing to compare // against. if (b5use) { ufNode *rdB = &tig->ufpath[Unitig::pathPosition(b5->fragId())]; uint32 rdBid = rdB->ident; bool rdBfwd = (rdB->position.bgn < rdB->position.end); int32 rdBlo = (rdBfwd) ? rdB->position.bgn : rdB->position.end; int32 rdBhi = (rdBfwd) ? 
rdB->position.end : rdB->position.bgn; if ((rdAhi < rdBlo) || (rdBhi < rdAlo)) b5use = false; } if (b3use) { ufNode *rdB = &tig->ufpath[Unitig::pathPosition(b3->fragId())]; uint32 rdBid = rdB->ident; bool rdBfwd = (rdB->position.bgn < rdB->position.end); int32 rdBlo = (rdBfwd) ? rdB->position.bgn : rdB->position.end; int32 rdBhi = (rdBfwd) ? rdB->position.end : rdB->position.bgn; if ((rdAhi < rdBlo) || (rdBhi < rdAlo)) b3use = false; } // If we can use this edge, compute the placement of the overlap on the unitig. // Call #1; if (b5use) { int32 bgn=0, end=0; olapToReadCoords(rdA, b5->ahang(), b5->bhang(), bgn, end); tig5bgn = (rdAfwd) ? (rdAlo + sc * bgn) : (rdAhi - sc * end); tig5end = (rdAfwd) ? (rdAlo + sc * end) : (rdAhi - sc * bgn); assert(tig5bgn < tig5end); if (tig5bgn < 0) tig5bgn = 0; if (tig5end > tig->getLength()) tig5end = tig->getLength(); } // Call #2 if (b3use) { int32 bgn=0, end=0; olapToReadCoords(rdA, b3->ahang(), b3->bhang(), bgn, end); tig3bgn = (rdAfwd) ? (rdAlo + sc * bgn) : (rdAhi - sc * end); tig3end = (rdAfwd) ? (rdAlo + sc * end) : (rdAhi - sc * bgn); assert(tig3bgn < tig3end); if (tig3bgn < 0) tig3bgn = 0; if (tig3end > tig->getLength()) tig3end = tig->getLength(); } // If either of the 5' or 3' overlaps (or both!) are in the repeat region, we need to check for // close overlaps on that end. uint32 len5 = 0; uint32 len3 = 0; if ((rMin < tig5bgn) && (tig5end < rMax) && (b5use)) len5 = FI->overlapLength(rdAid, b5->fragId(), b5->ahang(), b5->bhang()); else b5use = false; if ((rMin < tig3bgn) && (tig3end < rMax) && (b3use)) len3 = FI->overlapLength(rdAid, b3->fragId(), b3->ahang(), b3->bhang()); else b3use = false; double score5 = len5 * (1 - b5->erate()); double score3 = len3 * (1 - b3->erate()); // Neither of the best edges are in the repeat region; move to the next region and/or read. if (len5 + len3 == 0) continue; // At least one of the best edge overlaps is in the repeat region. 
Scan for other edges // that are of comparable length and quality. uint32 ovlLen = 0; BAToverlap *ovl = OC->getOverlaps(rdAid, AS_MAX_ERATE, ovlLen); for (uint32 oo=0; oo<ovlLen; oo++) { uint32 rdBid = ovl[oo].b_iid; uint32 tgBid = Unitig::fragIn(rdBid); // If the read is in a singleton, skip. These are unassembled crud. if ((tgBid == 0) || (unitigs[tgBid] == NULL) || (unitigs[tgBid]->ufpath.size() == 1)) continue; // If the read is in an annotated bubble, skip. if (unitigs[tgBid]->_isBubble) continue; // Skip if this overlap is the best we're trying to match. if ((rdBid == b5->fragId()) || (rdBid == b3->fragId())) continue; // Skip if this overlap is crappy quality if (OG->isOverlapBadQuality(ovl[oo])) continue; // Skip if the read is contained or suspicious. if ((OG->isContained(rdBid) == true) || (OG->isSuspicious(rdBid) == true)) continue; // Skip if the overlap isn't dovetail. bool ovl5 = ovl[oo].AEndIs5prime(); bool ovl3 = ovl[oo].AEndIs3prime(); if ((ovl5 == false) && (ovl3 == false)) continue; // Skip if we're not using this overlap if ((ovl5 == true) && (b5use == false)) continue; if ((ovl3 == true) && (b3use == false)) continue; uint32 rdBpos = unitigs[tgBid]->pathPosition(rdBid); ufNode *rdB = &unitigs[tgBid]->ufpath[rdBpos]; bool rdBfwd = (rdB->position.bgn < rdB->position.end); int32 rdBlo = (rdBfwd) ? rdB->position.bgn : rdB->position.end; int32 rdBhi = (rdBfwd) ? rdB->position.end : rdB->position.bgn; // If the overlap is to a read in a different tig, or // the overlap is to a read in the same tig, but we don't overlap in the tig, check lengths. // Otherwise, the overlap is present in the tig, and can't be confused. if ((tgBid == tig->id()) && (rdBlo <= rdAhi) && (rdAlo <= rdBhi)) continue; uint32 len = FI->overlapLength(rdAid, ovl[oo].b_iid, ovl[oo].a_hang, ovl[oo].b_hang); double score = len * (1 - ovl[oo].erate); // Compute percent difference. 
double ad5 = fabs(score - score5); double ad3 = fabs(score - score3); double pd5 = 200 * ad5 / (score + score5); double pd3 = 200 * ad3 / (score + score3); // Skip if this overlap is vastly worse than the best. if ((ovl5 == true) && ((ad5 >= confusedAbsolute) || (pd3 > confusedPercent))) { writeLog("tig %7u read %8u pos %7u-%-7u NOT confused by 5' edge to read %8u - best edge read %8u len %6u erate %.4f score %8.2f - alt edge len %6u erate %.4f score %8.2f - absdiff %8.2f percdiff %8.4f\n", tig->id(), rdAid, rdAlo, rdAhi, rdBid, b5->fragId(), len5, b5->erate(), score5, len, ovl[oo].erate, score, ad5, pd5); continue; } if ((ovl3 == true) && ((ad3 >= confusedAbsolute) || (pd3 > confusedPercent))) { writeLog("tig %7u read %8u pos %7u-%-7u NOT confused by 3' edge to read %8u - best edge read %8u len %6u erate %.4f score %8.2f - alt edge len %6u erate %.4f score %8.2f - absdiff %8.2f percdiff %8.4f\n", tig->id(), rdAid, rdAlo, rdAhi, rdBid, b3->fragId(), len3, b3->erate(), score3, len, ovl[oo].erate, score, ad3, pd3); continue; } // Potential confusion! if (ovl5 == true) writeLog("tig %7u read %8u pos %7u-%-7u IS confused by 5' edge to read %8u - best edge read %8u len %6u erate %.4f score %8.2f - alt edge len %6u erate %.4f score %8.2f - absdiff %8.2f percdiff %8.4f\n", tig->id(), rdAid, rdAlo, rdAhi, rdBid, b5->fragId(), len5, b5->erate(), score5, len, ovl[oo].erate, score, ad5, pd5); if (ovl3 == true) writeLog("tig %7u read %8u pos %7u-%-7u IS confused by 3' edge to read %8u - best edge read %8u len %6u erate %.4f score %8.2f - alt edge len %6u erate %.4f score %8.2f - absdiff %8.2f percdiff %8.4f\n", tig->id(), rdAid, rdAlo, rdAhi, rdBid, b3->fragId(), len3, b3->erate(), score3, len, ovl[oo].erate, score, ad3, pd3); isConfused[ri]++; } } // Over all marks (ri) } // Over all reads (fi) // Scan all the regions, and delete any that have no confusion. 
{ bool discarded = false; for (uint32 ri=0; ri<tigMarksR.numberOfIntervals(); ri++) { if (isConfused[ri] == 0) { writeLog("discard region %8d:%-8d - no confusion in best edges\n", tigMarksR.lo(ri), tigMarksR.hi(ri)); tigMarksR.lo(ri) = 0; tigMarksR.hi(ri) = 0; discarded = true; } else { writeLog("saved region %8d:%-8d - %u best edges are potentially confused\n", tigMarksR.lo(ri), tigMarksR.hi(ri), isConfused[ri]); } } if (discarded) tigMarksR.filterShort(1); } delete [] isConfused; // Scan reads, join any marks that have their junctions spanned by a sufficiently large amount. // // If the read spans this junction be the usual amount, merge the intervals. // // The intervals can be overlapping (by up to REPEAT_OVERLAP_MIN (x2?) bases. For this junction // to be spanned, the read must span from min-ROM to max+ROM, not just hi(ri-1) to lo(ri). // // We DO need to filterShort() after every merge, otherwise, we'd have an empty bogus interval // in the middle of our list, which could be preventing some other merge. OK, we could // // Anything that gets merged is now no longer a true repeat. It's unique, just bordered by repeats. // We can't track this through the indices (because we delete things). We track it with a set of // begin coordinates. set<int32> nonRepeatIntervals; writeLog("Scan reads to merge repeat regions.\n"); for (uint32 fi=0; fi<tig->ufpath.size(); fi++) { ufNode *frg = &tig->ufpath[fi]; bool frgfwd = (frg->position.bgn < frg->position.end); int32 frglo = (frgfwd) ? frg->position.bgn : frg->position.end; int32 frghi = (frgfwd) ? 
frg->position.end : frg->position.bgn; bool merged = false; for (uint32 ri=1; ri<tigMarksR.numberOfIntervals(); ri++) { uint32 rMin = min(tigMarksR.hi(ri-1), tigMarksR.lo(ri)); uint32 rMax = max(tigMarksR.hi(ri-1), tigMarksR.lo(ri)); if ((frglo + MIN_ANCHOR_HANG <= rMin) && (rMax + MIN_ANCHOR_HANG <= frghi)) { writeLog("merge regions %8d:%-8d and %8d:%-8d - junction contained in read %6u %5d-%5d\n", tigMarksR.lo(ri-1), tigMarksR.hi(ri-1), tigMarksR.lo(ri), tigMarksR.hi(ri), frg->ident, frglo, frghi); tigMarksR.lo(ri) = tigMarksR.lo(ri-1); tigMarksR.lo(ri-1) = 0; // CRITICAL to delete this interval (and not ri) because the next tigMarksR.hi(ri-1) = 0; // iteration will be using ri-1 (== ri here) and ri (== ri+1). merged = true; nonRepeatIntervals.insert(tigMarksR.lo(ri)); } } if (merged) tigMarksR.filterShort(1); } // Extend the regions by MIN_ANCHOR_HANG. This makes checking for reads that span and are // anchored in the next region easier. It also solved a quirk when the first/last repeat // region doesn't extend to the end of the sequence: // 0-183 unique (created from inversion below, but useless and incorrect) // 183-9942 repeat for (uint32 ii=0; ii<tigMarksR.numberOfIntervals(); ii++) { tigMarksR.lo(ii) = max<int32>(tigMarksR.lo(ii) - MIN_ANCHOR_HANG, 0); tigMarksR.hi(ii) = min<int32>(tigMarksR.hi(ii) + MIN_ANCHOR_HANG, tig->getLength()); } // Find the non-repeat intervals. tigMarksU = tigMarksR; tigMarksU.invert(0, tig->getLength()); // Create the list of intervals we'll use to make new unitigs. // // The repeat intervals are extended by MIN_ANCHOR_HANG, and then any read fully contained in one of // these is moved here. // // The non-repeat intervals are shortened by the same amount, and any read that intersects one // is moved there. // // Does order matter? Not sure. The repeat intervals are first, then the formerly repeat // merged intervals, then the unique intervals. Splitting might depend on the repeats being // first. 
writeLog("Make breakpoints.\n"); vector<breakPointCoords> BP; for (uint32 ii=0; ii<tigMarksR.numberOfIntervals(); ii++) if (nonRepeatIntervals.count(tigMarksR.lo(ii)) == 0) BP.push_back(breakPointCoords(ti, tigMarksR.lo(ii), tigMarksR.hi(ii), true)); for (uint32 ii=0; ii<tigMarksR.numberOfIntervals(); ii++) if (nonRepeatIntervals.count(tigMarksR.lo(ii)) != 0) BP.push_back(breakPointCoords(ti, tigMarksR.lo(ii), tigMarksR.hi(ii), true)); for (uint32 ii=0; ii<tigMarksU.numberOfIntervals(); ii++) { BP.push_back(breakPointCoords(ti, tigMarksU.lo(ii), tigMarksU.hi(ii), false)); } // If only one region, the whole unitig was declared repeat. Nothing to do. if (BP.size() == 1) continue; sort(BP.begin(), BP.end()); // Report. writeLog("break tig %u into up to %u pieces:\n", ti, BP.size()); for (uint32 ii=0; ii<BP.size(); ii++) writeLog(" %8d %8d %s (length %d)\n", BP[ii]._bgn, BP[ii]._end, BP[ii]._isRepeat ? "repeat" : "unique", BP[ii]._end - BP[ii]._bgn); // Scan the reads, counting the number of reads that would be placed in each new tig. This is done // because there are a few 'splits' that don't move any reads around. Unitig **newTigs = new Unitig * [BP.size()]; int32 *lowCoord = new int32 [BP.size()]; uint32 *nRepeat = new uint32 [BP.size()]; uint32 *nUnique = new uint32 [BP.size()]; // First call, count the number of tigs we would create if we let it create them. uint32 nTigs = splitUnitigs(unitigs, tig, BP, newTigs, lowCoord, nRepeat, nUnique, false); // Second call, actually create the tigs, if anything would change. if (nTigs > 1) splitUnitigs(unitigs, tig, BP, newTigs, lowCoord, nRepeat, nUnique, true); // Report the tigs created. for (uint32 ii=0; ii<BP.size(); ii++) { int32 rgnbgn = BP[ii]._bgn; int32 rgnend = BP[ii]._end; bool repeat = BP[ii]._isRepeat; if (nRepeat[ii] + nUnique[ii] == 0) writeLog("For tig %5u %s region %8d %8d - %6u/%6u repeat/unique reads - no new unitig created.\n", ti, (repeat == true) ? 
"repeat" : "unique", rgnbgn, rgnend, nRepeat[ii], nUnique[ii]); else if (nTigs > 1) writeLog("For tig %5u %s region %8d %8d - %6u/%6u reads repeat/unique - unitig %5u created.\n", ti, (repeat == true) ? "repeat" : "unique", rgnbgn, rgnend, nRepeat[ii], nUnique[ii], newTigs[ii]->id()); else writeLog("For tig %5u %s region %8d %8d - %6u/%6u repeat/unique reads - unitig %5u remains unchanged.\n", ti, (repeat == true) ? "repeat" : "unique", rgnbgn, rgnend, nRepeat[ii], nUnique[ii], tig->id()); } // Cleanup. delete [] newTigs; delete [] lowCoord; delete [] nRepeat; delete [] nUnique; // Remove the old unitig....if we made new ones. if (nTigs > 1) { delete tig; unitigs[ti] = NULL; } } }