bool MultiAlignContig(MultiAlignT *ma, gkStore *UNUSED, CNS_Options *opp) { int32 num_bases = 0; int32 num_unitigs = GetNumIntUnitigPoss(ma->u_list); int32 num_frags = GetNumIntMultiPoss(ma->f_list); int32 num_columns = 0; IntMultiPos *flist = GetIntMultiPos(ma->f_list, 0); IntUnitigPos *ulist = GetIntUnitigPos(ma->u_list, 0); IntMultiVar *vlist = GetIntMultiVar(ma->v_list, 0); SeqInterval *offsets = (SeqInterval *) safe_calloc(num_unitigs,sizeof(SeqInterval)); for (int32 i=0;i<num_unitigs;i++) { int32 flen = (ulist[i].position.bgn < ulist[i].position.end) ? (ulist[i].position.end < ulist[i].position.bgn) : (ulist[i].position.bgn - ulist[i].position.end); num_bases += flen + 2 * AS_CNS_ERROR_RATE * flen; num_columns = (ulist[i].position.bgn > num_columns) ? ulist[i].position.bgn : num_columns; num_columns = (ulist[i].position.end > num_columns) ? ulist[i].position.end : num_columns; //fprintf(stderr, "CTG %d UTG %d %d-%d\n", // ma->maID, ulist[i].ident, ulist[i].position.bgn, ulist[i].position.end); } for (int32 i=0;i<num_frags;i++) { int32 flen = (flist[i].position.bgn < flist[i].position.end) ? (flist[i].position.end < flist[i].position.bgn) : (flist[i].position.bgn - flist[i].position.end); num_bases += flen + 2 * AS_CNS_ERROR_RATE * flen; } ResetStores(num_bases, num_unitigs, num_columns); fragmentMap = CreateScalarHashTable_AS(); fragmentToIMP = CreateScalarHashTable_AS(); for (int32 i=0; i<num_frags; i++) { // Add all fragments in the contigs f_list to the fragmentMap. This tells us if a fragment is // not placed in a surrogate (because they aren't in the contigs f_list, but will appear in a // surrogate unitigs f_list). // if (HASH_SUCCESS != InsertInHashTable_AS(fragmentMap, flist[i].ident, 0, 1, 0)) { fprintf(stderr, "MultiAlignContig()-- Contig %d FAILED. Fragment %d is a duplicate.\n", ma->maID, flist[i].ident); return(false); } // SK store IID to IMP message mapping InsertInHashTable_AS(fragmentToIMP, flist[i].ident, 0, (uint64)&flist[i], 0); } for (int32 i=0;i<num_unitigs;i++) { uint32 complement = (ulist[i].position.bgn<ulist[i].position.end)?0:1; uint32 fid = AppendFragToLocalStore(AS_UNITIG, ulist[i].ident, complement, 0, ulist[i].type); offsets[fid].bgn = complement?ulist[i].position.end:ulist[i].position.bgn; offsets[fid].end = complement?ulist[i].position.bgn:ulist[i].position.end; } MANode *manode = CreateMANode(ma->maID); // Seed multiAlignment with 1st fragment of 1st unitig SeedMAWithFragment(manode->lid, GetFragment(fragmentStore,0)->lid, opp); PlaceFragments(GetFragment(fragmentStore,0)->lid, ulist + GetFragment(fragmentStore,0)->lid, opp); // Now, loop on remaining fragments, aligning to: // a) containing frag (if contained) // or b) previously aligned frag VA_TYPE(int32) *trace = CreateVA_int32(AS_READ_MAX_NORMAL_LEN+1); for (int32 i=1;i<num_unitigs;i++) { Fragment *afrag = NULL; Fragment *bfrag = GetFragment(fragmentStore,i); int32 ahang = 0; int32 bhang = 0; int32 ovl = 0; int32 alid = 0; int32 blid = bfrag->lid; OverlapType otype; int32 olap_success = 0; int32 try_contained = 0; int32 align_to = i - 1; Fragment *afrag_first = NULL; int32 ahang_first = 0; int32 bhang_first = 0; while (!olap_success) { nextFrag: if (try_contained == 0) // Skip contained stuff. while ((align_to > 0) && ((GetFragment(fragmentStore, align_to)->is_contained) || (GetFragment(fragmentStore, align_to)->container_iid > 0))) align_to--; if (align_to < 0) { if (VERBOSE_MULTIALIGN_OUTPUT) fprintf(stderr, "MultiAlignContig: hit the beginning of unitig list: no unitig upstream overlaps with current unitig %d\n", bfrag->iid); if (try_contained == 0) { if (VERBOSE_MULTIALIGN_OUTPUT) fprintf(stderr, "MultiAlignContig: trying contained afrags for bfrag %d\n", bfrag->iid); try_contained = 1; align_to = i-1; goto nextFrag; } break; } afrag = GetFragment(fragmentStore, align_to); alid = afrag->lid; ahang = offsets[blid].bgn - offsets[alid].bgn; bhang = offsets[blid].end - offsets[alid].end; if (afrag_first == NULL) { afrag_first = afrag; ahang_first = ahang; bhang_first = bhang; } // This code copied from MultiAlignUnitig. if (offsets[afrag->lid].bgn < offsets[bfrag->lid].bgn) if (offsets[afrag->lid].end < offsets[bfrag->lid].end) ovl = offsets[afrag->lid].end - offsets[bfrag->lid].bgn; else //ovl = offsets[bfrag->lid].end - offsets[bfrag->lid].bgn; ovl = bfrag->length; else if (offsets[afrag->lid].end < offsets[bfrag->lid].end) //ovl = offsets[afrag->lid].end - offsets[afrag->lid].bgn; ovl = afrag->length; else ovl = offsets[bfrag->lid].end - offsets[afrag->lid].bgn; // End of copy if (ovl <= 0) { if (VERBOSE_MULTIALIGN_OUTPUT) fprintf(stderr, "MultiAlignContig: positions of afrag %d and bfrag %d do not overlap. Proceed to the next upstream afrag\n", afrag->iid, bfrag->iid); align_to--; goto nextFrag; } olap_success = GetAlignmentTraceDriver(afrag, NULL, bfrag, &ahang, &bhang, ovl, trace, &otype, GETALIGNTRACE_CONTIGU, (blid + 1 < num_unitigs) ? (offsets[blid + 1].bgn - offsets[blid].bgn) : 800); // Nope, fail. if (!olap_success) { if (VERBOSE_MULTIALIGN_OUTPUT) fprintf(stderr, "MultiAlignContig: Positions of afrag %d (%c) and bfrag %d (%c) overlap, but GetAlignmentTrace returns no overlap success.\n", afrag->iid, afrag->type, bfrag->iid, bfrag->type); align_to--; if ((align_to < 0) && (!try_contained)) { if (VERBOSE_MULTIALIGN_OUTPUT) fprintf(stderr, "MultiAlignContig: Try contained afrags for bfrag %d\n", bfrag->iid); try_contained = 1; align_to = i-1; } } } // while !olap_success if ((!olap_success) && (FORCE_UNITIG_ABUT == 0)) { fprintf(stderr,"MultiAlignContig: Could (really) not find overlap between %d (%c) and %d (%c), estimated ahang %d\n", afrag->iid,afrag->type,bfrag->iid,bfrag->type, ahang); fprintf(stderr,"MultiAlignContig: You can (possibly) force these to abut with '-D forceunitigabut', but that code is buggy at best.\n"); goto returnFailure; } #if 1 if ((!olap_success) && (FORCE_UNITIG_ABUT == 1)) { if (afrag_first) { afrag = afrag_first; ahang = ahang_first; bhang = bhang_first; } else { // Dang, we're really screwed. Nobody overlapped with us. // Cross our fingers and find the closest end point. // int32 maxOvl = -offsets[blid].bgn; //if (VERBOSE_MULTIALIGN_OUTPUT) // fprintf(stderr, "MultiAlignContig: YIKES! Your unitig doesn't overlap with anything! Picking the closest thing!\n"); align_to = i-1; while (align_to >= 0) { if ((try_contained == 0) && ((GetFragment(fragmentStore, align_to)->is_contained) || (GetFragment(fragmentStore, align_to)->container_iid > 0))) { // NOP! Found a contained frag, and we want to skip it. } else if (maxOvl < offsets[alid].end - offsets[blid].bgn) { afrag = GetFragment(fragmentStore, align_to); alid = afrag->lid; ahang = offsets[blid].bgn - offsets[alid].bgn; maxOvl = offsets[alid].end - offsets[blid].bgn; //fprintf(stderr, "MultiAlignContig: RESET align_to=%d alid=%d maxOvl=%d ahang=%d\n", align_to, alid, maxOvl, ahang); } align_to--; } // while align_to >= 0 } fprintf(stderr, "MultiAlignContig: Forcing abut between afrag %d (%c) and bfrag %d (%c) in contig %d.\n", afrag->iid, afrag->type, bfrag->iid, bfrag->type, ma->maID); // Force a 1bp overlap. We'd like to strictly abut, but ApplyAlignment() requires that there // be an overlap, and removing checks for that seem like a bad idea. // ahang = afrag->length - 1; otype = AS_DOVETAIL; int32 zero = 0; ResetVA_int32(trace); AppendVA_int32(trace, &zero); assert(*Getint32(trace,0) == 0); assert(GetNumint32s(trace) == 1); } #endif // Unitig is placed, or we just forced it to be placed. if (otype == AS_CONTAINMENT) { bfrag->is_contained = 1; if (bfrag->container_iid == 0) bfrag->container_iid = 1; // Not sure why 1 and not afrag->iid } ApplyAlignment(afrag->lid, 0, NULL, bfrag->lid, ahang, Getint32(trace,0)); PlaceFragments(bfrag->lid, ulist + bfrag->lid, opp); } // over all unitigs // Now, must find fragments in regions of overlapping unitigs, and adjust // their alignments as needed RefreshMANode(manode->lid, 0, opp, NULL, NULL, 0, 0); //fprintf(stderr,"MultiAlignContig: Initial pairwise induced alignment\n"); //PrintAlignment(stderr,manode->lid,0,-1); AbacusRefine(manode,0,-1,CNS_SMOOTH, opp); MergeRefine(manode->lid, NULL, 0, opp, 1); AbacusRefine(manode,0,-1,CNS_POLYX, opp); //fprintf(stderr,"MultiAlignContig: POLYX refined alignment\n"); //PrintAlignment(stderr,manode->lid,0,-1); { IntMultiVar *vl = NULL; int32 nv = 0; RefreshMANode(manode->lid, 0, opp, &nv, &vl, 0, 0); AbacusRefine(manode,0,-1,CNS_INDEL, opp); MergeRefine(manode->lid, ma->v_list, 0, opp, 2); } //fprintf(stderr,"MultiAlignContig: Final refined alignment\n"); //PrintAlignment(stderr,manode->lid,0,-1); //if (num_frags == 0) // PrintAlignment(stderr,manode->lid,0,-1); GetMANodeConsensus(manode->lid, ma->consensus, ma->quality); GetMANodePositions(manode->lid, ma); DeleteMANode(manode->lid); safe_free(offsets); Delete_VA(trace); DeleteHashTable_AS(fragmentMap); fragmentMap = NULL; DeleteHashTable_AS(fragmentToIMP); fragmentToIMP = NULL; return(true); returnFailure: safe_free(offsets); Delete_VA(trace); DeleteHashTable_AS(fragmentMap); fragmentMap = NULL; DeleteHashTable_AS(fragmentToIMP); fragmentToIMP = NULL; return(false); }
int updateFragmentWithParent(IntUnitigMesg *iunitig, int thisFrag, OverlapStore *ovs) { uint32 ovlMax = 0; uint32 ovlLen = 0; OVSoverlap *ovl = NULL; int testFrag = thisFrag - 1; int testOvl = 0; int oldParent = iunitig->f_list[thisFrag].parent; int oldAHang = iunitig->f_list[thisFrag].ahang; int oldBHang = iunitig->f_list[thisFrag].bhang; uint32 consensusCutoff = AS_OVS_encodeQuality(AS_CNS_ERROR_RATE); int contained = 0; int fragment = -1; int overlap = -1; int overlapIdentity = consensusCutoff; int overlapBHang = AS_READ_MAX_NORMAL_LEN; HashTable_AS *ovlBefore = CreateScalarHashTable_AS(); HashTable_AS *ovlAfter = CreateScalarHashTable_AS(); HashTable_AS *iidIndex = CreateScalarHashTable_AS(); int hangSlop = 0; int failed = -1; fprintf(stderr, "\n"); fprintf(stderr, "WORKING on fragment %d == %d\n", thisFrag, iunitig->f_list[thisFrag].ident); // Save in the hash table the fragments before/after this one. // for (testFrag=0; testFrag<iunitig->num_frags; testFrag++) { InsertInHashTable_AS(iidIndex, (uint64)iunitig->f_list[testFrag].ident, sizeof(uint64), (uint64)testFrag, 0); if (testFrag < thisFrag) InsertInHashTable_AS(ovlBefore, (uint64)iunitig->f_list[testFrag].ident, sizeof(uint64), ~(uint64)0, 0); if (testFrag > thisFrag) InsertInHashTable_AS(ovlAfter, (uint64)iunitig->f_list[testFrag].ident, sizeof(uint64), ~(uint64)0, 0); } // Get the overlaps for this fragment. // AS_OVS_setRangeOverlapStore(ovs, iunitig->f_list[thisFrag].ident, iunitig->f_list[thisFrag].ident); if (ovlMax < AS_OVS_numOverlapsInRange(ovs)) { ovlMax = AS_OVS_numOverlapsInRange(ovs) * 2; ovl = (OVSoverlap *)safe_realloc(ovl, sizeof(OVSoverlap) * ovlMax); } ovlLen = 0; while (AS_OVS_readOverlapFromStore(ovs, ovl+ovlLen, AS_OVS_TYPE_OVL)) { int aid=0, bid=0; int afwd=0, bfwd=0; int correct=0; // Reorient the overlap so the b_iid is thisFrag. // { AS_IID x = ovl[ovlLen].a_iid; ovl[ovlLen].a_iid = ovl[ovlLen].b_iid; ovl[ovlLen].b_iid = x; if (ovl[ovlLen].dat.ovl.flipped) { int x = ovl[ovlLen].dat.ovl.a_hang; ovl[ovlLen].dat.ovl.a_hang = ovl[ovlLen].dat.ovl.b_hang; ovl[ovlLen].dat.ovl.b_hang = x; } else { ovl[ovlLen].dat.ovl.a_hang = -ovl[ovlLen].dat.ovl.a_hang; ovl[ovlLen].dat.ovl.b_hang = -ovl[ovlLen].dat.ovl.b_hang; } } // Make sure we get the correct overlap. We seem to be allowed // to have both an I and an N overlap for a given pair of // fragments. At least, I hope that's all we're allowed. // aid = LookupValueInHashTable_AS(iidIndex, (uint64)ovl[ovlLen].a_iid, sizeof(uint64)); bid = LookupValueInHashTable_AS(iidIndex, (uint64)ovl[ovlLen].b_iid, sizeof(uint64)); afwd = (iunitig->f_list[aid].position.bgn < iunitig->f_list[aid].position.end); bfwd = (iunitig->f_list[bid].position.bgn < iunitig->f_list[bid].position.end); if ((afwd == bfwd) && (ovl[ovlLen].dat.ovl.flipped == 0)) correct = 1; if ((afwd != bfwd) && (ovl[ovlLen].dat.ovl.flipped == 1)) correct = 1; if (ExistsInHashTable_AS(ovlBefore, (uint64)ovl[ovlLen].a_iid, sizeof(uint64))) { if (correct) ReplaceInHashTable_AS(ovlBefore, (uint64)ovl[ovlLen].a_iid, sizeof(uint64), (uint64)ovlLen, 0); fprintf(stderr, "%s before overlap for %d (%c) to %d (%c) ("F_S64","F_S64",%c) at ovl position %d\n", correct ? "save" : "skip", ovl[ovlLen].a_iid, afwd ? 'F' : 'R', ovl[ovlLen].b_iid, bfwd ? 'F' : 'R', ovl[ovlLen].dat.ovl.a_hang, ovl[ovlLen].dat.ovl.b_hang, ovl[ovlLen].dat.ovl.flipped ? 'I' : 'N', ovlLen); } if (ExistsInHashTable_AS(ovlAfter, (uint64)ovl[ovlLen].a_iid, sizeof(uint64))) { if (correct) ReplaceInHashTable_AS(ovlAfter, (uint64)ovl[ovlLen].a_iid, sizeof(uint64), (uint64)ovlLen, 0); fprintf(stderr, "%s after overlap for %d (%c) to %d (%c) ("F_S64","F_S64",%c) at ovl position %d\n", correct ? "save" : "skip", ovl[ovlLen].a_iid, afwd ? 'F' : 'R', ovl[ovlLen].b_iid, bfwd ? 'F' : 'R', ovl[ovlLen].dat.ovl.a_hang, ovl[ovlLen].dat.ovl.b_hang, ovl[ovlLen].dat.ovl.flipped ? 'I' : 'N', ovlLen); } ovlLen++; } tryAgain: // See if we're contained in any of these overlaps. if (overlap == -1) { for (testFrag=thisFrag-1; testFrag>=0; testFrag--) { if (ExistsInHashTable_AS(ovlBefore, (uint64)iunitig->f_list[testFrag].ident, sizeof(uint64))) { testOvl = LookupValueInHashTable_AS(ovlBefore, (uint64)iunitig->f_list[testFrag].ident, sizeof(uint64)); // Fragment has no overlap if (testOvl == -1) continue; fprintf(stderr, "found testFrag = %d testOvl = %d erates "F_U64" %u hang "F_S64" "F_S64" (CONTAIN) slop=%d\n", testFrag, testOvl, ovl[testOvl].dat.ovl.orig_erate, consensusCutoff, ovl[testOvl].dat.ovl.a_hang, ovl[testOvl].dat.ovl.b_hang, hangSlop); // Three if's for documentation: // 1) If we're an overlap we care about // 2) If we're a contained overlap // 3) If we're better than what we've seen so far // Then save the overlap // if (ovl[testOvl].dat.ovl.orig_erate < consensusCutoff) { if ((ovl[testOvl].dat.ovl.a_hang >= -hangSlop) && (ovl[testOvl].dat.ovl.b_hang <= hangSlop)) { if (ovl[testOvl].dat.ovl.orig_erate < overlapIdentity) { contained = 1; fragment = testFrag; overlap = testOvl; overlapBHang = 0; overlapIdentity = ovl[testOvl].dat.ovl.orig_erate; } } } } } } // If not contained, scan the overlaps again, looking for the // thickest/bestest. This will be the overlap with the smallest a // or b hang -- depending on the orientation of the parent // fragment. // // Instead of working through overlaps, we work through fragments. // if (overlap == -1) { for (testFrag=thisFrag-1; testFrag>=0; testFrag--) { if (ExistsInHashTable_AS(ovlBefore, (uint64)iunitig->f_list[testFrag].ident, sizeof(uint64))) { int ahang = 0; int bhang = 0; testOvl = LookupValueInHashTable_AS(ovlBefore, (uint64)iunitig->f_list[testFrag].ident, sizeof(uint64)); // Fragment has no overlap if (testOvl == -1) continue; // Overlap is too noisy if (ovl[testOvl].dat.ovl.orig_erate >= consensusCutoff) continue; if (iunitig->f_list[testFrag].position.bgn < iunitig->f_list[testFrag].position.end) { ahang = ovl[testOvl].dat.ovl.a_hang; bhang = ovl[testOvl].dat.ovl.b_hang; } else { ahang = -ovl[testOvl].dat.ovl.b_hang; bhang = -ovl[testOvl].dat.ovl.a_hang; } // Overlap isn't dovetail -- negative ahang if (ahang < 0) continue; // Overlap isn't dovetail -- containment if (bhang < 0) continue; fprintf(stderr, "found testFrag = %d testOvl = %d erates "F_U64" %u hang "F_S64" "F_S64" (DOVETAIL) slop=%d\n", testFrag, testOvl, ovl[testOvl].dat.ovl.orig_erate, consensusCutoff, ovl[testOvl].dat.ovl.a_hang, ovl[testOvl].dat.ovl.b_hang, hangSlop); if (bhang < overlapBHang) { contained = 0; fragment = testFrag; overlap = testOvl; overlapIdentity = ovl[testOvl].dat.ovl.orig_erate; overlapBHang = bhang; } } } } // Now, if we have found the parent fragment, update. // if (overlap >= 0) { testOvl = overlap; testFrag = fragment; iunitig->f_list[thisFrag].parent = ovl[testOvl].a_iid; if (contained) iunitig->f_list[thisFrag].contained = iunitig->f_list[thisFrag].parent; else iunitig->f_list[thisFrag].contained = 0; // Reorient again based on the orientation of the testFrag. // if (iunitig->f_list[testFrag].position.bgn < iunitig->f_list[testFrag].position.end) { // testFrag is forward iunitig->f_list[thisFrag].ahang = ovl[testOvl].dat.ovl.a_hang; iunitig->f_list[thisFrag].bhang = ovl[testOvl].dat.ovl.b_hang; } else { // testFrag is reverse iunitig->f_list[thisFrag].ahang = -ovl[testOvl].dat.ovl.b_hang; iunitig->f_list[thisFrag].bhang = -ovl[testOvl].dat.ovl.a_hang; } // Report we did something. // fprintf(stderr, "Updated fragment "F_IID" from "F_IID",%d,%d to "F_IID",%d,%d\n", iunitig->f_list[thisFrag].ident, oldParent, oldAHang, oldBHang, iunitig->f_list[thisFrag].parent, iunitig->f_list[thisFrag].ahang, iunitig->f_list[thisFrag].bhang); goto successfullyUpdated; } // Otherwise, try to find an overlap again, this time allowing a // bit of slop in the hangs. // if (hangSlop == 0) { hangSlop = 10; goto tryAgain; } // Now, we're convinced there is no decent overlap between this // fragment and any fragment before it. // // Scan forward for the first thing we overlap. for (testFrag=thisFrag+1; testFrag < iunitig->num_frags; testFrag++) { int ahang = 0; int bhang = 0; testOvl = LookupValueInHashTable_AS(ovlAfter, (uint64)iunitig->f_list[testFrag].ident, sizeof(uint64)); // Fragment has no overlap if (testOvl == -1) continue; // Overlap is too noisy if (ovl[testOvl].dat.ovl.orig_erate >= consensusCutoff) continue; if (iunitig->f_list[testFrag].position.bgn < iunitig->f_list[testFrag].position.end) { ahang = ovl[testOvl].dat.ovl.a_hang; bhang = ovl[testOvl].dat.ovl.b_hang; } else { ahang = -ovl[testOvl].dat.ovl.b_hang; bhang = -ovl[testOvl].dat.ovl.a_hang; } // Don't allow negative ahangs. At all. This catches the case // where the parent might be contained in us, and generally makes // consensus happier. // // Don't allow empty hangs - this can lead to infinite loops // where we keep swapping the same two fragments. OK, not // infinite, since we eventually run out of stack space and // crash. // if (ahang <= 0) continue; fprintf(stderr, "shifttest ovl=%d testFrag="F_IID" pos %d-%d thisFrag="F_IID" pos %d-%d hangs %d,%d\n", testOvl, iunitig->f_list[testFrag].ident, iunitig->f_list[testFrag].position.bgn, iunitig->f_list[testFrag].position.end, iunitig->f_list[thisFrag].ident, iunitig->f_list[thisFrag].position.bgn, iunitig->f_list[thisFrag].position.end, ahang, bhang); IntMultiPos fragCopy = iunitig->f_list[thisFrag]; memmove(iunitig->f_list + thisFrag, iunitig->f_list + thisFrag + 1, sizeof(IntMultiPos) * (testFrag - thisFrag)); iunitig->f_list[testFrag] = fragCopy; fprintf(stderr, "Shifted fragment "F_IID" from position %d to position %d\n", iunitig->f_list[testFrag].ident, thisFrag, testFrag); // Since we moved things around, we must process the new fragment // at 'thisFrag's location. // failed = updateFragmentWithParent(iunitig, thisFrag, ovs); if (failed == -1) goto successfullyUpdated; break; } // And we failed. Good luck with this one. // fprintf(stderr, "Failed to update fragment "F_IID" from "F_IID",%d,%d.\n", iunitig->f_list[thisFrag].ident, oldParent, oldAHang, oldBHang); failed = thisFrag; successfullyUpdated: DeleteHashTable_AS(ovlBefore); DeleteHashTable_AS(ovlAfter); safe_free(ovl); return(failed); }
static int abAbacus::SetUngappedFragmentPositions(FragType type,int32 n_frags, MultiAlignT *uma) { int32 num_frags = GetNumIntMultiPoss(uma->f_list); int32 num_unitigs = GetNumIntUnitigPoss(uma->u_list); HashTable_AS *unitigFrags = CreateScalarHashTable_AS(); int32 num_columns = GetMultiAlignLength(uma); int32 ungapped_pos = 0; int32 *gapped_positions = new int32 [num_columns + 1]; char *consensus = Getchar(uma->consensus,0); for (int32 i=0; i<num_columns+1; i++) { gapped_positions[i] = ungapped_pos; if (consensus[i] != '-') ungapped_pos++; } // Remember the first fragment we add. int32 first_frag = GetNumCNS_AlignedContigElements(fragment_positions); for (int32 ifrag=0; ifrag<num_frags; ifrag++) { CNS_AlignedContigElement epos; IntMultiPos *frag = GetIntMultiPos(uma->f_list, ifrag); if (ExistsInHashTable_AS(unitigFrags, frag->ident, 0)) { fprintf(stderr,"SetUngappedFragmentPositions()-- ident %d already in hashtable\n", frag->ident); assert(0); } if (HASH_SUCCESS != InsertInHashTable_AS(unitigFrags, frag->ident, 0, 1, 0)) { fprintf(stderr,"SetUngappedFragmentPositions()-- Failure to insert ident %d in hashtable\n", frag->ident); assert(0); } assert(frag->position.bgn >= 0); assert(frag->position.bgn < num_columns + 1); assert(frag->position.end >= 0); assert(frag->position.end < num_columns + 1); epos.frg_or_utg = CNS_ELEMENT_IS_FRAGMENT; epos.idx.fragment.frgIdent = frag->ident; epos.idx.fragment.frgType = frag->type; epos.idx.fragment.frgContained = frag->contained; epos.idx.fragment.frgInUnitig = (type == AS_CONTIG) ? -1 : uma->maID; epos.position.bgn = gapped_positions[frag->position.bgn]; epos.position.end = gapped_positions[frag->position.end]; //fprintf(stderr, "SetUngappedFragmentPositions()-- FRG id=%d type=%c pos=%d,%d (orig pos=%d,%d)\n", // frag->ident, frag->type, epos.position.bgn, epos.position.end, frag->position.bgn, frag->position.end); // Adjust the ungapped position if we fall within a gap // if (epos.position.bgn == epos.position.end) { fprintf(stderr,"SetUngappedFragmentPositions()-- Encountered bgn==end=="F_S32" in ungapped coords within SetUngappedFragmentPositions for "F_CID "(gapped coords "F_S32","F_S32")\n", epos.position.bgn,frag->ident,frag->position.bgn,frag->position.end); assert(frag->position.bgn != frag->position.end); if (frag->position.bgn < frag->position.end) { if (epos.position.bgn > 0) epos.position.bgn--; else epos.position.end++; } else { if (epos.position.end > 0) epos.position.end--; else epos.position.bgn++; } fprintf(stderr,"SetUngappedFragmentPositions()-- Reset to "F_S32","F_S32"\n", epos.position.bgn, epos.position.end); } AppendVA_CNS_AlignedContigElement(fragment_positions, &epos); } for (int32 ifrag=0; ifrag < num_unitigs; ifrag++){ CNS_AlignedContigElement epos; IntUnitigPos *unitig = GetIntUnitigPos(uma->u_list, ifrag); epos.frg_or_utg = CNS_ELEMENT_IS_UNITIG; epos.idx.unitig.utgIdent = unitig->ident; epos.idx.unitig.utgType = unitig->type; epos.position.bgn = gapped_positions[unitig->position.bgn]; epos.position.end = gapped_positions[unitig->position.end]; //fprintf(stderr, "SetUngappedFragmentPositions()-- UTG id=%d type=%c pos=%d,%d (orig pos=%d,%d)\n", // unitig->ident, unitig->type, epos.position.bgn, epos.position.end, unitig->position.bgn, unitig->position.end); AppendVA_CNS_AlignedContigElement(fragment_positions,&epos); } // This is used only by ReplaceEndUnitigInContig(). Mark fragments in the "anchoring" contig // that belong to this unitig. // if (type != AS_CONTIG) { Fragment *anchor = GetFragment(fragmentStore,0); if ((anchor != NULL) && (anchor->type == AS_CONTIG)) { CNS_AlignedContigElement *af = GetCNS_AlignedContigElement(fragment_positions, anchor->components); for (int32 ifrag=0; ifrag < anchor->n_components; ifrag++, af++) { if ((af->frg_or_utg == CNS_ELEMENT_IS_FRAGMENT) && (ExistsInHashTable_AS(unitigFrags, af->idx.fragment.frgIdent, 0))) af->idx.fragment.frgInUnitig = uma->maID; } } } DeleteHashTable_AS(unitigFrags); delete [] gapped_positions; return first_frag; }
int main (int argc, char *argv[]) { char *asmFileName = NULL; char *tigStoreName = NULL; uint32 tigStoreVers = 2; int minLength = DEFAULT_UNITIG_LENGTH; int numInstances = DEFAULT_NUM_INSTANCES; int distanceToEnds = DEFAULT_DISTANCE_TO_ENDS; uint32 numToggled = 0; argc = AS_configure(argc, argv); int arg=1; int err=0; while (arg < argc) { if (strcmp(argv[arg], "-a") == 0) { asmFileName = argv[++arg]; } else if (strcmp(argv[arg], "-t") == 0) { tigStoreName = argv[++arg]; tigStoreVers = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-l") == 0) { minLength = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-n") == 0) { numInstances = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-d") == 0) { distanceToEnds = atoi(argv[++arg]); } else { fprintf(stderr, "%s: unknown option '%s'\n", argv[0], argv[arg]); err++; } arg++; } if (minLength <= 0) err++; if (numInstances < 0) err++; if (distanceToEnds <= 0) err++; if ((asmFileName == NULL) || (tigStoreName == NULL) || (err > 0)) { fprintf(stderr, "usage: %s -a asmFile -t tigStore version [-l minLength] [-n numInstances] [-d distanceToEnd]\n", argv[0]); fprintf(stderr, "\n"); fprintf(stderr, " -a asmFile path to the assembly .asm file\n"); fprintf(stderr, " -t tigStore version path to the tigStore and version to modify\n"); fprintf(stderr, " -l minLength minimum size of a unitig to be toggled, default=%d)\n", DEFAULT_UNITIG_LENGTH); fprintf(stderr, " -n numInstances number of instances of a surrogate that is toggled, default = %d\n", DEFAULT_NUM_INSTANCES); fprintf(stderr, " -d distanceToEnd max number of bases the surrogate can be from the end of a scaffold for toggling, default = %d\n", DEFAULT_DISTANCE_TO_ENDS); fprintf(stderr, "\n"); fprintf(stderr, " Labels surrogate unitigs as non-repeat if they match any of the following conditions:\n"); fprintf(stderr, " 1. the unitig meets all the -l, -n and -d conditions\n"); fprintf(stderr, " 2. When -n = 0, all surrogate unitigs with more than one read\n"); fprintf(stderr, " 3. the unitig appears exactly twice, within '-d' bases from the end of a scaffold\n"); exit(1); } HashTable_AS *UIDtoIID = CreateScalarHashTable_AS(); HashTable_AS *CTGtoFirstUTG = CreateScalarHashTable_AS(); HashTable_AS *CTGtoLastUTG = CreateScalarHashTable_AS(); VA_TYPE(int32) *unitigLength = CreateVA_int32(8192); VA_TYPE(uint32) *surrogateCount = CreateVA_uint32(8192); VA_TYPE(uint32) *surrogateAtScaffoldEnds = CreateVA_uint32(8192); GenericMesg *pmesg; FILE *infp = fopen(asmFileName, "r"); while ((EOF != ReadProtoMesg_AS(infp, &pmesg))) { SnapUnitigMesg *utg = NULL; SnapConConMesg *ctg = NULL; SnapScaffoldMesg *scf = NULL; uint32 count = 0; uint32 forward = TRUE; uint32 lastCtg = 0; switch(pmesg->t) { case MESG_UTG: utg = (SnapUnitigMesg*)(pmesg->m); Setint32(unitigLength, utg->iaccession, &utg->length); if (utg->length >= minLength && (utg->status == AS_NOTREZ || utg->status == AS_SEP)) { // store the mapping for this unitig's UID to IID and initialize it's instance counter at 0 count = 0; InsertInHashTable_AS(UIDtoIID, AS_UID_toInteger(utg->eaccession), 0, (uint64)utg->iaccession, 0); Setuint32(surrogateCount, utg->iaccession, &count); } break; case MESG_CCO: ctg = (SnapConConMesg *)(pmesg->m); for (int32 i = 0; i < ctg->num_unitigs; i++) { // increment the surrogate unitigs instance counter if (ExistsInHashTable_AS(UIDtoIID, AS_UID_toInteger(ctg->unitigs[i].eident), 0)) { uint32 *ret = Getuint32(surrogateCount, (uint32) LookupValueInHashTable_AS(UIDtoIID, AS_UID_toInteger(ctg->unitigs[i].eident), 0)); assert(ret != NULL); (*ret)++; // store first surrogate in a contig if (!ExistsInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(ctg->eaccession), 0) && MIN(ctg->unitigs[i].position.bgn, ctg->unitigs[i].position.end) < distanceToEnds) { InsertInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(ctg->eaccession), 0, LookupValueInHashTable_AS(UIDtoIID, AS_UID_toInteger(ctg->unitigs[i].eident), 0), 0); } // also store the last if ((ctg->length - MAX(ctg->unitigs[i].position.bgn, ctg->unitigs[i].position.end)) < distanceToEnds) { ReplaceInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(ctg->eaccession), 0, LookupValueInHashTable_AS(UIDtoIID, AS_UID_toInteger(ctg->unitigs[i].eident), 0), 0); } } } break; case MESG_SCF: scf = (SnapScaffoldMesg *)(pmesg->m); count = scf->iaccession; if (scf->contig_pairs[0].orient.isAnti() || scf->contig_pairs[0].orient.isOuttie()) { forward = FALSE; } lastCtg = MAX(scf->num_contig_pairs - 1, 0); // All four cases below follow the same pattern // The first time a surrogate is found at the end of a scaffold, we record the scaffold ID // When the surrogate is seen at the end of a second scaffold, we record that it has been found at the ends of two scaffolds (UINT32_MAX) // If the surrogate is seen more than once in a single scaffold, it is eliminated (it can't connect two scaffolds) // If the surrogate is only seen once at the end of a scaffold (and again in the middle), it is eliminated // 1. Contig is first in scaffold and is forward, take the surrogate from the beginning of contig, if it exists if (ExistsInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(scf->contig_pairs[0].econtig1), 0) && forward) { uint32 *myval = Getuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(scf->contig_pairs[0].econtig1), 0)); if (myval != NULL && (*myval) == scf->iaccession) { count = 0; } else if (myval != NULL && (*myval) != 0 && (*myval) != scf->iaccession) { count = UINT32_MAX; } Setuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(scf->contig_pairs[0].econtig1), 0), &count); count = scf->iaccession; } // 2. Contig is last in scaffold and is reversed, take the surrogate from the beginning of the contig, if it exists if (ExistsInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(scf->contig_pairs[lastCtg].econtig2), 0) && !forward) { uint32 *myval = Getuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(scf->contig_pairs[lastCtg].econtig2), 0)); if (myval != NULL && (*myval) == scf->iaccession) { count = 0; } else if (myval != NULL && (*myval) != 0 && (*myval) != scf->iaccession) { count = UINT32_MAX; } Setuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(scf->contig_pairs[lastCtg].econtig2), 0), &count); count = scf->iaccession; } // 3. Contig is first in scaffold and is reversed, take the surrogate from the end of the contig, if it exists if (ExistsInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(scf->contig_pairs[0].econtig1), 0) && !forward) { uint32 *myval = Getuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(scf->contig_pairs[0].econtig1), 0)); if (myval != NULL && (*myval) == scf->iaccession) { count = 0; } else if (myval != NULL && (*myval) != 0 && (*myval) != scf->iaccession) { count = UINT32_MAX; } Setuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(scf->contig_pairs[0].econtig1), 0), &count); count = scf->iaccession; } // 4. Contig is last in scaffold and is forward, take the surrogate from the end of the contig, if it exists if (ExistsInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(scf->contig_pairs[lastCtg].econtig2), 0) && forward) { uint32 *myval = Getuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(scf->contig_pairs[lastCtg].econtig2), 0)); if (myval != NULL && (*myval) == scf->iaccession) { count = 0; } else if (myval != NULL && (*myval) != 0 && (*myval) != scf->iaccession) { count = UINT32_MAX; } Setuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(scf->contig_pairs[lastCtg].econtig2), 0), &count); count = scf->iaccession; } break; default: break; } } fclose(infp); uint32 *ret = NULL; uint32 *atScfEnd = NULL; // open the tig store for in-place writing (we don't increment the version since CGW always reads a fixed version initially) // this also removes any partitioning MultiAlignStore *tigStore = new MultiAlignStore(tigStoreName, tigStoreVers, 0, 0, TRUE, TRUE); for (uint32 i = 0; i < tigStore->numUnitigs(); i++) { uint32 *ret = Getuint32(surrogateCount, i); uint32 *atScfEnd = Getuint32(surrogateAtScaffoldEnds, i); uint32 *length = Getuint32(unitigLength, i); bool toggled = false; if (ret != NULL && (*ret) == (uint32)numInstances && numInstances != 0) { toggled = TRUE; } // if we find a surrogate that has two instances and it is at scaffold ends mark toggle it as well else if (ret != NULL && (*ret) == NUM_INSTANCES_AT_SCAFFOLD_ENDS && atScfEnd != NULL && (*atScfEnd) == UINT32_MAX) { toggled = TRUE; } // special case, mark non-singleton unitigs as unique if we are given no instances else if (numInstances == 0 && (length != NULL && (*length) >= minLength) && tigStore->getNumFrags(i, TRUE) > 1) { toggled = TRUE; } if (toggled) { tigStore->setUnitigFUR(i, AS_FORCED_UNIQUE); numToggled++; } } DeleteHashTable_AS(UIDtoIID); DeleteHashTable_AS(CTGtoFirstUTG); DeleteHashTable_AS(CTGtoLastUTG); delete tigStore; fprintf(stderr, "Toggled %d\n", numToggled); return 0; }