/*
 * Build the list of bubbles among the first num_valid entries of top[].
 *
 * Pass 1 walks top[] and, for every vertex accepted by _is_initiation_node()
 * whose forward and reverse vertex sets are both non-empty, stores a pointer
 * to its top[] slot in a hash table keyed on the (fwd, rvs) set pair.
 *
 * Pass 2 walks top[] again and, for every vertex accepted by
 * _is_termination_node() (same non-empty requirement), probes the table with
 * that vertex's own set pair.  Each hit produces one bubble whose start is
 * the stored initiation vertex and whose end is the termination vertex.
 *
 * Returns the head of a singly linked list of safe_malloc'd bubbles, in the
 * order the terminations were met, or NULL when none was found.  The nodes
 * are not freed here.
 */
AS_CGB_Bubble_List_t
_collect_bubbles(BubGraph_t bg, BubVertexSet *fwd, BubVertexSet *rvs,
                 IntFragment_ID *top, int num_valid)
{
  AS_CGB_Bubble_List_t   head       = NULL;
  AS_CGB_Bubble_List_t  *link       = &head;   /* slot that receives the next bubble */
  AS_CGB_Bubble_List_t   bubble     = NULL;
  HashTable_AS          *starts     = NULL;
  BVSPair               *start_keys = NULL;    /* one key slot per top[] entry */
  BVSPair                probe;
  IntFragment_ID        *start      = NULL;
  IntFragment_ID         i;

  starts     = CreateGenericHashTable_AS(_hash_vset_hash, _hash_vset_cmp);
  start_keys = (BVSPair *)safe_malloc(sizeof(BVSPair) * num_valid);

  /* Pass 1: register the initiation vertices. */
  for (i = 0; i < num_valid; ++i) {
    if (!_is_initiation_node(BG_inDegree(bg, top[i], AS_CGB_BUBBLE_E_VALID),
                             BG_outDegree(bg, top[i], AS_CGB_BUBBLE_E_VALID)))
      continue;

    if (BVS_empty(&(fwd[top[i]])) || BVS_empty(&(rvs[top[i]])))
      continue;

#if AS_CGB_BUBBLE_VERY_VERBOSE
    fprintf(stderr, "Inserting " F_IID " (" F_IID ") into the table.\n",
            top[i],
            get_iid_fragment(BG_vertices(bg), top[i]));
#endif

    start_keys[i].f = &(fwd[top[i]]);
    start_keys[i].r = &(rvs[top[i]]);

    /* The stored value is the address of the top[] slot, not the vertex id. */
    InsertInHashTable_AS(starts,
                         (uint64)(INTPTR)&start_keys[i], sizeof(BVSPair),
                         (uint64)(INTPTR)&top[i], 0);
  }

  /* Pass 2: match each termination vertex against the registered starts. */
  for (i = 0; i < num_valid; ++i) {
    if (!_is_termination_node(BG_inDegree(bg, top[i], AS_CGB_BUBBLE_E_VALID),
                              BG_outDegree(bg, top[i], AS_CGB_BUBBLE_E_VALID)))
      continue;

    if (BVS_empty(&(fwd[top[i]])) || BVS_empty(&(rvs[top[i]])))
      continue;

#if AS_CGB_BUBBLE_VERY_VERBOSE
    fprintf(stderr, "Looking for matches for " F_IID " (" F_IID ") in the table. ",
            top[i],
            get_iid_fragment(BG_vertices(bg), top[i]));
#endif

    probe.f = &(fwd[top[i]]);
    probe.r = &(rvs[top[i]]);

    start = (IntFragment_ID *)(INTPTR)LookupValueInHashTable_AS(starts,
                                                                (uint64)(INTPTR)&probe,
                                                                sizeof(BVSPair));

#if AS_CGB_BUBBLE_VERY_VERBOSE
    if (!start)
      fprintf(stderr, "None found.\n");
    else
      fprintf(stderr, "Found init node = " F_IID " (" F_IID ").\n",
              *start,
              get_iid_fragment(BG_vertices(bg), *start));
#endif

    if (start == NULL)
      continue;

    /* Append a new bubble at the tail of the result list. */
    bubble        = (AS_CGB_Bubble_List *)safe_malloc(sizeof(AS_CGB_Bubble_List));
    bubble->start = *start;
    bubble->end   = top[i];
    bubble->next  = NULL;

    *link = bubble;
    link  = &(bubble->next);
  }

  /* The key storage is released only after the table that refers to it. */
  DeleteHashTable_AS(starts);
  safe_free(start_keys);

  return head;
}
//  Re-derive the parent, containment flag and hangs of
//  iunitig->f_list[thisFrag] from the overlaps stored in 'ovs'.
//
//  The search proceeds in this order:
//    1) among the fragments placed before thisFrag, an overlap below the
//       consensus error cutoff whose hangs say thisFrag is contained
//       (a_hang >= -slop and b_hang <= slop); the lowest error wins;
//    2) otherwise, among the same fragments, a dovetail overlap with
//       non-negative hangs; the smallest bhang wins;
//    3) steps 1 and 2 are repeated once with slop raised from 0 to 10;
//    4) otherwise, thisFrag is moved to sit immediately after the first
//       later fragment it overlaps with a positive ahang, and the function
//       recurses on whatever fragment now occupies index thisFrag.
//
//  iunitig  unitig whose f_list is updated (and possibly reordered) in place
//  thisFrag index into iunitig->f_list of the fragment to fix
//  ovs      overlap store; its range is reset to thisFrag's IID
//
//  Returns -1 when a parent was assigned (directly or by the recursive call),
//  otherwise thisFrag.  Progress is logged to stderr.
//
int
updateFragmentWithParent(IntUnitigMesg *iunitig, int thisFrag, OverlapStore *ovs) {
  uint32         ovlMax = 0;
  uint32         ovlLen = 0;
  OVSoverlap    *ovl    = NULL;

  int            testFrag = thisFrag - 1;
  int            testOvl  = 0;

  //  Placement on entry; used only in the log messages below.
  int            oldParent = iunitig->f_list[thisFrag].parent;
  int            oldAHang  = iunitig->f_list[thisFrag].ahang;
  int            oldBHang  = iunitig->f_list[thisFrag].bhang;

  uint32         consensusCutoff = AS_OVS_encodeQuality(AS_CNS_ERROR_RATE);

  //  Best candidate found so far; overlap == -1 means "none yet".
  int            contained       = 0;
  int            fragment        = -1;
  int            overlap         = -1;
  int            overlapIdentity = consensusCutoff;
  int            overlapBHang    = AS_READ_MAX_NORMAL_LEN;

  //  ovlBefore / ovlAfter: IID of each fragment before / after thisFrag ->
  //  index into ovl[], or all-ones (read back as -1) while no usable overlap
  //  has been seen.  iidIndex: IID -> index into f_list.
  HashTable_AS  *ovlBefore = CreateScalarHashTable_AS();
  HashTable_AS  *ovlAfter  = CreateScalarHashTable_AS();
  HashTable_AS  *iidIndex  = CreateScalarHashTable_AS();

  int            hangSlop = 0;
  int            failed   = -1;

  fprintf(stderr, "\n");
  fprintf(stderr, "WORKING on fragment %d == %d\n", thisFrag, iunitig->f_list[thisFrag].ident);

  //  Save in the hash table the fragments before/after this one.
  //
  for (testFrag=0; testFrag<iunitig->num_frags; testFrag++) {
    InsertInHashTable_AS(iidIndex,
                         (uint64)iunitig->f_list[testFrag].ident, sizeof(uint64),
                         (uint64)testFrag, 0);

    if (testFrag < thisFrag)
      InsertInHashTable_AS(ovlBefore,
                           (uint64)iunitig->f_list[testFrag].ident, sizeof(uint64),
                           ~(uint64)0, 0);
    if (testFrag > thisFrag)
      InsertInHashTable_AS(ovlAfter,
                           (uint64)iunitig->f_list[testFrag].ident, sizeof(uint64),
                           ~(uint64)0, 0);
  }

  //  Get the overlaps for this fragment.
  //
  AS_OVS_setRangeOverlapStore(ovs, iunitig->f_list[thisFrag].ident, iunitig->f_list[thisFrag].ident);

  if (ovlMax < AS_OVS_numOverlapsInRange(ovs)) {
    ovlMax = AS_OVS_numOverlapsInRange(ovs) * 2;
    ovl    = (OVSoverlap *)safe_realloc(ovl, sizeof(OVSoverlap) * ovlMax);
  }

  ovlLen = 0;

  while (AS_OVS_readOverlapFromStore(ovs, ovl+ovlLen, AS_OVS_TYPE_OVL)) {
    int  aid=0, bid=0;
    int  afwd=0, bfwd=0;
    int  correct=0;

    //  Reorient the overlap so the b_iid is thisFrag.
    //
    {
      AS_IID x = ovl[ovlLen].a_iid;
      ovl[ovlLen].a_iid = ovl[ovlLen].b_iid;
      ovl[ovlLen].b_iid = x;

      if (ovl[ovlLen].dat.ovl.flipped) {
        int x = ovl[ovlLen].dat.ovl.a_hang;
        ovl[ovlLen].dat.ovl.a_hang = ovl[ovlLen].dat.ovl.b_hang;
        ovl[ovlLen].dat.ovl.b_hang = x;
      } else {
        ovl[ovlLen].dat.ovl.a_hang = -ovl[ovlLen].dat.ovl.a_hang;
        ovl[ovlLen].dat.ovl.b_hang = -ovl[ovlLen].dat.ovl.b_hang;
      }
    }

    //  Make sure we get the correct overlap.  We seem to be allowed
    //  to have both an I and an N overlap for a given pair of
    //  fragments.  At least, I hope that's all we're allowed.
    //
    aid = LookupValueInHashTable_AS(iidIndex, (uint64)ovl[ovlLen].a_iid, sizeof(uint64));
    bid = LookupValueInHashTable_AS(iidIndex, (uint64)ovl[ovlLen].b_iid, sizeof(uint64));

    afwd = (iunitig->f_list[aid].position.bgn < iunitig->f_list[aid].position.end);
    bfwd = (iunitig->f_list[bid].position.bgn < iunitig->f_list[bid].position.end);

    //  The overlap orientation must agree with the two fragments' placed
    //  orientations in the unitig.
    if ((afwd == bfwd) && (ovl[ovlLen].dat.ovl.flipped == 0))
      correct = 1;
    if ((afwd != bfwd) && (ovl[ovlLen].dat.ovl.flipped == 1))
      correct = 1;

    if (ExistsInHashTable_AS(ovlBefore, (uint64)ovl[ovlLen].a_iid, sizeof(uint64))) {
      if (correct)
        ReplaceInHashTable_AS(ovlBefore,
                              (uint64)ovl[ovlLen].a_iid, sizeof(uint64),
                              (uint64)ovlLen, 0);
      fprintf(stderr, "%s before overlap for %d (%c) to %d (%c) (" F_S64 "," F_S64 ",%c) at ovl position %d\n",
              correct ? "save" : "skip",
              ovl[ovlLen].a_iid, afwd ? 'F' : 'R',
              ovl[ovlLen].b_iid, bfwd ? 'F' : 'R',
              ovl[ovlLen].dat.ovl.a_hang,
              ovl[ovlLen].dat.ovl.b_hang,
              ovl[ovlLen].dat.ovl.flipped ? 'I' : 'N',
              ovlLen);
    }

    if (ExistsInHashTable_AS(ovlAfter, (uint64)ovl[ovlLen].a_iid, sizeof(uint64))) {
      if (correct)
        ReplaceInHashTable_AS(ovlAfter,
                              (uint64)ovl[ovlLen].a_iid, sizeof(uint64),
                              (uint64)ovlLen, 0);
      fprintf(stderr, "%s after  overlap for %d (%c) to %d (%c) (" F_S64 "," F_S64 ",%c) at ovl position %d\n",
              correct ? "save" : "skip",
              ovl[ovlLen].a_iid, afwd ? 'F' : 'R',
              ovl[ovlLen].b_iid, bfwd ? 'F' : 'R',
              ovl[ovlLen].dat.ovl.a_hang,
              ovl[ovlLen].dat.ovl.b_hang,
              ovl[ovlLen].dat.ovl.flipped ? 'I' : 'N',
              ovlLen);
    }

    ovlLen++;
  }

 tryAgain:

  //  See if we're contained in any of these overlaps.
  if (overlap == -1) {
    for (testFrag=thisFrag-1; testFrag>=0; testFrag--) {
      if (ExistsInHashTable_AS(ovlBefore, (uint64)iunitig->f_list[testFrag].ident, sizeof(uint64))) {
        testOvl = LookupValueInHashTable_AS(ovlBefore, (uint64)iunitig->f_list[testFrag].ident, sizeof(uint64));

        //  Fragment has no overlap
        if (testOvl == -1)
          continue;

        fprintf(stderr, "found testFrag = %d testOvl = %d erates " F_U64 " %u hang " F_S64 " " F_S64 " (CONTAIN) slop=%d\n",
                testFrag, testOvl,
                ovl[testOvl].dat.ovl.orig_erate, consensusCutoff,
                ovl[testOvl].dat.ovl.a_hang,
                ovl[testOvl].dat.ovl.b_hang,
                hangSlop);

        //  Three if's for documentation:
        //    1) If we're an overlap we care about
        //    2) If we're a contained overlap
        //    3) If we're better than what we've seen so far
        //  Then save the overlap
        //
        if (ovl[testOvl].dat.ovl.orig_erate < consensusCutoff) {
          if ((ovl[testOvl].dat.ovl.a_hang >= -hangSlop) &&
              (ovl[testOvl].dat.ovl.b_hang <=  hangSlop)) {
            if (ovl[testOvl].dat.ovl.orig_erate < overlapIdentity) {
              contained       = 1;
              fragment        = testFrag;
              overlap         = testOvl;
              overlapBHang    = 0;
              overlapIdentity = ovl[testOvl].dat.ovl.orig_erate;
            }
          }
        }
      }
    }
  }

  //  If not contained, scan the overlaps again, looking for the
  //  thickest/bestest.  This will be the overlap with the smallest a
  //  or b hang -- depending on the orientation of the parent
  //  fragment.
  //
  //  Instead of working through overlaps, we work through fragments.
  //
  if (overlap == -1) {
    for (testFrag=thisFrag-1; testFrag>=0; testFrag--) {
      if (ExistsInHashTable_AS(ovlBefore, (uint64)iunitig->f_list[testFrag].ident, sizeof(uint64))) {
        int  ahang = 0;
        int  bhang = 0;

        testOvl = LookupValueInHashTable_AS(ovlBefore, (uint64)iunitig->f_list[testFrag].ident, sizeof(uint64));

        //  Fragment has no overlap
        if (testOvl == -1)
          continue;

        //  Overlap is too noisy
        if (ovl[testOvl].dat.ovl.orig_erate >= consensusCutoff)
          continue;

        if (iunitig->f_list[testFrag].position.bgn < iunitig->f_list[testFrag].position.end) {
          ahang = ovl[testOvl].dat.ovl.a_hang;
          bhang = ovl[testOvl].dat.ovl.b_hang;
        } else {
          ahang = -ovl[testOvl].dat.ovl.b_hang;
          bhang = -ovl[testOvl].dat.ovl.a_hang;
        }

        //  Overlap isn't dovetail -- negative ahang
        if (ahang < 0)
          continue;

        //  Overlap isn't dovetail -- containment
        if (bhang < 0)
          continue;

        fprintf(stderr, "found testFrag = %d testOvl = %d erates " F_U64 " %u hang " F_S64 " " F_S64 " (DOVETAIL) slop=%d\n",
                testFrag, testOvl,
                ovl[testOvl].dat.ovl.orig_erate, consensusCutoff,
                ovl[testOvl].dat.ovl.a_hang,
                ovl[testOvl].dat.ovl.b_hang,
                hangSlop);

        if (bhang < overlapBHang) {
          contained       = 0;
          fragment        = testFrag;
          overlap         = testOvl;
          overlapIdentity = ovl[testOvl].dat.ovl.orig_erate;
          overlapBHang    = bhang;
        }
      }
    }
  }

  //  Now, if we have found the parent fragment, update.
  //
  if (overlap >= 0) {
    testOvl  = overlap;
    testFrag = fragment;

    iunitig->f_list[thisFrag].parent = ovl[testOvl].a_iid;

    if (contained)
      iunitig->f_list[thisFrag].contained = iunitig->f_list[thisFrag].parent;
    else
      iunitig->f_list[thisFrag].contained = 0;

    //  Reorient again based on the orientation of the testFrag.
    //
    if (iunitig->f_list[testFrag].position.bgn < iunitig->f_list[testFrag].position.end) {
      //  testFrag is forward
      iunitig->f_list[thisFrag].ahang = ovl[testOvl].dat.ovl.a_hang;
      iunitig->f_list[thisFrag].bhang = ovl[testOvl].dat.ovl.b_hang;
    } else {
      //  testFrag is reverse
      iunitig->f_list[thisFrag].ahang = -ovl[testOvl].dat.ovl.b_hang;
      iunitig->f_list[thisFrag].bhang = -ovl[testOvl].dat.ovl.a_hang;
    }

    //  Report we did something.
    //
    fprintf(stderr, "Updated fragment " F_IID " from " F_IID ",%d,%d to " F_IID ",%d,%d\n",
            iunitig->f_list[thisFrag].ident,
            oldParent, oldAHang, oldBHang,
            iunitig->f_list[thisFrag].parent,
            iunitig->f_list[thisFrag].ahang,
            iunitig->f_list[thisFrag].bhang);

    goto successfullyUpdated;
  }

  //  Otherwise, try to find an overlap again, this time allowing a
  //  bit of slop in the hangs.
  //
  if (hangSlop == 0) {
    hangSlop = 10;
    goto tryAgain;
  }

  //  Now, we're convinced there is no decent overlap between this
  //  fragment and any fragment before it.
  //
  //  Scan forward for the first thing we overlap.

  for (testFrag=thisFrag+1; testFrag < iunitig->num_frags; testFrag++) {
    int  ahang = 0;
    int  bhang = 0;

    testOvl = LookupValueInHashTable_AS(ovlAfter, (uint64)iunitig->f_list[testFrag].ident, sizeof(uint64));

    //  Fragment has no overlap
    if (testOvl == -1)
      continue;

    //  Overlap is too noisy
    if (ovl[testOvl].dat.ovl.orig_erate >= consensusCutoff)
      continue;

    if (iunitig->f_list[testFrag].position.bgn < iunitig->f_list[testFrag].position.end) {
      ahang = ovl[testOvl].dat.ovl.a_hang;
      bhang = ovl[testOvl].dat.ovl.b_hang;
    } else {
      ahang = -ovl[testOvl].dat.ovl.b_hang;
      bhang = -ovl[testOvl].dat.ovl.a_hang;
    }

    //  Don't allow negative ahangs.  At all.  This catches the case
    //  where the parent might be contained in us, and generally makes
    //  consensus happier.
    //
    //  Don't allow empty hangs - this can lead to infinite loops
    //  where we keep swapping the same two fragments.  OK, not
    //  infinite, since we eventually run out of stack space and
    //  crash.
    //
    if (ahang <= 0)
      continue;

    fprintf(stderr, "shifttest ovl=%d testFrag=" F_IID " pos %d-%d thisFrag=" F_IID " pos %d-%d hangs %d,%d\n",
            testOvl,
            iunitig->f_list[testFrag].ident,
            iunitig->f_list[testFrag].position.bgn,
            iunitig->f_list[testFrag].position.end,
            iunitig->f_list[thisFrag].ident,
            iunitig->f_list[thisFrag].position.bgn,
            iunitig->f_list[thisFrag].position.end,
            ahang, bhang);

    //  Slide entries thisFrag+1 .. testFrag down by one slot and drop the
    //  saved copy of thisFrag into the slot freed at testFrag.
    IntMultiPos  fragCopy = iunitig->f_list[thisFrag];

    memmove(iunitig->f_list + thisFrag,
            iunitig->f_list + thisFrag + 1,
            sizeof(IntMultiPos) * (testFrag - thisFrag));

    iunitig->f_list[testFrag] = fragCopy;

    fprintf(stderr, "Shifted fragment " F_IID " from position %d to position %d\n",
            iunitig->f_list[testFrag].ident,
            thisFrag, testFrag);

    //  Since we moved things around, we must process the new fragment
    //  at 'thisFrag's location.
    //
    failed = updateFragmentWithParent(iunitig, thisFrag, ovs);

    if (failed == -1)
      goto successfullyUpdated;

    //  Only the first acceptable later fragment is tried.
    break;
  }

  //  And we failed.  Good luck with this one.
  //
  fprintf(stderr, "Failed to update fragment " F_IID " from " F_IID ",%d,%d.\n",
          iunitig->f_list[thisFrag].ident, oldParent, oldAHang, oldBHang);

  failed = thisFrag;

 successfullyUpdated:
  //  Shared exit path.  All three tables are released here; iidIndex was
  //  previously leaked on every call, including each level of recursion.
  DeleteHashTable_AS(ovlBefore);
  DeleteHashTable_AS(ovlAfter);
  DeleteHashTable_AS(iidIndex);
  safe_free(ovl);

  return(failed);
}
static void PlaceFragments(int32 fid, IntUnitigPos *aiup, CNS_Options *opp) { Fragment *afrag = GetFragment(fragmentStore,fid); Fragment *bfrag = NULL; CNS_AlignedContigElement *belem = GetCNS_AlignedContigElement(fragment_positions, afrag->components); if (afrag->n_components == 0) return; VA_TYPE(int32) *trace = CreateVA_int32(AS_READ_MAX_NORMAL_LEN+1); for (; belem->frg_or_utg == CNS_ELEMENT_IS_FRAGMENT; belem++) { if (FALSE == ExistsInHashTable_AS(fragmentMap, belem->idx.fragment.frgIdent, 0)) // Fragment is not in the contigs f_list. It is an unplaced read from a surrogate. continue; // if it exists in the fragmentMap it should exist in this map as well since it was added at the same time // look up where this fragment is placed within the entire contig, see if that matches where we're about to place it // this is necessary for surrogates that are multiply placed in a single contig for example: // contig: --------------*****--------*****--------> where ***** represents a surrogate // ----> readA // when placing readA within the surrogate unitig, we see if readA belongs in surrogate instance A or B // by computing the position of the unitig within the contig and adding ahang to it // if this computed position matches the position that the IMP record retrieved below tells us, proceed, otherwise skip placement IntMultiPos *bimp = (IntMultiPos *)LookupValueInHashTable_AS(fragmentToIMP, belem->idx.fragment.frgIdent, 0); int32 bbgn = (bimp->position.bgn < bimp->position.end ? bimp->position.bgn : bimp->position.end); int32 abgn = (aiup->position.bgn < aiup->position.end ? aiup->position.bgn : aiup->position.end); int32 fcomplement = afrag->complement; int32 bcomplement = (belem->position.bgn < belem->position.end) ? 
0 : 1; int32 ahang = 0; int32 bhang = 0; int32 ovl = 0; OverlapType otype; // all of fid's component frags will be aligned to it (not to // each other) // // fcomplement==0 fcomplement==1 // // A) fid C) fid // ------------------> <---------------- // ---> <--- // bid (bcomplement==0) bid // // B) fid D) fid // ------------------> <---------------- // <--- ---> // bid (bcomplement==1) bid // // The afrag is a unitig, so b is always contained (length of // overlap is length of belem). if (fcomplement && bcomplement) { ahang = afrag->length - belem->position.bgn; /* Case D */ bhang = belem->position.end - afrag->length; ovl = belem->position.bgn - belem->position.end; } else if (fcomplement && !bcomplement) { ahang = afrag->length - belem->position.end; /* Case C */ bhang = belem->position.bgn - afrag->length; ovl = belem->position.end - belem->position.bgn; } else if (!fcomplement && bcomplement) { ahang = belem->position.end; /* Case B */ bhang = belem->position.bgn - afrag->length; ovl = belem->position.bgn - belem->position.end; } else { ahang = belem->position.bgn; /* Case A */ bhang = belem->position.end - afrag->length; ovl = belem->position.end - belem->position.bgn; } assert(ahang >= 0); assert(bhang <= 0); assert(ovl > 0); if (aiup->num_instances > 1 && abs(ahang + abgn - bbgn) > MAX_SURROGATE_FUDGE_FACTOR) { if (VERBOSE_MULTIALIGN_OUTPUT) fprintf(stderr, "Not placing fragment %d into unitig %d because the positions (%d, %d) do not match (%d, %d)\n", belem->idx.fragment.frgIdent, afrag->iid, bimp->position.bgn, bimp->position.end, ahang + GetColumn(columnStore,(GetBead(beadStore,afrag->firstbead.get() ))->column_index)->ma_index, bhang + GetColumn(columnStore,(GetBead(beadStore,afrag->firstbead.get()+afrag->length-1))->column_index)->ma_index+1); continue; } int32 blid = AppendFragToLocalStore(belem->idx.fragment.frgType, belem->idx.fragment.frgIdent, (bcomplement != fcomplement), belem->idx.fragment.frgContained, AS_OTHER_UNITIG); afrag = 
GetFragment(fragmentStore, fid); // AppendFragToLocalStore can change the pointer on us. bfrag = GetFragment(fragmentStore, blid); if (!GetAlignmentTraceDriver(afrag, NULL, bfrag, &ahang, &bhang, ovl, trace, &otype, GETALIGNTRACE_CONTIGF, 0)) { //if (!GetAlignmentTrace(afrag->lid, 0, blid, &ahang, &bhang, ovl, trace, &otype, DP_Compare, DONT_SHOW_OLAP, 0, AS_CONSENSUS, AS_CNS_ERROR_RATE) && // !GetAlignmentTrace(afrag->lid, 0, blid, &ahang, &bhang, ovl, trace, &otype, Local_Overlap_AS_forCNS, DONT_SHOW_OLAP, 0, AS_CONSENSUS, AS_CNS_ERROR_RATE)) { Bead *afirst = GetBead(beadStore, afrag->firstbead.get() + ahang); Column *col = GetColumn(columnStore, afirst->column_index); MANode *manode = GetMANode(manodeStore, col->ma_id); RefreshMANode(manode->lid, 0, opp, NULL, NULL, 0, 0); // BPW not sure why we need this fprintf(stderr, "Could (really) not find overlap between %d (%c) and %d (%c) estimated ahang: %d (ejecting frag %d from contig)\n", afrag->iid, afrag->type, belem->idx.fragment.frgIdent, belem->idx.fragment.frgType, ahang, belem->idx.fragment.frgIdent); GetFragment(fragmentStore,blid)->deleted = 1; } else { ApplyAlignment(afrag->lid, 0, NULL, blid, ahang, Getint32(trace,0)); } } // over all fragments Delete_VA(trace); }
int main (int argc, char *argv[]) { char *asmFileName = NULL; char *tigStoreName = NULL; uint32 tigStoreVers = 2; int minLength = DEFAULT_UNITIG_LENGTH; int numInstances = DEFAULT_NUM_INSTANCES; int distanceToEnds = DEFAULT_DISTANCE_TO_ENDS; uint32 numToggled = 0; argc = AS_configure(argc, argv); int arg=1; int err=0; while (arg < argc) { if (strcmp(argv[arg], "-a") == 0) { asmFileName = argv[++arg]; } else if (strcmp(argv[arg], "-t") == 0) { tigStoreName = argv[++arg]; tigStoreVers = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-l") == 0) { minLength = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-n") == 0) { numInstances = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-d") == 0) { distanceToEnds = atoi(argv[++arg]); } else { fprintf(stderr, "%s: unknown option '%s'\n", argv[0], argv[arg]); err++; } arg++; } if (minLength <= 0) err++; if (numInstances < 0) err++; if (distanceToEnds <= 0) err++; if ((asmFileName == NULL) || (tigStoreName == NULL) || (err > 0)) { fprintf(stderr, "usage: %s -a asmFile -t tigStore version [-l minLength] [-n numInstances] [-d distanceToEnd]\n", argv[0]); fprintf(stderr, "\n"); fprintf(stderr, " -a asmFile path to the assembly .asm file\n"); fprintf(stderr, " -t tigStore version path to the tigStore and version to modify\n"); fprintf(stderr, " -l minLength minimum size of a unitig to be toggled, default=%d)\n", DEFAULT_UNITIG_LENGTH); fprintf(stderr, " -n numInstances number of instances of a surrogate that is toggled, default = %d\n", DEFAULT_NUM_INSTANCES); fprintf(stderr, " -d distanceToEnd max number of bases the surrogate can be from the end of a scaffold for toggling, default = %d\n", DEFAULT_DISTANCE_TO_ENDS); fprintf(stderr, "\n"); fprintf(stderr, " Labels surrogate unitigs as non-repeat if they match any of the following conditions:\n"); fprintf(stderr, " 1. the unitig meets all the -l, -n and -d conditions\n"); fprintf(stderr, " 2. When -n = 0, all surrogate unitigs with more than one read\n"); fprintf(stderr, " 3. 
the unitig appears exactly twice, within '-d' bases from the end of a scaffold\n"); exit(1); } HashTable_AS *UIDtoIID = CreateScalarHashTable_AS(); HashTable_AS *CTGtoFirstUTG = CreateScalarHashTable_AS(); HashTable_AS *CTGtoLastUTG = CreateScalarHashTable_AS(); VA_TYPE(int32) *unitigLength = CreateVA_int32(8192); VA_TYPE(uint32) *surrogateCount = CreateVA_uint32(8192); VA_TYPE(uint32) *surrogateAtScaffoldEnds = CreateVA_uint32(8192); GenericMesg *pmesg; FILE *infp = fopen(asmFileName, "r"); while ((EOF != ReadProtoMesg_AS(infp, &pmesg))) { SnapUnitigMesg *utg = NULL; SnapConConMesg *ctg = NULL; SnapScaffoldMesg *scf = NULL; uint32 count = 0; uint32 forward = TRUE; uint32 lastCtg = 0; switch(pmesg->t) { case MESG_UTG: utg = (SnapUnitigMesg*)(pmesg->m); Setint32(unitigLength, utg->iaccession, &utg->length); if (utg->length >= minLength && (utg->status == AS_NOTREZ || utg->status == AS_SEP)) { // store the mapping for this unitig's UID to IID and initialize it's instance counter at 0 count = 0; InsertInHashTable_AS(UIDtoIID, AS_UID_toInteger(utg->eaccession), 0, (uint64)utg->iaccession, 0); Setuint32(surrogateCount, utg->iaccession, &count); } break; case MESG_CCO: ctg = (SnapConConMesg *)(pmesg->m); for (int32 i = 0; i < ctg->num_unitigs; i++) { // increment the surrogate unitigs instance counter if (ExistsInHashTable_AS(UIDtoIID, AS_UID_toInteger(ctg->unitigs[i].eident), 0)) { uint32 *ret = Getuint32(surrogateCount, (uint32) LookupValueInHashTable_AS(UIDtoIID, AS_UID_toInteger(ctg->unitigs[i].eident), 0)); assert(ret != NULL); (*ret)++; // store first surrogate in a contig if (!ExistsInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(ctg->eaccession), 0) && MIN(ctg->unitigs[i].position.bgn, ctg->unitigs[i].position.end) < distanceToEnds) { InsertInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(ctg->eaccession), 0, LookupValueInHashTable_AS(UIDtoIID, AS_UID_toInteger(ctg->unitigs[i].eident), 0), 0); } // also store the last if ((ctg->length - 
MAX(ctg->unitigs[i].position.bgn, ctg->unitigs[i].position.end)) < distanceToEnds) { ReplaceInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(ctg->eaccession), 0, LookupValueInHashTable_AS(UIDtoIID, AS_UID_toInteger(ctg->unitigs[i].eident), 0), 0); } } } break; case MESG_SCF: scf = (SnapScaffoldMesg *)(pmesg->m); count = scf->iaccession; if (scf->contig_pairs[0].orient.isAnti() || scf->contig_pairs[0].orient.isOuttie()) { forward = FALSE; } lastCtg = MAX(scf->num_contig_pairs - 1, 0); // All four cases below follow the same pattern // The first time a surrogate is found at the end of a scaffold, we record the scaffold ID // When the surrogate is seen at the end of a second scaffold, we record that it has been found at the ends of two scaffolds (UINT32_MAX) // If the surrogate is seen more than once in a single scaffold, it is eliminated (it can't connect two scaffolds) // If the surrogate is only seen once at the end of a scaffold (and again in the middle), it is eliminated // 1. Contig is first in scaffold and is forward, take the surrogate from the beginning of contig, if it exists if (ExistsInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(scf->contig_pairs[0].econtig1), 0) && forward) { uint32 *myval = Getuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(scf->contig_pairs[0].econtig1), 0)); if (myval != NULL && (*myval) == scf->iaccession) { count = 0; } else if (myval != NULL && (*myval) != 0 && (*myval) != scf->iaccession) { count = UINT32_MAX; } Setuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(scf->contig_pairs[0].econtig1), 0), &count); count = scf->iaccession; } // 2. 
Contig is last in scaffold and is reversed, take the surrogate from the beginning of the contig, if it exists if (ExistsInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(scf->contig_pairs[lastCtg].econtig2), 0) && !forward) { uint32 *myval = Getuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(scf->contig_pairs[lastCtg].econtig2), 0)); if (myval != NULL && (*myval) == scf->iaccession) { count = 0; } else if (myval != NULL && (*myval) != 0 && (*myval) != scf->iaccession) { count = UINT32_MAX; } Setuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(scf->contig_pairs[lastCtg].econtig2), 0), &count); count = scf->iaccession; } // 3. Contig is first in scaffold and is reversed, take the surrogate from the end of the contig, if it exists if (ExistsInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(scf->contig_pairs[0].econtig1), 0) && !forward) { uint32 *myval = Getuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(scf->contig_pairs[0].econtig1), 0)); if (myval != NULL && (*myval) == scf->iaccession) { count = 0; } else if (myval != NULL && (*myval) != 0 && (*myval) != scf->iaccession) { count = UINT32_MAX; } Setuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(scf->contig_pairs[0].econtig1), 0), &count); count = scf->iaccession; } // 4. 
Contig is last in scaffold and is forward, take the surrogate from the end of the contig, if it exists if (ExistsInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(scf->contig_pairs[lastCtg].econtig2), 0) && forward) { uint32 *myval = Getuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(scf->contig_pairs[lastCtg].econtig2), 0)); if (myval != NULL && (*myval) == scf->iaccession) { count = 0; } else if (myval != NULL && (*myval) != 0 && (*myval) != scf->iaccession) { count = UINT32_MAX; } Setuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(scf->contig_pairs[lastCtg].econtig2), 0), &count); count = scf->iaccession; } break; default: break; } } fclose(infp); uint32 *ret = NULL; uint32 *atScfEnd = NULL; // open the tig store for in-place writing (we don't increment the version since CGW always reads a fixed version initially) // this also removes any partitioning MultiAlignStore *tigStore = new MultiAlignStore(tigStoreName, tigStoreVers, 0, 0, TRUE, TRUE); for (uint32 i = 0; i < tigStore->numUnitigs(); i++) { uint32 *ret = Getuint32(surrogateCount, i); uint32 *atScfEnd = Getuint32(surrogateAtScaffoldEnds, i); uint32 *length = Getuint32(unitigLength, i); bool toggled = false; if (ret != NULL && (*ret) == (uint32)numInstances && numInstances != 0) { toggled = TRUE; } // if we find a surrogate that has two instances and it is at scaffold ends mark toggle it as well else if (ret != NULL && (*ret) == NUM_INSTANCES_AT_SCAFFOLD_ENDS && atScfEnd != NULL && (*atScfEnd) == UINT32_MAX) { toggled = TRUE; } // special case, mark non-singleton unitigs as unique if we are given no instances else if (numInstances == 0 && (length != NULL && (*length) >= minLength) && tigStore->getNumFrags(i, TRUE) > 1) { toggled = TRUE; } if (toggled) { tigStore->setUnitigFUR(i, AS_FORCED_UNIQUE); numToggled++; } } DeleteHashTable_AS(UIDtoIID); DeleteHashTable_AS(CTGtoFirstUTG); DeleteHashTable_AS(CTGtoLastUTG); 
delete tigStore; fprintf(stderr, "Toggled %d\n", numToggled); return 0; }