int updateFragmentWithParent(IntUnitigMesg *iunitig, int thisFrag, OverlapStore *ovs) { uint32 ovlMax = 0; uint32 ovlLen = 0; OVSoverlap *ovl = NULL; int testFrag = thisFrag - 1; int testOvl = 0; int oldParent = iunitig->f_list[thisFrag].parent; int oldAHang = iunitig->f_list[thisFrag].ahang; int oldBHang = iunitig->f_list[thisFrag].bhang; uint32 consensusCutoff = AS_OVS_encodeQuality(AS_CNS_ERROR_RATE); int contained = 0; int fragment = -1; int overlap = -1; int overlapIdentity = consensusCutoff; int overlapBHang = AS_READ_MAX_NORMAL_LEN; HashTable_AS *ovlBefore = CreateScalarHashTable_AS(); HashTable_AS *ovlAfter = CreateScalarHashTable_AS(); HashTable_AS *iidIndex = CreateScalarHashTable_AS(); int hangSlop = 0; int failed = -1; fprintf(stderr, "\n"); fprintf(stderr, "WORKING on fragment %d == %d\n", thisFrag, iunitig->f_list[thisFrag].ident); // Save in the hash table the fragments before/after this one. // for (testFrag=0; testFrag<iunitig->num_frags; testFrag++) { InsertInHashTable_AS(iidIndex, (uint64)iunitig->f_list[testFrag].ident, sizeof(uint64), (uint64)testFrag, 0); if (testFrag < thisFrag) InsertInHashTable_AS(ovlBefore, (uint64)iunitig->f_list[testFrag].ident, sizeof(uint64), ~(uint64)0, 0); if (testFrag > thisFrag) InsertInHashTable_AS(ovlAfter, (uint64)iunitig->f_list[testFrag].ident, sizeof(uint64), ~(uint64)0, 0); } // Get the overlaps for this fragment. // AS_OVS_setRangeOverlapStore(ovs, iunitig->f_list[thisFrag].ident, iunitig->f_list[thisFrag].ident); if (ovlMax < AS_OVS_numOverlapsInRange(ovs)) { ovlMax = AS_OVS_numOverlapsInRange(ovs) * 2; ovl = (OVSoverlap *)safe_realloc(ovl, sizeof(OVSoverlap) * ovlMax); } ovlLen = 0; while (AS_OVS_readOverlapFromStore(ovs, ovl+ovlLen, AS_OVS_TYPE_OVL)) { int aid=0, bid=0; int afwd=0, bfwd=0; int correct=0; // Reorient the overlap so the b_iid is thisFrag. // { AS_IID x = ovl[ovlLen].a_iid; ovl[ovlLen].a_iid = ovl[ovlLen].b_iid; ovl[ovlLen].b_iid = x; if (ovl[ovlLen].dat.ovl.flipped) { int x = ovl[ovlLen].dat.ovl.a_hang; ovl[ovlLen].dat.ovl.a_hang = ovl[ovlLen].dat.ovl.b_hang; ovl[ovlLen].dat.ovl.b_hang = x; } else { ovl[ovlLen].dat.ovl.a_hang = -ovl[ovlLen].dat.ovl.a_hang; ovl[ovlLen].dat.ovl.b_hang = -ovl[ovlLen].dat.ovl.b_hang; } } // Make sure we get the correct overlap. We seem to be allowed // to have both an I and an N overlap for a given pair of // fragments. At least, I hope that's all we're allowed. // aid = LookupValueInHashTable_AS(iidIndex, (uint64)ovl[ovlLen].a_iid, sizeof(uint64)); bid = LookupValueInHashTable_AS(iidIndex, (uint64)ovl[ovlLen].b_iid, sizeof(uint64)); afwd = (iunitig->f_list[aid].position.bgn < iunitig->f_list[aid].position.end); bfwd = (iunitig->f_list[bid].position.bgn < iunitig->f_list[bid].position.end); if ((afwd == bfwd) && (ovl[ovlLen].dat.ovl.flipped == 0)) correct = 1; if ((afwd != bfwd) && (ovl[ovlLen].dat.ovl.flipped == 1)) correct = 1; if (ExistsInHashTable_AS(ovlBefore, (uint64)ovl[ovlLen].a_iid, sizeof(uint64))) { if (correct) ReplaceInHashTable_AS(ovlBefore, (uint64)ovl[ovlLen].a_iid, sizeof(uint64), (uint64)ovlLen, 0); fprintf(stderr, "%s before overlap for %d (%c) to %d (%c) ("F_S64","F_S64",%c) at ovl position %d\n", correct ? "save" : "skip", ovl[ovlLen].a_iid, afwd ? 'F' : 'R', ovl[ovlLen].b_iid, bfwd ? 'F' : 'R', ovl[ovlLen].dat.ovl.a_hang, ovl[ovlLen].dat.ovl.b_hang, ovl[ovlLen].dat.ovl.flipped ? 'I' : 'N', ovlLen); } if (ExistsInHashTable_AS(ovlAfter, (uint64)ovl[ovlLen].a_iid, sizeof(uint64))) { if (correct) ReplaceInHashTable_AS(ovlAfter, (uint64)ovl[ovlLen].a_iid, sizeof(uint64), (uint64)ovlLen, 0); fprintf(stderr, "%s after overlap for %d (%c) to %d (%c) ("F_S64","F_S64",%c) at ovl position %d\n", correct ? "save" : "skip", ovl[ovlLen].a_iid, afwd ? 'F' : 'R', ovl[ovlLen].b_iid, bfwd ? 'F' : 'R', ovl[ovlLen].dat.ovl.a_hang, ovl[ovlLen].dat.ovl.b_hang, ovl[ovlLen].dat.ovl.flipped ? 'I' : 'N', ovlLen); } ovlLen++; } tryAgain: // See if we're contained in any of these overlaps. if (overlap == -1) { for (testFrag=thisFrag-1; testFrag>=0; testFrag--) { if (ExistsInHashTable_AS(ovlBefore, (uint64)iunitig->f_list[testFrag].ident, sizeof(uint64))) { testOvl = LookupValueInHashTable_AS(ovlBefore, (uint64)iunitig->f_list[testFrag].ident, sizeof(uint64)); // Fragment has no overlap if (testOvl == -1) continue; fprintf(stderr, "found testFrag = %d testOvl = %d erates "F_U64" %u hang "F_S64" "F_S64" (CONTAIN) slop=%d\n", testFrag, testOvl, ovl[testOvl].dat.ovl.orig_erate, consensusCutoff, ovl[testOvl].dat.ovl.a_hang, ovl[testOvl].dat.ovl.b_hang, hangSlop); // Three if's for documentation: // 1) If we're an overlap we care about // 2) If we're a contained overlap // 3) If we're better than what we've seen so far // Then save the overlap // if (ovl[testOvl].dat.ovl.orig_erate < consensusCutoff) { if ((ovl[testOvl].dat.ovl.a_hang >= -hangSlop) && (ovl[testOvl].dat.ovl.b_hang <= hangSlop)) { if (ovl[testOvl].dat.ovl.orig_erate < overlapIdentity) { contained = 1; fragment = testFrag; overlap = testOvl; overlapBHang = 0; overlapIdentity = ovl[testOvl].dat.ovl.orig_erate; } } } } } } // If not contained, scan the overlaps again, looking for the // thickest/bestest. This will be the overlap with the smallest a // or b hang -- depending on the orientation of the parent // fragment. // // Instead of working through overlaps, we work through fragments. // if (overlap == -1) { for (testFrag=thisFrag-1; testFrag>=0; testFrag--) { if (ExistsInHashTable_AS(ovlBefore, (uint64)iunitig->f_list[testFrag].ident, sizeof(uint64))) { int ahang = 0; int bhang = 0; testOvl = LookupValueInHashTable_AS(ovlBefore, (uint64)iunitig->f_list[testFrag].ident, sizeof(uint64)); // Fragment has no overlap if (testOvl == -1) continue; // Overlap is too noisy if (ovl[testOvl].dat.ovl.orig_erate >= consensusCutoff) continue; if (iunitig->f_list[testFrag].position.bgn < iunitig->f_list[testFrag].position.end) { ahang = ovl[testOvl].dat.ovl.a_hang; bhang = ovl[testOvl].dat.ovl.b_hang; } else { ahang = -ovl[testOvl].dat.ovl.b_hang; bhang = -ovl[testOvl].dat.ovl.a_hang; } // Overlap isn't dovetail -- negative ahang if (ahang < 0) continue; // Overlap isn't dovetail -- containment if (bhang < 0) continue; fprintf(stderr, "found testFrag = %d testOvl = %d erates "F_U64" %u hang "F_S64" "F_S64" (DOVETAIL) slop=%d\n", testFrag, testOvl, ovl[testOvl].dat.ovl.orig_erate, consensusCutoff, ovl[testOvl].dat.ovl.a_hang, ovl[testOvl].dat.ovl.b_hang, hangSlop); if (bhang < overlapBHang) { contained = 0; fragment = testFrag; overlap = testOvl; overlapIdentity = ovl[testOvl].dat.ovl.orig_erate; overlapBHang = bhang; } } } } // Now, if we have found the parent fragment, update. // if (overlap >= 0) { testOvl = overlap; testFrag = fragment; iunitig->f_list[thisFrag].parent = ovl[testOvl].a_iid; if (contained) iunitig->f_list[thisFrag].contained = iunitig->f_list[thisFrag].parent; else iunitig->f_list[thisFrag].contained = 0; // Reorient again based on the orientation of the testFrag. // if (iunitig->f_list[testFrag].position.bgn < iunitig->f_list[testFrag].position.end) { // testFrag is forward iunitig->f_list[thisFrag].ahang = ovl[testOvl].dat.ovl.a_hang; iunitig->f_list[thisFrag].bhang = ovl[testOvl].dat.ovl.b_hang; } else { // testFrag is reverse iunitig->f_list[thisFrag].ahang = -ovl[testOvl].dat.ovl.b_hang; iunitig->f_list[thisFrag].bhang = -ovl[testOvl].dat.ovl.a_hang; } // Report we did something. // fprintf(stderr, "Updated fragment "F_IID" from "F_IID",%d,%d to "F_IID",%d,%d\n", iunitig->f_list[thisFrag].ident, oldParent, oldAHang, oldBHang, iunitig->f_list[thisFrag].parent, iunitig->f_list[thisFrag].ahang, iunitig->f_list[thisFrag].bhang); goto successfullyUpdated; } // Otherwise, try to find an overlap again, this time allowing a // bit of slop in the hangs. // if (hangSlop == 0) { hangSlop = 10; goto tryAgain; } // Now, we're convinced there is no decent overlap between this // fragment and any fragment before it. // // Scan forward for the first thing we overlap. for (testFrag=thisFrag+1; testFrag < iunitig->num_frags; testFrag++) { int ahang = 0; int bhang = 0; testOvl = LookupValueInHashTable_AS(ovlAfter, (uint64)iunitig->f_list[testFrag].ident, sizeof(uint64)); // Fragment has no overlap if (testOvl == -1) continue; // Overlap is too noisy if (ovl[testOvl].dat.ovl.orig_erate >= consensusCutoff) continue; if (iunitig->f_list[testFrag].position.bgn < iunitig->f_list[testFrag].position.end) { ahang = ovl[testOvl].dat.ovl.a_hang; bhang = ovl[testOvl].dat.ovl.b_hang; } else { ahang = -ovl[testOvl].dat.ovl.b_hang; bhang = -ovl[testOvl].dat.ovl.a_hang; } // Don't allow negative ahangs. At all. This catches the case // where the parent might be contained in us, and generally makes // consensus happier. // // Don't allow empty hangs - this can lead to infinite loops // where we keep swapping the same two fragments. OK, not // infinite, since we eventually run out of stack space and // crash. // if (ahang <= 0) continue; fprintf(stderr, "shifttest ovl=%d testFrag="F_IID" pos %d-%d thisFrag="F_IID" pos %d-%d hangs %d,%d\n", testOvl, iunitig->f_list[testFrag].ident, iunitig->f_list[testFrag].position.bgn, iunitig->f_list[testFrag].position.end, iunitig->f_list[thisFrag].ident, iunitig->f_list[thisFrag].position.bgn, iunitig->f_list[thisFrag].position.end, ahang, bhang); IntMultiPos fragCopy = iunitig->f_list[thisFrag]; memmove(iunitig->f_list + thisFrag, iunitig->f_list + thisFrag + 1, sizeof(IntMultiPos) * (testFrag - thisFrag)); iunitig->f_list[testFrag] = fragCopy; fprintf(stderr, "Shifted fragment "F_IID" from position %d to position %d\n", iunitig->f_list[testFrag].ident, thisFrag, testFrag); // Since we moved things around, we must process the new fragment // at 'thisFrag's location. // failed = updateFragmentWithParent(iunitig, thisFrag, ovs); if (failed == -1) goto successfullyUpdated; break; } // And we failed. Good luck with this one. // fprintf(stderr, "Failed to update fragment "F_IID" from "F_IID",%d,%d.\n", iunitig->f_list[thisFrag].ident, oldParent, oldAHang, oldBHang); failed = thisFrag; successfullyUpdated: DeleteHashTable_AS(ovlBefore); DeleteHashTable_AS(ovlAfter); safe_free(ovl); return(failed); }
int main (int argc, char *argv[]) { char *asmFileName = NULL; char *tigStoreName = NULL; uint32 tigStoreVers = 2; int minLength = DEFAULT_UNITIG_LENGTH; int numInstances = DEFAULT_NUM_INSTANCES; int distanceToEnds = DEFAULT_DISTANCE_TO_ENDS; uint32 numToggled = 0; argc = AS_configure(argc, argv); int arg=1; int err=0; while (arg < argc) { if (strcmp(argv[arg], "-a") == 0) { asmFileName = argv[++arg]; } else if (strcmp(argv[arg], "-t") == 0) { tigStoreName = argv[++arg]; tigStoreVers = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-l") == 0) { minLength = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-n") == 0) { numInstances = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-d") == 0) { distanceToEnds = atoi(argv[++arg]); } else { fprintf(stderr, "%s: unknown option '%s'\n", argv[0], argv[arg]); err++; } arg++; } if (minLength <= 0) err++; if (numInstances < 0) err++; if (distanceToEnds <= 0) err++; if ((asmFileName == NULL) || (tigStoreName == NULL) || (err > 0)) { fprintf(stderr, "usage: %s -a asmFile -t tigStore version [-l minLength] [-n numInstances] [-d distanceToEnd]\n", argv[0]); fprintf(stderr, "\n"); fprintf(stderr, " -a asmFile path to the assembly .asm file\n"); fprintf(stderr, " -t tigStore version path to the tigStore and version to modify\n"); fprintf(stderr, " -l minLength minimum size of a unitig to be toggled, default=%d)\n", DEFAULT_UNITIG_LENGTH); fprintf(stderr, " -n numInstances number of instances of a surrogate that is toggled, default = %d\n", DEFAULT_NUM_INSTANCES); fprintf(stderr, " -d distanceToEnd max number of bases the surrogate can be from the end of a scaffold for toggling, default = %d\n", DEFAULT_DISTANCE_TO_ENDS); fprintf(stderr, "\n"); fprintf(stderr, " Labels surrogate unitigs as non-repeat if they match any of the following conditions:\n"); fprintf(stderr, " 1. the unitig meets all the -l, -n and -d conditions\n"); fprintf(stderr, " 2. When -n = 0, all surrogate unitigs with more than one read\n"); fprintf(stderr, " 3. the unitig appears exactly twice, within '-d' bases from the end of a scaffold\n"); exit(1); } HashTable_AS *UIDtoIID = CreateScalarHashTable_AS(); HashTable_AS *CTGtoFirstUTG = CreateScalarHashTable_AS(); HashTable_AS *CTGtoLastUTG = CreateScalarHashTable_AS(); VA_TYPE(int32) *unitigLength = CreateVA_int32(8192); VA_TYPE(uint32) *surrogateCount = CreateVA_uint32(8192); VA_TYPE(uint32) *surrogateAtScaffoldEnds = CreateVA_uint32(8192); GenericMesg *pmesg; FILE *infp = fopen(asmFileName, "r"); while ((EOF != ReadProtoMesg_AS(infp, &pmesg))) { SnapUnitigMesg *utg = NULL; SnapConConMesg *ctg = NULL; SnapScaffoldMesg *scf = NULL; uint32 count = 0; uint32 forward = TRUE; uint32 lastCtg = 0; switch(pmesg->t) { case MESG_UTG: utg = (SnapUnitigMesg*)(pmesg->m); Setint32(unitigLength, utg->iaccession, &utg->length); if (utg->length >= minLength && (utg->status == AS_NOTREZ || utg->status == AS_SEP)) { // store the mapping for this unitig's UID to IID and initialize it's instance counter at 0 count = 0; InsertInHashTable_AS(UIDtoIID, AS_UID_toInteger(utg->eaccession), 0, (uint64)utg->iaccession, 0); Setuint32(surrogateCount, utg->iaccession, &count); } break; case MESG_CCO: ctg = (SnapConConMesg *)(pmesg->m); for (int32 i = 0; i < ctg->num_unitigs; i++) { // increment the surrogate unitigs instance counter if (ExistsInHashTable_AS(UIDtoIID, AS_UID_toInteger(ctg->unitigs[i].eident), 0)) { uint32 *ret = Getuint32(surrogateCount, (uint32) LookupValueInHashTable_AS(UIDtoIID, AS_UID_toInteger(ctg->unitigs[i].eident), 0)); assert(ret != NULL); (*ret)++; // store first surrogate in a contig if (!ExistsInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(ctg->eaccession), 0) && MIN(ctg->unitigs[i].position.bgn, ctg->unitigs[i].position.end) < distanceToEnds) { InsertInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(ctg->eaccession), 0, LookupValueInHashTable_AS(UIDtoIID, AS_UID_toInteger(ctg->unitigs[i].eident), 0), 0); } // also store the last if ((ctg->length - MAX(ctg->unitigs[i].position.bgn, ctg->unitigs[i].position.end)) < distanceToEnds) { ReplaceInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(ctg->eaccession), 0, LookupValueInHashTable_AS(UIDtoIID, AS_UID_toInteger(ctg->unitigs[i].eident), 0), 0); } } } break; case MESG_SCF: scf = (SnapScaffoldMesg *)(pmesg->m); count = scf->iaccession; if (scf->contig_pairs[0].orient.isAnti() || scf->contig_pairs[0].orient.isOuttie()) { forward = FALSE; } lastCtg = MAX(scf->num_contig_pairs - 1, 0); // All four cases below follow the same pattern // The first time a surrogate is found at the end of a scaffold, we record the scaffold ID // When the surrogate is seen at the end of a second scaffold, we record that it has been found at the ends of two scaffolds (UINT32_MAX) // If the surrogate is seen more than once in a single scaffold, it is eliminated (it can't connect two scaffolds) // If the surrogate is only seen once at the end of a scaffold (and again in the middle), it is eliminated // 1. Contig is first in scaffold and is forward, take the surrogate from the beginning of contig, if it exists if (ExistsInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(scf->contig_pairs[0].econtig1), 0) && forward) { uint32 *myval = Getuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(scf->contig_pairs[0].econtig1), 0)); if (myval != NULL && (*myval) == scf->iaccession) { count = 0; } else if (myval != NULL && (*myval) != 0 && (*myval) != scf->iaccession) { count = UINT32_MAX; } Setuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(scf->contig_pairs[0].econtig1), 0), &count); count = scf->iaccession; } // 2. Contig is last in scaffold and is reversed, take the surrogate from the beginning of the contig, if it exists if (ExistsInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(scf->contig_pairs[lastCtg].econtig2), 0) && !forward) { uint32 *myval = Getuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(scf->contig_pairs[lastCtg].econtig2), 0)); if (myval != NULL && (*myval) == scf->iaccession) { count = 0; } else if (myval != NULL && (*myval) != 0 && (*myval) != scf->iaccession) { count = UINT32_MAX; } Setuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(scf->contig_pairs[lastCtg].econtig2), 0), &count); count = scf->iaccession; } // 3. Contig is first in scaffold and is reversed, take the surrogate from the end of the contig, if it exists if (ExistsInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(scf->contig_pairs[0].econtig1), 0) && !forward) { uint32 *myval = Getuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(scf->contig_pairs[0].econtig1), 0)); if (myval != NULL && (*myval) == scf->iaccession) { count = 0; } else if (myval != NULL && (*myval) != 0 && (*myval) != scf->iaccession) { count = UINT32_MAX; } Setuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(scf->contig_pairs[0].econtig1), 0), &count); count = scf->iaccession; } // 4. Contig is last in scaffold and is forward, take the surrogate from the end of the contig, if it exists if (ExistsInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(scf->contig_pairs[lastCtg].econtig2), 0) && forward) { uint32 *myval = Getuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(scf->contig_pairs[lastCtg].econtig2), 0)); if (myval != NULL && (*myval) == scf->iaccession) { count = 0; } else if (myval != NULL && (*myval) != 0 && (*myval) != scf->iaccession) { count = UINT32_MAX; } Setuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(scf->contig_pairs[lastCtg].econtig2), 0), &count); count = scf->iaccession; } break; default: break; } } fclose(infp); uint32 *ret = NULL; uint32 *atScfEnd = NULL; // open the tig store for in-place writing (we don't increment the version since CGW always reads a fixed version initially) // this also removes any partitioning MultiAlignStore *tigStore = new MultiAlignStore(tigStoreName, tigStoreVers, 0, 0, TRUE, TRUE); for (uint32 i = 0; i < tigStore->numUnitigs(); i++) { uint32 *ret = Getuint32(surrogateCount, i); uint32 *atScfEnd = Getuint32(surrogateAtScaffoldEnds, i); uint32 *length = Getuint32(unitigLength, i); bool toggled = false; if (ret != NULL && (*ret) == (uint32)numInstances && numInstances != 0) { toggled = TRUE; } // if we find a surrogate that has two instances and it is at scaffold ends mark toggle it as well else if (ret != NULL && (*ret) == NUM_INSTANCES_AT_SCAFFOLD_ENDS && atScfEnd != NULL && (*atScfEnd) == UINT32_MAX) { toggled = TRUE; } // special case, mark non-singleton unitigs as unique if we are given no instances else if (numInstances == 0 && (length != NULL && (*length) >= minLength) && tigStore->getNumFrags(i, TRUE) > 1) { toggled = TRUE; } if (toggled) { tigStore->setUnitigFUR(i, AS_FORCED_UNIQUE); numToggled++; } } DeleteHashTable_AS(UIDtoIID); DeleteHashTable_AS(CTGtoFirstUTG); DeleteHashTable_AS(CTGtoLastUTG); delete tigStore; fprintf(stderr, "Toggled %d\n", numToggled); return 0; }