static void PlaceFragments(int32 fid, IntUnitigPos *aiup, CNS_Options *opp) { Fragment *afrag = GetFragment(fragmentStore,fid); Fragment *bfrag = NULL; CNS_AlignedContigElement *belem = GetCNS_AlignedContigElement(fragment_positions, afrag->components); if (afrag->n_components == 0) return; VA_TYPE(int32) *trace = CreateVA_int32(AS_READ_MAX_NORMAL_LEN+1); for (; belem->frg_or_utg == CNS_ELEMENT_IS_FRAGMENT; belem++) { if (FALSE == ExistsInHashTable_AS(fragmentMap, belem->idx.fragment.frgIdent, 0)) // Fragment is not in the contigs f_list. It is an unplaced read from a surrogate. continue; // if it exists in the fragmentMap it should exist in this map as well since it was added at the same time // look up where this fragment is placed within the entire contig, see if that matches where we're about to place it // this is necessary for surrogates that are multiply placed in a single contig for example: // contig: --------------*****--------*****--------> where ***** represents a surrogate // ----> readA // when placing readA within the surrogate unitig, we see if readA belongs in surrogate instance A or B // by computing the position of the unitig within the contig and adding ahang to it // if this computed position matches the position that the IMP record retrieved below tells us, proceed, otherwise skip placement IntMultiPos *bimp = (IntMultiPos *)LookupValueInHashTable_AS(fragmentToIMP, belem->idx.fragment.frgIdent, 0); int32 bbgn = (bimp->position.bgn < bimp->position.end ? bimp->position.bgn : bimp->position.end); int32 abgn = (aiup->position.bgn < aiup->position.end ? aiup->position.bgn : aiup->position.end); int32 fcomplement = afrag->complement; int32 bcomplement = (belem->position.bgn < belem->position.end) ? 0 : 1; int32 ahang = 0; int32 bhang = 0; int32 ovl = 0; OverlapType otype; // all of fid's component frags will be aligned to it (not to // each other) // // fcomplement==0 fcomplement==1 // // A) fid C) fid // ------------------> <---------------- // ---> <--- // bid (bcomplement==0) bid // // B) fid D) fid // ------------------> <---------------- // <--- ---> // bid (bcomplement==1) bid // // The afrag is a unitig, so b is always contained (length of // overlap is length of belem). if (fcomplement && bcomplement) { ahang = afrag->length - belem->position.bgn; /* Case D */ bhang = belem->position.end - afrag->length; ovl = belem->position.bgn - belem->position.end; } else if (fcomplement && !bcomplement) { ahang = afrag->length - belem->position.end; /* Case C */ bhang = belem->position.bgn - afrag->length; ovl = belem->position.end - belem->position.bgn; } else if (!fcomplement && bcomplement) { ahang = belem->position.end; /* Case B */ bhang = belem->position.bgn - afrag->length; ovl = belem->position.bgn - belem->position.end; } else { ahang = belem->position.bgn; /* Case A */ bhang = belem->position.end - afrag->length; ovl = belem->position.end - belem->position.bgn; } assert(ahang >= 0); assert(bhang <= 0); assert(ovl > 0); if (aiup->num_instances > 1 && abs(ahang + abgn - bbgn) > MAX_SURROGATE_FUDGE_FACTOR) { if (VERBOSE_MULTIALIGN_OUTPUT) fprintf(stderr, "Not placing fragment %d into unitig %d because the positions (%d, %d) do not match (%d, %d)\n", belem->idx.fragment.frgIdent, afrag->iid, bimp->position.bgn, bimp->position.end, ahang + GetColumn(columnStore,(GetBead(beadStore,afrag->firstbead.get() ))->column_index)->ma_index, bhang + GetColumn(columnStore,(GetBead(beadStore,afrag->firstbead.get()+afrag->length-1))->column_index)->ma_index+1); continue; } int32 blid = AppendFragToLocalStore(belem->idx.fragment.frgType, belem->idx.fragment.frgIdent, (bcomplement != fcomplement), belem->idx.fragment.frgContained, AS_OTHER_UNITIG); afrag = GetFragment(fragmentStore, fid); // AppendFragToLocalStore can change the pointer on us. bfrag = GetFragment(fragmentStore, blid); if (!GetAlignmentTraceDriver(afrag, NULL, bfrag, &ahang, &bhang, ovl, trace, &otype, GETALIGNTRACE_CONTIGF, 0)) { //if (!GetAlignmentTrace(afrag->lid, 0, blid, &ahang, &bhang, ovl, trace, &otype, DP_Compare, DONT_SHOW_OLAP, 0, AS_CONSENSUS, AS_CNS_ERROR_RATE) && // !GetAlignmentTrace(afrag->lid, 0, blid, &ahang, &bhang, ovl, trace, &otype, Local_Overlap_AS_forCNS, DONT_SHOW_OLAP, 0, AS_CONSENSUS, AS_CNS_ERROR_RATE)) { Bead *afirst = GetBead(beadStore, afrag->firstbead.get() + ahang); Column *col = GetColumn(columnStore, afirst->column_index); MANode *manode = GetMANode(manodeStore, col->ma_id); RefreshMANode(manode->lid, 0, opp, NULL, NULL, 0, 0); // BPW not sure why we need this fprintf(stderr, "Could (really) not find overlap between %d (%c) and %d (%c) estimated ahang: %d (ejecting frag %d from contig)\n", afrag->iid, afrag->type, belem->idx.fragment.frgIdent, belem->idx.fragment.frgType, ahang, belem->idx.fragment.frgIdent); GetFragment(fragmentStore,blid)->deleted = 1; } else { ApplyAlignment(afrag->lid, 0, NULL, blid, ahang, Getint32(trace,0)); } } // over all fragments Delete_VA(trace); }
int updateFragmentWithParent(IntUnitigMesg *iunitig, int thisFrag, OverlapStore *ovs) { uint32 ovlMax = 0; uint32 ovlLen = 0; OVSoverlap *ovl = NULL; int testFrag = thisFrag - 1; int testOvl = 0; int oldParent = iunitig->f_list[thisFrag].parent; int oldAHang = iunitig->f_list[thisFrag].ahang; int oldBHang = iunitig->f_list[thisFrag].bhang; uint32 consensusCutoff = AS_OVS_encodeQuality(AS_CNS_ERROR_RATE); int contained = 0; int fragment = -1; int overlap = -1; int overlapIdentity = consensusCutoff; int overlapBHang = AS_READ_MAX_NORMAL_LEN; HashTable_AS *ovlBefore = CreateScalarHashTable_AS(); HashTable_AS *ovlAfter = CreateScalarHashTable_AS(); HashTable_AS *iidIndex = CreateScalarHashTable_AS(); int hangSlop = 0; int failed = -1; fprintf(stderr, "\n"); fprintf(stderr, "WORKING on fragment %d == %d\n", thisFrag, iunitig->f_list[thisFrag].ident); // Save in the hash table the fragments before/after this one. // for (testFrag=0; testFrag<iunitig->num_frags; testFrag++) { InsertInHashTable_AS(iidIndex, (uint64)iunitig->f_list[testFrag].ident, sizeof(uint64), (uint64)testFrag, 0); if (testFrag < thisFrag) InsertInHashTable_AS(ovlBefore, (uint64)iunitig->f_list[testFrag].ident, sizeof(uint64), ~(uint64)0, 0); if (testFrag > thisFrag) InsertInHashTable_AS(ovlAfter, (uint64)iunitig->f_list[testFrag].ident, sizeof(uint64), ~(uint64)0, 0); } // Get the overlaps for this fragment. // AS_OVS_setRangeOverlapStore(ovs, iunitig->f_list[thisFrag].ident, iunitig->f_list[thisFrag].ident); if (ovlMax < AS_OVS_numOverlapsInRange(ovs)) { ovlMax = AS_OVS_numOverlapsInRange(ovs) * 2; ovl = (OVSoverlap *)safe_realloc(ovl, sizeof(OVSoverlap) * ovlMax); } ovlLen = 0; while (AS_OVS_readOverlapFromStore(ovs, ovl+ovlLen, AS_OVS_TYPE_OVL)) { int aid=0, bid=0; int afwd=0, bfwd=0; int correct=0; // Reorient the overlap so the b_iid is thisFrag. // { AS_IID x = ovl[ovlLen].a_iid; ovl[ovlLen].a_iid = ovl[ovlLen].b_iid; ovl[ovlLen].b_iid = x; if (ovl[ovlLen].dat.ovl.flipped) { int x = ovl[ovlLen].dat.ovl.a_hang; ovl[ovlLen].dat.ovl.a_hang = ovl[ovlLen].dat.ovl.b_hang; ovl[ovlLen].dat.ovl.b_hang = x; } else { ovl[ovlLen].dat.ovl.a_hang = -ovl[ovlLen].dat.ovl.a_hang; ovl[ovlLen].dat.ovl.b_hang = -ovl[ovlLen].dat.ovl.b_hang; } } // Make sure we get the correct overlap. We seem to be allowed // to have both an I and an N overlap for a given pair of // fragments. At least, I hope that's all we're allowed. // aid = LookupValueInHashTable_AS(iidIndex, (uint64)ovl[ovlLen].a_iid, sizeof(uint64)); bid = LookupValueInHashTable_AS(iidIndex, (uint64)ovl[ovlLen].b_iid, sizeof(uint64)); afwd = (iunitig->f_list[aid].position.bgn < iunitig->f_list[aid].position.end); bfwd = (iunitig->f_list[bid].position.bgn < iunitig->f_list[bid].position.end); if ((afwd == bfwd) && (ovl[ovlLen].dat.ovl.flipped == 0)) correct = 1; if ((afwd != bfwd) && (ovl[ovlLen].dat.ovl.flipped == 1)) correct = 1; if (ExistsInHashTable_AS(ovlBefore, (uint64)ovl[ovlLen].a_iid, sizeof(uint64))) { if (correct) ReplaceInHashTable_AS(ovlBefore, (uint64)ovl[ovlLen].a_iid, sizeof(uint64), (uint64)ovlLen, 0); fprintf(stderr, "%s before overlap for %d (%c) to %d (%c) ("F_S64","F_S64",%c) at ovl position %d\n", correct ? "save" : "skip", ovl[ovlLen].a_iid, afwd ? 'F' : 'R', ovl[ovlLen].b_iid, bfwd ? 'F' : 'R', ovl[ovlLen].dat.ovl.a_hang, ovl[ovlLen].dat.ovl.b_hang, ovl[ovlLen].dat.ovl.flipped ? 'I' : 'N', ovlLen); } if (ExistsInHashTable_AS(ovlAfter, (uint64)ovl[ovlLen].a_iid, sizeof(uint64))) { if (correct) ReplaceInHashTable_AS(ovlAfter, (uint64)ovl[ovlLen].a_iid, sizeof(uint64), (uint64)ovlLen, 0); fprintf(stderr, "%s after overlap for %d (%c) to %d (%c) ("F_S64","F_S64",%c) at ovl position %d\n", correct ? "save" : "skip", ovl[ovlLen].a_iid, afwd ? 'F' : 'R', ovl[ovlLen].b_iid, bfwd ? 'F' : 'R', ovl[ovlLen].dat.ovl.a_hang, ovl[ovlLen].dat.ovl.b_hang, ovl[ovlLen].dat.ovl.flipped ? 'I' : 'N', ovlLen); } ovlLen++; } tryAgain: // See if we're contained in any of these overlaps. if (overlap == -1) { for (testFrag=thisFrag-1; testFrag>=0; testFrag--) { if (ExistsInHashTable_AS(ovlBefore, (uint64)iunitig->f_list[testFrag].ident, sizeof(uint64))) { testOvl = LookupValueInHashTable_AS(ovlBefore, (uint64)iunitig->f_list[testFrag].ident, sizeof(uint64)); // Fragment has no overlap if (testOvl == -1) continue; fprintf(stderr, "found testFrag = %d testOvl = %d erates "F_U64" %u hang "F_S64" "F_S64" (CONTAIN) slop=%d\n", testFrag, testOvl, ovl[testOvl].dat.ovl.orig_erate, consensusCutoff, ovl[testOvl].dat.ovl.a_hang, ovl[testOvl].dat.ovl.b_hang, hangSlop); // Three if's for documentation: // 1) If we're an overlap we care about // 2) If we're a contained overlap // 3) If we're better than what we've seen so far // Then save the overlap // if (ovl[testOvl].dat.ovl.orig_erate < consensusCutoff) { if ((ovl[testOvl].dat.ovl.a_hang >= -hangSlop) && (ovl[testOvl].dat.ovl.b_hang <= hangSlop)) { if (ovl[testOvl].dat.ovl.orig_erate < overlapIdentity) { contained = 1; fragment = testFrag; overlap = testOvl; overlapBHang = 0; overlapIdentity = ovl[testOvl].dat.ovl.orig_erate; } } } } } } // If not contained, scan the overlaps again, looking for the // thickest/bestest. This will be the overlap with the smallest a // or b hang -- depending on the orientation of the parent // fragment. // // Instead of working through overlaps, we work through fragments. // if (overlap == -1) { for (testFrag=thisFrag-1; testFrag>=0; testFrag--) { if (ExistsInHashTable_AS(ovlBefore, (uint64)iunitig->f_list[testFrag].ident, sizeof(uint64))) { int ahang = 0; int bhang = 0; testOvl = LookupValueInHashTable_AS(ovlBefore, (uint64)iunitig->f_list[testFrag].ident, sizeof(uint64)); // Fragment has no overlap if (testOvl == -1) continue; // Overlap is too noisy if (ovl[testOvl].dat.ovl.orig_erate >= consensusCutoff) continue; if (iunitig->f_list[testFrag].position.bgn < iunitig->f_list[testFrag].position.end) { ahang = ovl[testOvl].dat.ovl.a_hang; bhang = ovl[testOvl].dat.ovl.b_hang; } else { ahang = -ovl[testOvl].dat.ovl.b_hang; bhang = -ovl[testOvl].dat.ovl.a_hang; } // Overlap isn't dovetail -- negative ahang if (ahang < 0) continue; // Overlap isn't dovetail -- containment if (bhang < 0) continue; fprintf(stderr, "found testFrag = %d testOvl = %d erates "F_U64" %u hang "F_S64" "F_S64" (DOVETAIL) slop=%d\n", testFrag, testOvl, ovl[testOvl].dat.ovl.orig_erate, consensusCutoff, ovl[testOvl].dat.ovl.a_hang, ovl[testOvl].dat.ovl.b_hang, hangSlop); if (bhang < overlapBHang) { contained = 0; fragment = testFrag; overlap = testOvl; overlapIdentity = ovl[testOvl].dat.ovl.orig_erate; overlapBHang = bhang; } } } } // Now, if we have found the parent fragment, update. // if (overlap >= 0) { testOvl = overlap; testFrag = fragment; iunitig->f_list[thisFrag].parent = ovl[testOvl].a_iid; if (contained) iunitig->f_list[thisFrag].contained = iunitig->f_list[thisFrag].parent; else iunitig->f_list[thisFrag].contained = 0; // Reorient again based on the orientation of the testFrag. // if (iunitig->f_list[testFrag].position.bgn < iunitig->f_list[testFrag].position.end) { // testFrag is forward iunitig->f_list[thisFrag].ahang = ovl[testOvl].dat.ovl.a_hang; iunitig->f_list[thisFrag].bhang = ovl[testOvl].dat.ovl.b_hang; } else { // testFrag is reverse iunitig->f_list[thisFrag].ahang = -ovl[testOvl].dat.ovl.b_hang; iunitig->f_list[thisFrag].bhang = -ovl[testOvl].dat.ovl.a_hang; } // Report we did something. // fprintf(stderr, "Updated fragment "F_IID" from "F_IID",%d,%d to "F_IID",%d,%d\n", iunitig->f_list[thisFrag].ident, oldParent, oldAHang, oldBHang, iunitig->f_list[thisFrag].parent, iunitig->f_list[thisFrag].ahang, iunitig->f_list[thisFrag].bhang); goto successfullyUpdated; } // Otherwise, try to find an overlap again, this time allowing a // bit of slop in the hangs. // if (hangSlop == 0) { hangSlop = 10; goto tryAgain; } // Now, we're convinced there is no decent overlap between this // fragment and any fragment before it. // // Scan forward for the first thing we overlap. for (testFrag=thisFrag+1; testFrag < iunitig->num_frags; testFrag++) { int ahang = 0; int bhang = 0; testOvl = LookupValueInHashTable_AS(ovlAfter, (uint64)iunitig->f_list[testFrag].ident, sizeof(uint64)); // Fragment has no overlap if (testOvl == -1) continue; // Overlap is too noisy if (ovl[testOvl].dat.ovl.orig_erate >= consensusCutoff) continue; if (iunitig->f_list[testFrag].position.bgn < iunitig->f_list[testFrag].position.end) { ahang = ovl[testOvl].dat.ovl.a_hang; bhang = ovl[testOvl].dat.ovl.b_hang; } else { ahang = -ovl[testOvl].dat.ovl.b_hang; bhang = -ovl[testOvl].dat.ovl.a_hang; } // Don't allow negative ahangs. At all. This catches the case // where the parent might be contained in us, and generally makes // consensus happier. // // Don't allow empty hangs - this can lead to infinite loops // where we keep swapping the same two fragments. OK, not // infinite, since we eventually run out of stack space and // crash. // if (ahang <= 0) continue; fprintf(stderr, "shifttest ovl=%d testFrag="F_IID" pos %d-%d thisFrag="F_IID" pos %d-%d hangs %d,%d\n", testOvl, iunitig->f_list[testFrag].ident, iunitig->f_list[testFrag].position.bgn, iunitig->f_list[testFrag].position.end, iunitig->f_list[thisFrag].ident, iunitig->f_list[thisFrag].position.bgn, iunitig->f_list[thisFrag].position.end, ahang, bhang); IntMultiPos fragCopy = iunitig->f_list[thisFrag]; memmove(iunitig->f_list + thisFrag, iunitig->f_list + thisFrag + 1, sizeof(IntMultiPos) * (testFrag - thisFrag)); iunitig->f_list[testFrag] = fragCopy; fprintf(stderr, "Shifted fragment "F_IID" from position %d to position %d\n", iunitig->f_list[testFrag].ident, thisFrag, testFrag); // Since we moved things around, we must process the new fragment // at 'thisFrag's location. // failed = updateFragmentWithParent(iunitig, thisFrag, ovs); if (failed == -1) goto successfullyUpdated; break; } // And we failed. Good luck with this one. // fprintf(stderr, "Failed to update fragment "F_IID" from "F_IID",%d,%d.\n", iunitig->f_list[thisFrag].ident, oldParent, oldAHang, oldBHang); failed = thisFrag; successfullyUpdated: DeleteHashTable_AS(ovlBefore); DeleteHashTable_AS(ovlAfter); safe_free(ovl); return(failed); }
static int abAbacus::SetUngappedFragmentPositions(FragType type,int32 n_frags, MultiAlignT *uma) { int32 num_frags = GetNumIntMultiPoss(uma->f_list); int32 num_unitigs = GetNumIntUnitigPoss(uma->u_list); HashTable_AS *unitigFrags = CreateScalarHashTable_AS(); int32 num_columns = GetMultiAlignLength(uma); int32 ungapped_pos = 0; int32 *gapped_positions = new int32 [num_columns + 1]; char *consensus = Getchar(uma->consensus,0); for (int32 i=0; i<num_columns+1; i++) { gapped_positions[i] = ungapped_pos; if (consensus[i] != '-') ungapped_pos++; } // Remember the first fragment we add. int32 first_frag = GetNumCNS_AlignedContigElements(fragment_positions); for (int32 ifrag=0; ifrag<num_frags; ifrag++) { CNS_AlignedContigElement epos; IntMultiPos *frag = GetIntMultiPos(uma->f_list, ifrag); if (ExistsInHashTable_AS(unitigFrags, frag->ident, 0)) { fprintf(stderr,"SetUngappedFragmentPositions()-- ident %d already in hashtable\n", frag->ident); assert(0); } if (HASH_SUCCESS != InsertInHashTable_AS(unitigFrags, frag->ident, 0, 1, 0)) { fprintf(stderr,"SetUngappedFragmentPositions()-- Failure to insert ident %d in hashtable\n", frag->ident); assert(0); } assert(frag->position.bgn >= 0); assert(frag->position.bgn < num_columns + 1); assert(frag->position.end >= 0); assert(frag->position.end < num_columns + 1); epos.frg_or_utg = CNS_ELEMENT_IS_FRAGMENT; epos.idx.fragment.frgIdent = frag->ident; epos.idx.fragment.frgType = frag->type; epos.idx.fragment.frgContained = frag->contained; epos.idx.fragment.frgInUnitig = (type == AS_CONTIG) ? -1 : uma->maID; epos.position.bgn = gapped_positions[frag->position.bgn]; epos.position.end = gapped_positions[frag->position.end]; //fprintf(stderr, "SetUngappedFragmentPositions()-- FRG id=%d type=%c pos=%d,%d (orig pos=%d,%d)\n", // frag->ident, frag->type, epos.position.bgn, epos.position.end, frag->position.bgn, frag->position.end); // Adjust the ungapped position if we fall within a gap // if (epos.position.bgn == epos.position.end) { fprintf(stderr,"SetUngappedFragmentPositions()-- Encountered bgn==end=="F_S32" in ungapped coords within SetUngappedFragmentPositions for "F_CID "(gapped coords "F_S32","F_S32")\n", epos.position.bgn,frag->ident,frag->position.bgn,frag->position.end); assert(frag->position.bgn != frag->position.end); if (frag->position.bgn < frag->position.end) { if (epos.position.bgn > 0) epos.position.bgn--; else epos.position.end++; } else { if (epos.position.end > 0) epos.position.end--; else epos.position.bgn++; } fprintf(stderr,"SetUngappedFragmentPositions()-- Reset to "F_S32","F_S32"\n", epos.position.bgn, epos.position.end); } AppendVA_CNS_AlignedContigElement(fragment_positions, &epos); } for (int32 ifrag=0; ifrag < num_unitigs; ifrag++){ CNS_AlignedContigElement epos; IntUnitigPos *unitig = GetIntUnitigPos(uma->u_list, ifrag); epos.frg_or_utg = CNS_ELEMENT_IS_UNITIG; epos.idx.unitig.utgIdent = unitig->ident; epos.idx.unitig.utgType = unitig->type; epos.position.bgn = gapped_positions[unitig->position.bgn]; epos.position.end = gapped_positions[unitig->position.end]; //fprintf(stderr, "SetUngappedFragmentPositions()-- UTG id=%d type=%c pos=%d,%d (orig pos=%d,%d)\n", // unitig->ident, unitig->type, epos.position.bgn, epos.position.end, unitig->position.bgn, unitig->position.end); AppendVA_CNS_AlignedContigElement(fragment_positions,&epos); } // This is used only by ReplaceEndUnitigInContig(). Mark fragments in the "anchoring" contig // that belong to this unitig. // if (type != AS_CONTIG) { Fragment *anchor = GetFragment(fragmentStore,0); if ((anchor != NULL) && (anchor->type == AS_CONTIG)) { CNS_AlignedContigElement *af = GetCNS_AlignedContigElement(fragment_positions, anchor->components); for (int32 ifrag=0; ifrag < anchor->n_components; ifrag++, af++) { if ((af->frg_or_utg == CNS_ELEMENT_IS_FRAGMENT) && (ExistsInHashTable_AS(unitigFrags, af->idx.fragment.frgIdent, 0))) af->idx.fragment.frgInUnitig = uma->maID; } } } DeleteHashTable_AS(unitigFrags); delete [] gapped_positions; return first_frag; }
int main (int argc, char *argv[]) { char *asmFileName = NULL; char *tigStoreName = NULL; uint32 tigStoreVers = 2; int minLength = DEFAULT_UNITIG_LENGTH; int numInstances = DEFAULT_NUM_INSTANCES; int distanceToEnds = DEFAULT_DISTANCE_TO_ENDS; uint32 numToggled = 0; argc = AS_configure(argc, argv); int arg=1; int err=0; while (arg < argc) { if (strcmp(argv[arg], "-a") == 0) { asmFileName = argv[++arg]; } else if (strcmp(argv[arg], "-t") == 0) { tigStoreName = argv[++arg]; tigStoreVers = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-l") == 0) { minLength = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-n") == 0) { numInstances = atoi(argv[++arg]); } else if (strcmp(argv[arg], "-d") == 0) { distanceToEnds = atoi(argv[++arg]); } else { fprintf(stderr, "%s: unknown option '%s'\n", argv[0], argv[arg]); err++; } arg++; } if (minLength <= 0) err++; if (numInstances < 0) err++; if (distanceToEnds <= 0) err++; if ((asmFileName == NULL) || (tigStoreName == NULL) || (err > 0)) { fprintf(stderr, "usage: %s -a asmFile -t tigStore version [-l minLength] [-n numInstances] [-d distanceToEnd]\n", argv[0]); fprintf(stderr, "\n"); fprintf(stderr, " -a asmFile path to the assembly .asm file\n"); fprintf(stderr, " -t tigStore version path to the tigStore and version to modify\n"); fprintf(stderr, " -l minLength minimum size of a unitig to be toggled, default=%d)\n", DEFAULT_UNITIG_LENGTH); fprintf(stderr, " -n numInstances number of instances of a surrogate that is toggled, default = %d\n", DEFAULT_NUM_INSTANCES); fprintf(stderr, " -d distanceToEnd max number of bases the surrogate can be from the end of a scaffold for toggling, default = %d\n", DEFAULT_DISTANCE_TO_ENDS); fprintf(stderr, "\n"); fprintf(stderr, " Labels surrogate unitigs as non-repeat if they match any of the following conditions:\n"); fprintf(stderr, " 1. the unitig meets all the -l, -n and -d conditions\n"); fprintf(stderr, " 2. When -n = 0, all surrogate unitigs with more than one read\n"); fprintf(stderr, " 3. the unitig appears exactly twice, within '-d' bases from the end of a scaffold\n"); exit(1); } HashTable_AS *UIDtoIID = CreateScalarHashTable_AS(); HashTable_AS *CTGtoFirstUTG = CreateScalarHashTable_AS(); HashTable_AS *CTGtoLastUTG = CreateScalarHashTable_AS(); VA_TYPE(int32) *unitigLength = CreateVA_int32(8192); VA_TYPE(uint32) *surrogateCount = CreateVA_uint32(8192); VA_TYPE(uint32) *surrogateAtScaffoldEnds = CreateVA_uint32(8192); GenericMesg *pmesg; FILE *infp = fopen(asmFileName, "r"); while ((EOF != ReadProtoMesg_AS(infp, &pmesg))) { SnapUnitigMesg *utg = NULL; SnapConConMesg *ctg = NULL; SnapScaffoldMesg *scf = NULL; uint32 count = 0; uint32 forward = TRUE; uint32 lastCtg = 0; switch(pmesg->t) { case MESG_UTG: utg = (SnapUnitigMesg*)(pmesg->m); Setint32(unitigLength, utg->iaccession, &utg->length); if (utg->length >= minLength && (utg->status == AS_NOTREZ || utg->status == AS_SEP)) { // store the mapping for this unitig's UID to IID and initialize it's instance counter at 0 count = 0; InsertInHashTable_AS(UIDtoIID, AS_UID_toInteger(utg->eaccession), 0, (uint64)utg->iaccession, 0); Setuint32(surrogateCount, utg->iaccession, &count); } break; case MESG_CCO: ctg = (SnapConConMesg *)(pmesg->m); for (int32 i = 0; i < ctg->num_unitigs; i++) { // increment the surrogate unitigs instance counter if (ExistsInHashTable_AS(UIDtoIID, AS_UID_toInteger(ctg->unitigs[i].eident), 0)) { uint32 *ret = Getuint32(surrogateCount, (uint32) LookupValueInHashTable_AS(UIDtoIID, AS_UID_toInteger(ctg->unitigs[i].eident), 0)); assert(ret != NULL); (*ret)++; // store first surrogate in a contig if (!ExistsInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(ctg->eaccession), 0) && MIN(ctg->unitigs[i].position.bgn, ctg->unitigs[i].position.end) < distanceToEnds) { InsertInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(ctg->eaccession), 0, LookupValueInHashTable_AS(UIDtoIID, AS_UID_toInteger(ctg->unitigs[i].eident), 0), 0); } // also store the last if ((ctg->length - MAX(ctg->unitigs[i].position.bgn, ctg->unitigs[i].position.end)) < distanceToEnds) { ReplaceInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(ctg->eaccession), 0, LookupValueInHashTable_AS(UIDtoIID, AS_UID_toInteger(ctg->unitigs[i].eident), 0), 0); } } } break; case MESG_SCF: scf = (SnapScaffoldMesg *)(pmesg->m); count = scf->iaccession; if (scf->contig_pairs[0].orient.isAnti() || scf->contig_pairs[0].orient.isOuttie()) { forward = FALSE; } lastCtg = MAX(scf->num_contig_pairs - 1, 0); // All four cases below follow the same pattern // The first time a surrogate is found at the end of a scaffold, we record the scaffold ID // When the surrogate is seen at the end of a second scaffold, we record that it has been found at the ends of two scaffolds (UINT32_MAX) // If the surrogate is seen more than once in a single scaffold, it is eliminated (it can't connect two scaffolds) // If the surrogate is only seen once at the end of a scaffold (and again in the middle), it is eliminated // 1. Contig is first in scaffold and is forward, take the surrogate from the beginning of contig, if it exists if (ExistsInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(scf->contig_pairs[0].econtig1), 0) && forward) { uint32 *myval = Getuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(scf->contig_pairs[0].econtig1), 0)); if (myval != NULL && (*myval) == scf->iaccession) { count = 0; } else if (myval != NULL && (*myval) != 0 && (*myval) != scf->iaccession) { count = UINT32_MAX; } Setuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(scf->contig_pairs[0].econtig1), 0), &count); count = scf->iaccession; } // 2. Contig is last in scaffold and is reversed, take the surrogate from the beginning of the contig, if it exists if (ExistsInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(scf->contig_pairs[lastCtg].econtig2), 0) && !forward) { uint32 *myval = Getuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(scf->contig_pairs[lastCtg].econtig2), 0)); if (myval != NULL && (*myval) == scf->iaccession) { count = 0; } else if (myval != NULL && (*myval) != 0 && (*myval) != scf->iaccession) { count = UINT32_MAX; } Setuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(scf->contig_pairs[lastCtg].econtig2), 0), &count); count = scf->iaccession; } // 3. Contig is first in scaffold and is reversed, take the surrogate from the end of the contig, if it exists if (ExistsInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(scf->contig_pairs[0].econtig1), 0) && !forward) { uint32 *myval = Getuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(scf->contig_pairs[0].econtig1), 0)); if (myval != NULL && (*myval) == scf->iaccession) { count = 0; } else if (myval != NULL && (*myval) != 0 && (*myval) != scf->iaccession) { count = UINT32_MAX; } Setuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(scf->contig_pairs[0].econtig1), 0), &count); count = scf->iaccession; } // 4. Contig is last in scaffold and is forward, take the surrogate from the end of the contig, if it exists if (ExistsInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(scf->contig_pairs[lastCtg].econtig2), 0) && forward) { uint32 *myval = Getuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(scf->contig_pairs[lastCtg].econtig2), 0)); if (myval != NULL && (*myval) == scf->iaccession) { count = 0; } else if (myval != NULL && (*myval) != 0 && (*myval) != scf->iaccession) { count = UINT32_MAX; } Setuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(scf->contig_pairs[lastCtg].econtig2), 0), &count); count = scf->iaccession; } break; default: break; } } fclose(infp); uint32 *ret = NULL; uint32 *atScfEnd = NULL; // open the tig store for in-place writing (we don't increment the version since CGW always reads a fixed version initially) // this also removes any partitioning MultiAlignStore *tigStore = new MultiAlignStore(tigStoreName, tigStoreVers, 0, 0, TRUE, TRUE); for (uint32 i = 0; i < tigStore->numUnitigs(); i++) { uint32 *ret = Getuint32(surrogateCount, i); uint32 *atScfEnd = Getuint32(surrogateAtScaffoldEnds, i); uint32 *length = Getuint32(unitigLength, i); bool toggled = false; if (ret != NULL && (*ret) == (uint32)numInstances && numInstances != 0) { toggled = TRUE; } // if we find a surrogate that has two instances and it is at scaffold ends mark toggle it as well else if (ret != NULL && (*ret) == NUM_INSTANCES_AT_SCAFFOLD_ENDS && atScfEnd != NULL && (*atScfEnd) == UINT32_MAX) { toggled = TRUE; } // special case, mark non-singleton unitigs as unique if we are given no instances else if (numInstances == 0 && (length != NULL && (*length) >= minLength) && tigStore->getNumFrags(i, TRUE) > 1) { toggled = TRUE; } if (toggled) { tigStore->setUnitigFUR(i, AS_FORCED_UNIQUE); numToggled++; } } DeleteHashTable_AS(UIDtoIID); DeleteHashTable_AS(CTGtoFirstUTG); DeleteHashTable_AS(CTGtoLastUTG); delete tigStore; fprintf(stderr, "Toggled %d\n", numToggled); return 0; }