示例#1
0
/*
 * Build the list of bubbles found among the first num_valid entries of top[].
 *
 * Pass 1: every vertex top[i] that _is_initiation_node() accepts and whose
 *         fwd and rvs vertex sets are both non-empty is entered into a hash
 *         table.  The key is a BVSPair pointing at that vertex's fwd/rvs
 *         sets, the value is the address of top[i].
 * Pass 2: every vertex that _is_termination_node() accepts (again with both
 *         sets non-empty) is looked up by its own fwd/rvs pair.  On a hit a
 *         new list element {start = stored vertex, end = this vertex} is
 *         appended to the result.
 *
 * Returns the head of the list (NULL when nothing matched).  List elements
 * are allocated with safe_malloc() and are handed to the caller; the hash
 * table and key array are released before returning.
 */
AS_CGB_Bubble_List_t
_collect_bubbles(BubGraph_t bg, BubVertexSet *fwd, BubVertexSet *rvs,
		 IntFragment_ID *top, int num_valid)
{
  AS_CGB_Bubble_List    head;                      /* dummy head; only .next is used */
  AS_CGB_Bubble_List_t *tail_link   = &(head.next); /* where the next element is linked in */
  HashTable_AS         *start_table = NULL;
  BVSPair              *start_keys  = NULL;
  BVSPair               probe;
  IntFragment_ID       *start_vid   = NULL;
  IntFragment_ID        idx;

  memset(&head,0,sizeof(AS_CGB_Bubble_List));
  head.next = NULL;

  start_table = CreateGenericHashTable_AS(_hash_vset_hash, _hash_vset_cmp);
  /* One key slot per candidate; only slots of inserted vertices get filled. */
  start_keys  = (BVSPair *)safe_malloc(sizeof(BVSPair) * num_valid );

  /* Pass 1: register initiation vertices. */
  for (idx = 0; idx < num_valid; ++idx) {
    if (!(_is_initiation_node(BG_inDegree(bg, top[idx], AS_CGB_BUBBLE_E_VALID),
			      BG_outDegree(bg, top[idx], AS_CGB_BUBBLE_E_VALID))))
      continue;
    if (BVS_empty(&(fwd[top[idx]])))
      continue;
    if (BVS_empty(&(rvs[top[idx]])))
      continue;

#if AS_CGB_BUBBLE_VERY_VERBOSE
    fprintf(stderr, "Inserting "F_IID " ("F_IID ") into the table.\n", top[idx],
	    get_iid_fragment(BG_vertices(bg), top[idx]));
#endif
    start_keys[idx].f = &(fwd[top[idx]]);
    start_keys[idx].r = &(rvs[top[idx]]);
    InsertInHashTable_AS(start_table, (uint64)(INTPTR)&start_keys[idx], sizeof(BVSPair), (uint64)(INTPTR)&top[idx], 0);
  }

  /* Pass 2: match termination vertices against the registered starts. */
  for (idx = 0; idx < num_valid; ++idx) {
    AS_CGB_Bubble_List_t node = NULL;

    if (!(_is_termination_node(BG_inDegree(bg, top[idx], AS_CGB_BUBBLE_E_VALID),
			       BG_outDegree(bg, top[idx], AS_CGB_BUBBLE_E_VALID))))
      continue;
    if (BVS_empty(&(fwd[top[idx]])))
      continue;
    if (BVS_empty(&(rvs[top[idx]])))
      continue;

#if AS_CGB_BUBBLE_VERY_VERBOSE
    fprintf(stderr, "Looking for matches for "F_IID " ("F_IID ") in the table.  ",
	    top[idx], get_iid_fragment(BG_vertices(bg), top[idx]));
#endif
    probe.f = &(fwd[top[idx]]);
    probe.r = &(rvs[top[idx]]);
    start_vid = (IntFragment_ID *)(INTPTR)LookupValueInHashTable_AS(start_table, (uint64)(INTPTR)&probe, sizeof(BVSPair));
#if AS_CGB_BUBBLE_VERY_VERBOSE
    if (!start_vid)
      fprintf(stderr, "None found.\n");
    else
      fprintf(stderr, "Found init node = "F_IID " ("F_IID ").\n", *start_vid,
	      get_iid_fragment(BG_vertices(bg), *start_vid));
#endif

    if (!start_vid)
      continue;

    /* Append a new bubble at the tail and advance the link pointer. */
    node = (AS_CGB_Bubble_List *)safe_malloc(sizeof(AS_CGB_Bubble_List));
    node->start = *start_vid;
    node->end   = top[idx];
    node->next  = NULL;

    *tail_link = node;
    tail_link  = &(node->next);
  }

  DeleteHashTable_AS(start_table);
  safe_free(start_keys);

  return head.next;
}
示例#2
0
//  Build the multialignment for contig 'ma' from its unitigs.
//
//  The unitigs in ma->u_list are loaded into the local fragment store, the first one seeds
//  the multialignment, and every following unitig is aligned to an earlier one (first trying
//  non-contained unitigs, then contained ones).  After all unitigs are placed the alignment
//  is refined and the consensus, quality and positions are written back into 'ma'.
//
//  Parameters:
//    ma      - contig to process; its consensus/quality/positions are updated on success.
//    UNUSED  - not referenced by this function.
//    opp     - consensus options, passed through to the helpers.
//
//  Returns true on success.  Returns false if a fragment id occurs twice in the contig's
//  f_list, or if a unitig could not be overlapped and FORCE_UNITIG_ABUT is 0.
//
//  fragmentMap and fragmentToIMP are declared outside this function; they are created here
//  and are deleted and reset to NULL on every return path.
//
bool
MultiAlignContig(MultiAlignT  *ma,
                 gkStore      *UNUSED,
                 CNS_Options  *opp) {
  int32        num_bases     = 0;
  int32        num_unitigs   = GetNumIntUnitigPoss(ma->u_list);
  int32        num_frags     = GetNumIntMultiPoss(ma->f_list);
  int32        num_columns   = 0;

  IntMultiPos  *flist    = GetIntMultiPos(ma->f_list, 0);
  IntUnitigPos *ulist    = GetIntUnitigPos(ma->u_list, 0);
  IntMultiVar  *vlist    = GetIntMultiVar(ma->v_list, 0);

  SeqInterval  *offsets       = (SeqInterval *) safe_calloc(num_unitigs,sizeof(SeqInterval));

  //  Estimate the number of bases and columns needed.  The length of each element is
  //  |end - bgn|, padded by twice the allowed error rate.
  //
  for (int32 i=0;i<num_unitigs;i++) {
    int32 flen   = (ulist[i].position.bgn < ulist[i].position.end) ? (ulist[i].position.end - ulist[i].position.bgn) : (ulist[i].position.bgn - ulist[i].position.end);
    num_bases   += flen + 2 * AS_CNS_ERROR_RATE * flen;

    num_columns = (ulist[i].position.bgn > num_columns) ? ulist[i].position.bgn : num_columns;
    num_columns = (ulist[i].position.end > num_columns) ? ulist[i].position.end : num_columns;

    //fprintf(stderr, "CTG %d UTG %d %d-%d\n",
    //        ma->maID, ulist[i].ident, ulist[i].position.bgn, ulist[i].position.end);
  }

  for (int32 i=0;i<num_frags;i++) {
    int32 flen   = (flist[i].position.bgn < flist[i].position.end) ? (flist[i].position.end - flist[i].position.bgn) : (flist[i].position.bgn - flist[i].position.end);
    num_bases   += flen + 2 * AS_CNS_ERROR_RATE * flen;
  }

  ResetStores(num_bases, num_unitigs, num_columns);

  fragmentMap   = CreateScalarHashTable_AS();
  fragmentToIMP = CreateScalarHashTable_AS();

  for (int32 i=0; i<num_frags; i++) {

    //  Add all fragments in the contigs f_list to the fragmentMap.  This tells us if a fragment is
    //  not placed in a surrogate (because they aren't in the contigs f_list, but will appear in a
    //  surrogate unitigs f_list).
    //
    if (HASH_SUCCESS != InsertInHashTable_AS(fragmentMap, flist[i].ident, 0, 1, 0)) {
      fprintf(stderr, "MultiAlignContig()-- Contig %d FAILED.  Fragment %d is a duplicate.\n",
              ma->maID, flist[i].ident);

      //  'trace' and 'manode' are not created yet, so the returnFailure label cannot be used
      //  here.  Release what has been allocated so far.
      safe_free(offsets);
      DeleteHashTable_AS(fragmentMap);
      fragmentMap = NULL;
      DeleteHashTable_AS(fragmentToIMP);
      fragmentToIMP = NULL;
      return(false);
    }

    // SK store IID to IMP message mapping
    InsertInHashTable_AS(fragmentToIMP, flist[i].ident, 0, (uint64)&flist[i], 0);
  }

  //  Load every unitig into the local store and remember its (orientation-normalized)
  //  interval, indexed by the id the store returned.
  //
  for (int32 i=0;i<num_unitigs;i++) {
    uint32 complement = (ulist[i].position.bgn<ulist[i].position.end)?0:1;
    uint32 fid = AppendFragToLocalStore(AS_UNITIG,
                                 ulist[i].ident,
                                 complement,
                                 0,
                                 ulist[i].type);
    offsets[fid].bgn = complement?ulist[i].position.end:ulist[i].position.bgn;
    offsets[fid].end = complement?ulist[i].position.bgn:ulist[i].position.end;
  }

  MANode *manode = CreateMANode(ma->maID);

  // Seed multiAlignment with 1st fragment of 1st unitig

  SeedMAWithFragment(manode->lid, GetFragment(fragmentStore,0)->lid, opp);
  PlaceFragments(GetFragment(fragmentStore,0)->lid, ulist + GetFragment(fragmentStore,0)->lid, opp);

  // Now, loop on remaining fragments, aligning to:
  //    a)  containing frag (if contained)
  // or b)  previously aligned frag

  VA_TYPE(int32) *trace = CreateVA_int32(AS_READ_MAX_NORMAL_LEN+1);

  for (int32 i=1;i<num_unitigs;i++) {
    Fragment *afrag = NULL;
    Fragment *bfrag = GetFragment(fragmentStore,i);

    int32    ahang  = 0;
    int32    bhang  = 0;
    int32    ovl    = 0;
    int32    alid   = 0;
    int32    blid   = bfrag->lid;

    OverlapType otype;

    int32 olap_success  = 0;
    int32 try_contained = 0;
    int32 align_to      = i - 1;

    //  First candidate examined; used as the fallback when an abut is forced.
    Fragment *afrag_first = NULL;
    int32       ahang_first = 0;
    int32       bhang_first = 0;

    while (!olap_success) {
    nextFrag:

      if (try_contained == 0)
        //  Skip contained stuff.
        while ((align_to > 0) &&
               ((GetFragment(fragmentStore, align_to)->is_contained) ||
                (GetFragment(fragmentStore, align_to)->container_iid > 0)))
          align_to--;

      if (align_to < 0) {
        if (VERBOSE_MULTIALIGN_OUTPUT)
          fprintf(stderr, "MultiAlignContig: hit the beginning of unitig list: no unitig upstream overlaps with current unitig %d\n", bfrag->iid);

        if (try_contained == 0) {
          if (VERBOSE_MULTIALIGN_OUTPUT)
            fprintf(stderr, "MultiAlignContig: trying contained afrags for bfrag %d\n", bfrag->iid);
          try_contained = 1;
          align_to      = i-1;
          goto nextFrag;
        }

        break;
      }

      afrag = GetFragment(fragmentStore, align_to);
      alid  = afrag->lid;

      ahang = offsets[blid].bgn - offsets[alid].bgn;
      bhang = offsets[blid].end - offsets[alid].end;

      if (afrag_first == NULL) {
        afrag_first = afrag;
        ahang_first = ahang;
        bhang_first = bhang;
      }

      //  This code copied from MultiAlignUnitig.

      if (offsets[afrag->lid].bgn < offsets[bfrag->lid].bgn)
        if (offsets[afrag->lid].end < offsets[bfrag->lid].end)
          ovl = offsets[afrag->lid].end - offsets[bfrag->lid].bgn;
        else
          //ovl = offsets[bfrag->lid].end - offsets[bfrag->lid].bgn;
          ovl = bfrag->length;
      else
        if (offsets[afrag->lid].end < offsets[bfrag->lid].end)
          //ovl = offsets[afrag->lid].end - offsets[afrag->lid].bgn;
          ovl = afrag->length;
        else
          ovl = offsets[bfrag->lid].end - offsets[afrag->lid].bgn;

      //  End of copy

      if (ovl <= 0) {
        if (VERBOSE_MULTIALIGN_OUTPUT)
          fprintf(stderr, "MultiAlignContig: positions of afrag %d and bfrag %d do not overlap.  Proceed to the next upstream afrag\n", afrag->iid, bfrag->iid);
        align_to--;
        goto nextFrag;
      }

      olap_success = GetAlignmentTraceDriver(afrag, NULL,
                                             bfrag,
                                             &ahang, &bhang, ovl,
                                             trace,
                                             &otype,
                                             GETALIGNTRACE_CONTIGU,
                                             (blid + 1 < num_unitigs) ? (offsets[blid + 1].bgn - offsets[blid].bgn) : 800);

      //  Nope, fail.
      if (!olap_success) {
        if (VERBOSE_MULTIALIGN_OUTPUT)
          fprintf(stderr, "MultiAlignContig: Positions of afrag %d (%c) and bfrag %d (%c) overlap, but GetAlignmentTrace returns no overlap success.\n",
                  afrag->iid, afrag->type, bfrag->iid, bfrag->type);

        align_to--;

        if ((align_to < 0) && (!try_contained)) {
          if (VERBOSE_MULTIALIGN_OUTPUT)
            fprintf(stderr, "MultiAlignContig: Try contained afrags for bfrag %d\n", bfrag->iid);
          try_contained = 1;
          align_to = i-1;
        }
      }
    }  //  while !olap_success


    if ((!olap_success) && (FORCE_UNITIG_ABUT == 0)) {
      fprintf(stderr,"MultiAlignContig: Could (really) not find overlap between %d (%c) and %d (%c), estimated ahang %d\n",
              afrag->iid,afrag->type,bfrag->iid,bfrag->type, ahang);
      fprintf(stderr,"MultiAlignContig: You can (possibly) force these to abut with '-D forceunitigabut', but that code is buggy at best.\n");
      goto returnFailure;
    }

#if 1
    if ((!olap_success) && (FORCE_UNITIG_ABUT == 1)) {
      if (afrag_first) {
        afrag = afrag_first;
        ahang = ahang_first;
        bhang = bhang_first;
      } else {
        //  Dang, we're really screwed.  Nobody overlapped with us.
        //  Cross our fingers and find the closest end point.
        //
        //  NOTE(review): the comparison below reads offsets[alid] with the alid left over
        //  from the search loop above, not the lid of the candidate at align_to -- confirm
        //  whether that is intended.
        //
        int32   maxOvl = -offsets[blid].bgn;

        //if (VERBOSE_MULTIALIGN_OUTPUT)
        //  fprintf(stderr, "MultiAlignContig:  YIKES!  Your unitig doesn't overlap with anything!  Picking the closest thing!\n");

        align_to = i-1;

        while (align_to >= 0) {
          if ((try_contained == 0) &&
              ((GetFragment(fragmentStore, align_to)->is_contained) ||
               (GetFragment(fragmentStore, align_to)->container_iid > 0))) {
            //  NOP!  Found a contained frag, and we want to skip it.
          } else if (maxOvl < offsets[alid].end - offsets[blid].bgn) {
            afrag  = GetFragment(fragmentStore, align_to);
            alid   = afrag->lid;
            ahang  = offsets[blid].bgn - offsets[alid].bgn;
            maxOvl = offsets[alid].end - offsets[blid].bgn;

            //fprintf(stderr, "MultiAlignContig:  RESET align_to=%d alid=%d maxOvl=%d ahang=%d\n", align_to, alid, maxOvl, ahang);
          }

          align_to--;
        }  //  while align_to >= 0
      }

      fprintf(stderr, "MultiAlignContig:  Forcing abut between afrag %d (%c) and bfrag %d (%c) in contig %d.\n",
              afrag->iid, afrag->type, bfrag->iid, bfrag->type, ma->maID);

      //  Force a 1bp overlap.  We'd like to strictly abut, but ApplyAlignment() requires that there
      //  be an overlap, and removing checks for that seem like a bad idea.
      //
      ahang = afrag->length - 1;

      otype = AS_DOVETAIL;

      int32 zero = 0;

      ResetVA_int32(trace);
      AppendVA_int32(trace, &zero);

      assert(*Getint32(trace,0) == 0);
      assert(GetNumint32s(trace) == 1);
    }
#endif

    //  Unitig is placed, or we just forced it to be placed.

    if (otype == AS_CONTAINMENT) {
      bfrag->is_contained = 1;
      if (bfrag->container_iid == 0)
        bfrag->container_iid = 1;  //  Not sure why 1 and not afrag->iid
    }

    ApplyAlignment(afrag->lid, 0, NULL, bfrag->lid, ahang, Getint32(trace,0));
    PlaceFragments(bfrag->lid, ulist + bfrag->lid, opp);
  }  //  over all unitigs

  // Now, must find fragments in regions of overlapping unitigs, and adjust
  // their alignments as needed
  RefreshMANode(manode->lid, 0, opp, NULL, NULL, 0, 0);

  //fprintf(stderr,"MultiAlignContig: Initial pairwise induced alignment\n");
  //PrintAlignment(stderr,manode->lid,0,-1);

  AbacusRefine(manode,0,-1,CNS_SMOOTH, opp);
  MergeRefine(manode->lid, NULL, 0, opp, 1);
  AbacusRefine(manode,0,-1,CNS_POLYX, opp);

  //fprintf(stderr,"MultiAlignContig: POLYX refined alignment\n");
  //PrintAlignment(stderr,manode->lid,0,-1);

  {
    IntMultiVar  *vl = NULL;
    int32         nv = 0;

    RefreshMANode(manode->lid, 0, opp, &nv, &vl, 0, 0);
    AbacusRefine(manode,0,-1,CNS_INDEL, opp);
    MergeRefine(manode->lid, ma->v_list, 0, opp, 2);
  }

  //fprintf(stderr,"MultiAlignContig: Final refined alignment\n");
  //PrintAlignment(stderr,manode->lid,0,-1);

  //if (num_frags == 0)
  //  PrintAlignment(stderr,manode->lid,0,-1);

  GetMANodeConsensus(manode->lid, ma->consensus, ma->quality);
  GetMANodePositions(manode->lid, ma);

  DeleteMANode(manode->lid);

  safe_free(offsets);
  Delete_VA(trace);
  DeleteHashTable_AS(fragmentMap);  
  fragmentMap = NULL;
  DeleteHashTable_AS(fragmentToIMP);
  fragmentToIMP = NULL;
  return(true);

 returnFailure:
  safe_free(offsets);
  Delete_VA(trace);
  DeleteHashTable_AS(fragmentMap);
  fragmentMap = NULL;
  DeleteHashTable_AS(fragmentToIMP);
  fragmentToIMP = NULL;
  return(false);
}
示例#3
0
//  Re-derive the parent, hangs and containment of fragment 'thisFrag' in 'iunitig' from the
//  overlaps stored in 'ovs'.
//
//  The function looks, in order, for
//    1) a containment overlap to an earlier fragment (lowest error rate wins),
//    2) a dovetail overlap to an earlier fragment (smallest b-hang wins),
//  first with zero tolerance on the hangs and then once more with a slop of 10.  If neither
//  is found it scans the later fragments, moves 'thisFrag' behind the first one it overlaps
//  with a positive a-hang, and recurses on the fragment that now occupies slot 'thisFrag'.
//
//  Parameters:
//    iunitig  - unitig whose f_list is examined and possibly modified/reordered.
//    thisFrag - index into iunitig->f_list of the fragment to update.
//    ovs      - overlap store to read the overlaps of that fragment from.
//
//  Returns -1 when a fragment was updated, otherwise 'thisFrag'.
//
//  Progress is logged to stderr.
//
int
updateFragmentWithParent(IntUnitigMesg *iunitig, int thisFrag, OverlapStore *ovs) {
    uint32         ovlMax = 0;
    uint32         ovlLen = 0;
    OVSoverlap    *ovl    = NULL;

    int     testFrag = thisFrag - 1;
    int     testOvl  = 0;

    int     oldParent = iunitig->f_list[thisFrag].parent;
    int     oldAHang  = iunitig->f_list[thisFrag].ahang;
    int     oldBHang  = iunitig->f_list[thisFrag].bhang;

    uint32  consensusCutoff = AS_OVS_encodeQuality(AS_CNS_ERROR_RATE);

    int     contained       = 0;
    int     fragment        = -1;
    int     overlap         = -1;
    int     overlapIdentity = consensusCutoff;
    int     overlapBHang    = AS_READ_MAX_NORMAL_LEN;

    //  ovlBefore/ovlAfter map the ident of each fragment before/after thisFrag to the index
    //  of its overlap in ovl[] (all bits set when there is none).  iidIndex maps a fragment
    //  ident to its index in f_list.
    HashTable_AS  *ovlBefore = CreateScalarHashTable_AS();
    HashTable_AS  *ovlAfter  = CreateScalarHashTable_AS();
    HashTable_AS  *iidIndex  = CreateScalarHashTable_AS();

    int     hangSlop = 0;

    int     failed   = -1;

    fprintf(stderr, "\n");
    fprintf(stderr, "WORKING on fragment %d == %d\n", thisFrag, iunitig->f_list[thisFrag].ident);

    //  Save in the hash table the fragments before/after this one.
    //
    for (testFrag=0; testFrag<iunitig->num_frags; testFrag++) {
        InsertInHashTable_AS(iidIndex,
                             (uint64)iunitig->f_list[testFrag].ident, sizeof(uint64),
                             (uint64)testFrag, 0);

        if (testFrag < thisFrag)
            InsertInHashTable_AS(ovlBefore,
                                 (uint64)iunitig->f_list[testFrag].ident, sizeof(uint64),
                                 ~(uint64)0, 0);
        if (testFrag > thisFrag)
            InsertInHashTable_AS(ovlAfter,
                                 (uint64)iunitig->f_list[testFrag].ident, sizeof(uint64),
                                 ~(uint64)0, 0);
    }

    //  Get the overlaps for this fragment.
    //
    AS_OVS_setRangeOverlapStore(ovs, iunitig->f_list[thisFrag].ident, iunitig->f_list[thisFrag].ident);

    if (ovlMax < AS_OVS_numOverlapsInRange(ovs)) {
        ovlMax = AS_OVS_numOverlapsInRange(ovs) * 2;
        ovl    = (OVSoverlap *)safe_realloc(ovl, sizeof(OVSoverlap) * ovlMax);
    }
    ovlLen = 0;
    while (AS_OVS_readOverlapFromStore(ovs, ovl+ovlLen, AS_OVS_TYPE_OVL)) {
        int  aid=0,  bid=0;
        int  afwd=0, bfwd=0;
        int  correct=0;

        //  Reorient the overlap so the b_iid is thisFrag.
        //
        {
            AS_IID x = ovl[ovlLen].a_iid;
            ovl[ovlLen].a_iid = ovl[ovlLen].b_iid;
            ovl[ovlLen].b_iid = x;

            if (ovl[ovlLen].dat.ovl.flipped) {
                int x = ovl[ovlLen].dat.ovl.a_hang;
                ovl[ovlLen].dat.ovl.a_hang = ovl[ovlLen].dat.ovl.b_hang;
                ovl[ovlLen].dat.ovl.b_hang = x;
            } else {
                ovl[ovlLen].dat.ovl.a_hang = -ovl[ovlLen].dat.ovl.a_hang;
                ovl[ovlLen].dat.ovl.b_hang = -ovl[ovlLen].dat.ovl.b_hang;
            }
        }

        //  Make sure we get the correct overlap.  We seem to be allowed
        //  to have both an I and an N overlap for a given pair of
        //  fragments.  At least, I hope that's all we're allowed.
        //
        aid = LookupValueInHashTable_AS(iidIndex, (uint64)ovl[ovlLen].a_iid, sizeof(uint64));
        bid = LookupValueInHashTable_AS(iidIndex, (uint64)ovl[ovlLen].b_iid, sizeof(uint64));

        afwd = (iunitig->f_list[aid].position.bgn < iunitig->f_list[aid].position.end);
        bfwd = (iunitig->f_list[bid].position.bgn < iunitig->f_list[bid].position.end);

        //  An overlap is usable when its flipped flag agrees with the relative
        //  orientation of the two fragments in the unitig.
        if ((afwd == bfwd) && (ovl[ovlLen].dat.ovl.flipped == 0))
            correct = 1;
        if ((afwd != bfwd) && (ovl[ovlLen].dat.ovl.flipped == 1))
            correct = 1;


        if (ExistsInHashTable_AS(ovlBefore, (uint64)ovl[ovlLen].a_iid, sizeof(uint64))) {
            if (correct)
                ReplaceInHashTable_AS(ovlBefore,
                                      (uint64)ovl[ovlLen].a_iid, sizeof(uint64),
                                      (uint64)ovlLen, 0);
            fprintf(stderr, "%s before overlap for %d (%c) to %d (%c) ("F_S64","F_S64",%c) at ovl position %d\n",
                    correct ? "save" : "skip",
                    ovl[ovlLen].a_iid, afwd ? 'F' : 'R',
                    ovl[ovlLen].b_iid, bfwd ? 'F' : 'R',
                    ovl[ovlLen].dat.ovl.a_hang,
                    ovl[ovlLen].dat.ovl.b_hang,
                    ovl[ovlLen].dat.ovl.flipped ? 'I' : 'N',
                    ovlLen);
        }

        if (ExistsInHashTable_AS(ovlAfter, (uint64)ovl[ovlLen].a_iid, sizeof(uint64))) {
            if (correct)
                ReplaceInHashTable_AS(ovlAfter,
                                      (uint64)ovl[ovlLen].a_iid, sizeof(uint64),
                                      (uint64)ovlLen, 0);
            fprintf(stderr, "%s after  overlap for %d (%c) to %d (%c) ("F_S64","F_S64",%c) at ovl position %d\n",
                    correct ? "save" : "skip",
                    ovl[ovlLen].a_iid, afwd ? 'F' : 'R',
                    ovl[ovlLen].b_iid, bfwd ? 'F' : 'R',
                    ovl[ovlLen].dat.ovl.a_hang,
                    ovl[ovlLen].dat.ovl.b_hang,
                    ovl[ovlLen].dat.ovl.flipped ? 'I' : 'N',
                    ovlLen);
        }

        ovlLen++;
    }

tryAgain:

    //  See if we're contained in any of these overlaps.
    if (overlap == -1) {
        for (testFrag=thisFrag-1; testFrag>=0; testFrag--) {
            if (ExistsInHashTable_AS(ovlBefore, (uint64)iunitig->f_list[testFrag].ident, sizeof(uint64))) {
                testOvl = LookupValueInHashTable_AS(ovlBefore, (uint64)iunitig->f_list[testFrag].ident, sizeof(uint64));

                //  Fragment has no overlap
                if (testOvl == -1)
                    continue;

                fprintf(stderr, "found testFrag = %d testOvl = %d erates "F_U64" %u hang "F_S64" "F_S64" (CONTAIN) slop=%d\n",
                        testFrag, testOvl,
                        ovl[testOvl].dat.ovl.orig_erate, consensusCutoff,
                        ovl[testOvl].dat.ovl.a_hang,
                        ovl[testOvl].dat.ovl.b_hang,
                        hangSlop);

                //  Three if's for documentation:
                //  1)  If we're an overlap we care about
                //  2)  If we're a contained overlap
                //  3)  If we're better than what we've seen so far
                //  Then save the overlap
                //
                if (ovl[testOvl].dat.ovl.orig_erate < consensusCutoff) {
                    if ((ovl[testOvl].dat.ovl.a_hang >= -hangSlop) &&
                            (ovl[testOvl].dat.ovl.b_hang <= hangSlop)) {
                        if (ovl[testOvl].dat.ovl.orig_erate < overlapIdentity) {
                            contained       = 1;
                            fragment        = testFrag;
                            overlap         = testOvl;
                            overlapBHang    = 0;
                            overlapIdentity = ovl[testOvl].dat.ovl.orig_erate;
                        }
                    }
                }
            }
        }
    }


    //  If not contained, scan the overlaps again, looking for the
    //  thickest/bestest.  This will be the overlap with the smallest a
    //  or b hang -- depending on the orientation of the parent
    //  fragment.
    //
    //  Instead of working through overlaps, we work through fragments.
    //
    if (overlap == -1) {
        for (testFrag=thisFrag-1; testFrag>=0; testFrag--) {
            if (ExistsInHashTable_AS(ovlBefore, (uint64)iunitig->f_list[testFrag].ident, sizeof(uint64))) {
                int ahang = 0;
                int bhang = 0;

                testOvl = LookupValueInHashTable_AS(ovlBefore, (uint64)iunitig->f_list[testFrag].ident, sizeof(uint64));

                //  Fragment has no overlap
                if (testOvl == -1)
                    continue;

                //  Overlap is too noisy
                if (ovl[testOvl].dat.ovl.orig_erate >= consensusCutoff)
                    continue;

                if (iunitig->f_list[testFrag].position.bgn < iunitig->f_list[testFrag].position.end) {
                    ahang = ovl[testOvl].dat.ovl.a_hang;
                    bhang = ovl[testOvl].dat.ovl.b_hang;
                } else {
                    ahang = -ovl[testOvl].dat.ovl.b_hang;
                    bhang = -ovl[testOvl].dat.ovl.a_hang;
                }

                //  Overlap isn't dovetail -- negative ahang
                if (ahang < 0)
                    continue;

                //  Overlap isn't dovetail -- containment
                if (bhang < 0)
                    continue;

                fprintf(stderr, "found testFrag = %d testOvl = %d erates "F_U64" %u hang "F_S64" "F_S64" (DOVETAIL) slop=%d\n",
                        testFrag, testOvl,
                        ovl[testOvl].dat.ovl.orig_erate, consensusCutoff,
                        ovl[testOvl].dat.ovl.a_hang,
                        ovl[testOvl].dat.ovl.b_hang,
                        hangSlop);

                if (bhang < overlapBHang) {
                    contained       = 0;
                    fragment        = testFrag;
                    overlap         = testOvl;
                    overlapIdentity = ovl[testOvl].dat.ovl.orig_erate;
                    overlapBHang    = bhang;
                }
            }
        }
    }


    //  Now, if we have found the parent fragment, update.
    //
    if (overlap >= 0) {
        testOvl  = overlap;
        testFrag = fragment;

        iunitig->f_list[thisFrag].parent = ovl[testOvl].a_iid;

        if (contained)
            iunitig->f_list[thisFrag].contained = iunitig->f_list[thisFrag].parent;
        else
            iunitig->f_list[thisFrag].contained = 0;

        //  Reorient again based on the orientation of the testFrag.
        //
        if (iunitig->f_list[testFrag].position.bgn < iunitig->f_list[testFrag].position.end) {
            //  testFrag is forward
            iunitig->f_list[thisFrag].ahang  = ovl[testOvl].dat.ovl.a_hang;
            iunitig->f_list[thisFrag].bhang  = ovl[testOvl].dat.ovl.b_hang;
        } else {
            //  testFrag is reverse
            iunitig->f_list[thisFrag].ahang  = -ovl[testOvl].dat.ovl.b_hang;
            iunitig->f_list[thisFrag].bhang  = -ovl[testOvl].dat.ovl.a_hang;
        }

        //  Report we did something.
        //
        fprintf(stderr, "Updated fragment "F_IID" from "F_IID",%d,%d to "F_IID",%d,%d\n",
                iunitig->f_list[thisFrag].ident,
                oldParent,
                oldAHang,
                oldBHang,
                iunitig->f_list[thisFrag].parent,
                iunitig->f_list[thisFrag].ahang,
                iunitig->f_list[thisFrag].bhang);

        goto successfullyUpdated;
    }


    //  Otherwise, try to find an overlap again, this time allowing a
    //  bit of slop in the hangs.
    //
    if (hangSlop == 0) {
        hangSlop = 10;
        goto tryAgain;
    }


    //  Now, we're convinced there is no decent overlap between this
    //  fragment and any fragment before it.
    //
    //  Scan forward for the first thing we overlap.

    for (testFrag=thisFrag+1; testFrag < iunitig->num_frags; testFrag++) {
        int ahang = 0;
        int bhang = 0;

        testOvl = LookupValueInHashTable_AS(ovlAfter, (uint64)iunitig->f_list[testFrag].ident, sizeof(uint64));

        //  Fragment has no overlap
        if (testOvl == -1)
            continue;

        //  Overlap is too noisy
        if (ovl[testOvl].dat.ovl.orig_erate >= consensusCutoff)
            continue;

        if (iunitig->f_list[testFrag].position.bgn < iunitig->f_list[testFrag].position.end) {
            ahang = ovl[testOvl].dat.ovl.a_hang;
            bhang = ovl[testOvl].dat.ovl.b_hang;
        } else {
            ahang = -ovl[testOvl].dat.ovl.b_hang;
            bhang = -ovl[testOvl].dat.ovl.a_hang;
        }

        //  Don't allow negative ahangs.  At all.  This catches the case
        //  where the parent might be contained in us, and generally makes
        //  consensus happier.
        //
        //  Don't allow empty hangs - this can lead to infinite loops
        //  where we keep swapping the same two fragments.  OK, not
        //  infinite, since we eventually run out of stack space and
        //  crash.
        //
        if (ahang <= 0)
            continue;

        fprintf(stderr, "shifttest ovl=%d testFrag="F_IID" pos %d-%d  thisFrag="F_IID" pos %d-%d  hangs %d,%d\n",
                testOvl,
                iunitig->f_list[testFrag].ident,
                iunitig->f_list[testFrag].position.bgn,
                iunitig->f_list[testFrag].position.end,
                iunitig->f_list[thisFrag].ident,
                iunitig->f_list[thisFrag].position.bgn,
                iunitig->f_list[thisFrag].position.end,
                ahang, bhang);

        //  Slide entries thisFrag+1..testFrag down by one slot and put the
        //  saved copy of thisFrag into the slot freed at testFrag.
        IntMultiPos  fragCopy = iunitig->f_list[thisFrag];

        memmove(iunitig->f_list + thisFrag,
                iunitig->f_list + thisFrag + 1,
                sizeof(IntMultiPos) * (testFrag - thisFrag));

        iunitig->f_list[testFrag] = fragCopy;

        fprintf(stderr, "Shifted fragment "F_IID" from position %d to position %d\n",
                iunitig->f_list[testFrag].ident,
                thisFrag, testFrag);

        //  Since we moved things around, we must process the new fragment
        //  at 'thisFrag's location.
        //
        failed = updateFragmentWithParent(iunitig, thisFrag, ovs);

        if (failed == -1)
            goto successfullyUpdated;

        break;
    }


    //  And we failed.  Good luck with this one.
    //
    fprintf(stderr, "Failed to update fragment "F_IID" from "F_IID",%d,%d.\n",
            iunitig->f_list[thisFrag].ident,
            oldParent,
            oldAHang,
            oldBHang);

    failed = thisFrag;

    //  Common exit for both outcomes: release everything allocated by this call.
    //  iidIndex must be released too; this function recurses, so skipping it
    //  would leak one table per call.
successfullyUpdated:
    DeleteHashTable_AS(ovlBefore);
    DeleteHashTable_AS(ovlAfter);
    DeleteHashTable_AS(iidIndex);
    safe_free(ovl);

    return(failed);
}
示例#4
0
//  Append the fragments and unitigs of 'uma' to fragment_positions, with their positions
//  translated from gapped consensus coordinates to ungapped coordinates.
//
//  A lookup table from gapped column to ungapped position is built by walking the consensus
//  and counting every column that is not '-'.  Fragments are appended first, then unitigs.
//  A fragment whose two ends map to the same ungapped position is widened by one base (and
//  reported on stderr).
//
//  When 'type' is not AS_CONTIG and the first entry of fragmentStore is a contig, the
//  fragment elements of that contig whose ident also occurs in uma's fragment list are
//  marked as belonging to unitig uma->maID.
//
//  Parameters:
//    type    - kind of multialign being added; AS_CONTIG stores -1 as the owning unitig.
//    n_frags - not referenced by this function.
//    uma     - multialign supplying f_list, u_list, consensus and maID.
//
//  Returns the number of elements fragment_positions held on entry, i.e. the index of the
//  first element appended here.
//
static
int
abAbacus::SetUngappedFragmentPositions(FragType type,int32 n_frags, MultiAlignT *uma) {

  int32 fragCount   = GetNumIntMultiPoss(uma->f_list);
  int32 unitigCount = GetNumIntUnitigPoss(uma->u_list);

  //  Idents of every fragment in uma; used for duplicate detection and the anchor marking.
  HashTable_AS *seenIdents = CreateScalarHashTable_AS();

  int32 gappedLen = GetMultiAlignLength(uma);

  int32 *toUngapped = new int32 [gappedLen + 1];
  char  *cns        = Getchar(uma->consensus,0);

  //  toUngapped[c] = number of non-gap consensus columns strictly before column c.
  {
    int32 running = 0;

    for (int32 col=0; col<=gappedLen; col++) {
      toUngapped[col] = running;
      running += (cns[col] != '-') ? 1 : 0;
    }
  }

  //  Remember the first fragment we add.
  int32 firstAdded = GetNumCNS_AlignedContigElements(fragment_positions);

  for (int32 fi=0; fi<fragCount; fi++) {
    CNS_AlignedContigElement elem;
    IntMultiPos *imp = GetIntMultiPos(uma->f_list, fi);

    if (ExistsInHashTable_AS(seenIdents, imp->ident, 0)) {
      fprintf(stderr,"SetUngappedFragmentPositions()-- ident %d already in hashtable\n", imp->ident);
      assert(0);
    }
    if (HASH_SUCCESS != InsertInHashTable_AS(seenIdents, imp->ident, 0, 1, 0)) {
      fprintf(stderr,"SetUngappedFragmentPositions()-- Failure to insert ident %d in hashtable\n", imp->ident);
      assert(0);
    }

    //  Both ends must index into toUngapped[].
    assert(imp->position.bgn >= 0);
    assert(imp->position.bgn < gappedLen + 1);
    assert(imp->position.end >= 0);
    assert(imp->position.end < gappedLen + 1);

    elem.frg_or_utg                  = CNS_ELEMENT_IS_FRAGMENT;
    elem.idx.fragment.frgIdent       = imp->ident;
    elem.idx.fragment.frgType        = imp->type;
    elem.idx.fragment.frgContained   = imp->contained;
    elem.idx.fragment.frgInUnitig    = (type == AS_CONTIG) ? -1 : uma->maID;
    elem.position.bgn                = toUngapped[imp->position.bgn];
    elem.position.end                = toUngapped[imp->position.end];

    //fprintf(stderr, "SetUngappedFragmentPositions()-- FRG id=%d type=%c pos=%d,%d (orig pos=%d,%d)\n",
    //        imp->ident, imp->type, elem.position.bgn, elem.position.end, imp->position.bgn, imp->position.end);

    //  Adjust the ungapped position if we fall within a gap
    //
    if (elem.position.bgn == elem.position.end) {
      fprintf(stderr,"SetUngappedFragmentPositions()-- Encountered bgn==end=="F_S32" in ungapped coords within SetUngappedFragmentPositions for "F_CID "(gapped coords "F_S32","F_S32")\n",
              elem.position.bgn,imp->ident,imp->position.bgn,imp->position.end);
      assert(imp->position.bgn != imp->position.end);

      //  Move the lower end down by one if there is room, otherwise move the upper end up.
      //  Which end is "lower" follows the fragment's gapped orientation.
      bool isForward = (imp->position.bgn < imp->position.end);

      if (isForward && (elem.position.bgn > 0))
        elem.position.bgn--;
      else if (isForward)
        elem.position.end++;
      else if (elem.position.end > 0)
        elem.position.end--;
      else
        elem.position.bgn++;

      fprintf(stderr,"SetUngappedFragmentPositions()--   Reset to "F_S32","F_S32"\n",
              elem.position.bgn,
              elem.position.end);
    }

    AppendVA_CNS_AlignedContigElement(fragment_positions, &elem);
  }


  for (int32 ui=0; ui<unitigCount; ui++) {
    CNS_AlignedContigElement elem;
    IntUnitigPos *iup = GetIntUnitigPos(uma->u_list, ui);

    elem.frg_or_utg           = CNS_ELEMENT_IS_UNITIG;
    elem.idx.unitig.utgIdent  = iup->ident;
    elem.idx.unitig.utgType   = iup->type;
    elem.position.bgn         = toUngapped[iup->position.bgn];
    elem.position.end         = toUngapped[iup->position.end];

    //fprintf(stderr, "SetUngappedFragmentPositions()-- UTG id=%d type=%c pos=%d,%d (orig pos=%d,%d)\n",
    //        iup->ident, iup->type, elem.position.bgn, elem.position.end, iup->position.bgn, iup->position.end);

    AppendVA_CNS_AlignedContigElement(fragment_positions,&elem);
  }

  //  This is used only by ReplaceEndUnitigInContig().  Mark fragments in the "anchoring" contig
  //  that belong to this unitig.
  //
  if (type != AS_CONTIG) {
    Fragment *anchor = GetFragment(fragmentStore,0);

    if ((anchor != NULL) &&
        (anchor->type == AS_CONTIG)) {
      CNS_AlignedContigElement *members = GetCNS_AlignedContigElement(fragment_positions, anchor->components);

      for (int32 k=0; k < anchor->n_components; k++) {
        if ((members[k].frg_or_utg == CNS_ELEMENT_IS_FRAGMENT) &&
            (ExistsInHashTable_AS(seenIdents, members[k].idx.fragment.frgIdent, 0)))
          members[k].idx.fragment.frgInUnitig = uma->maID;
      }
    }
  }

  DeleteHashTable_AS(seenIdents);
  delete [] toUngapped;

  return firstAdded;
}
示例#5
0
int main (int argc, char *argv[]) {
   char    *asmFileName    = NULL;
   char    *tigStoreName   = NULL;
   uint32   tigStoreVers   = 2;

   int      minLength      = DEFAULT_UNITIG_LENGTH;
   int      numInstances   = DEFAULT_NUM_INSTANCES;
   int      distanceToEnds = DEFAULT_DISTANCE_TO_ENDS;

   uint32   numToggled     = 0;

   argc = AS_configure(argc, argv);
  
   int arg=1;
   int err=0;
   while (arg < argc) {
      if        (strcmp(argv[arg], "-a") == 0) {
         asmFileName = argv[++arg];

      } else if (strcmp(argv[arg], "-t") == 0) {
        tigStoreName = argv[++arg];
        tigStoreVers = atoi(argv[++arg]);

      } else if (strcmp(argv[arg], "-l") == 0) {
         minLength = atoi(argv[++arg]);

      } else if (strcmp(argv[arg], "-n") == 0) {
         numInstances = atoi(argv[++arg]);

      } else if (strcmp(argv[arg], "-d") == 0) {
         distanceToEnds = atoi(argv[++arg]);

      } else {
         fprintf(stderr, "%s: unknown option '%s'\n", argv[0], argv[arg]);
         err++;
      }

      arg++;
   }

   if (minLength <= 0) err++;
   if (numInstances < 0) err++;
   if (distanceToEnds <= 0) err++;

   if ((asmFileName == NULL) || (tigStoreName == NULL) || (err > 0)) {
      fprintf(stderr, "usage: %s -a asmFile -t tigStore version [-l minLength] [-n numInstances] [-d distanceToEnd]\n", argv[0]);
      fprintf(stderr, "\n");
      fprintf(stderr, "  -a asmFile            path to the assembly .asm file\n");
      fprintf(stderr, "  -t tigStore version   path to the tigStore and version to modify\n");

      fprintf(stderr, "  -l minLength          minimum size of a unitig to be toggled, default=%d)\n", DEFAULT_UNITIG_LENGTH);
      fprintf(stderr, "  -n numInstances       number of instances of a surrogate that is toggled, default = %d\n", DEFAULT_NUM_INSTANCES);
      fprintf(stderr, "  -d distanceToEnd      max number of bases the surrogate can be from the end of a scaffold for toggling, default = %d\n", DEFAULT_DISTANCE_TO_ENDS);
      fprintf(stderr, "\n");
      fprintf(stderr, "  Labels surrogate unitigs as non-repeat if they match any of the following conditions:\n");
      fprintf(stderr, "    1. the unitig meets all the -l, -n and -d conditions\n");
      fprintf(stderr, "    2. When -n = 0, all surrogate unitigs with more than one read\n");
      fprintf(stderr, "    3. the unitig appears exactly twice, within '-d' bases from the end of a scaffold\n");
      exit(1);
   }
  
   HashTable_AS      *UIDtoIID         = CreateScalarHashTable_AS();
   HashTable_AS      *CTGtoFirstUTG    = CreateScalarHashTable_AS();
   HashTable_AS      *CTGtoLastUTG     = CreateScalarHashTable_AS();
   VA_TYPE(int32)    *unitigLength	   = CreateVA_int32(8192);
   VA_TYPE(uint32)   *surrogateCount   = CreateVA_uint32(8192);
   VA_TYPE(uint32)   *surrogateAtScaffoldEnds   = CreateVA_uint32(8192);
   
   GenericMesg    *pmesg;
   FILE           *infp = fopen(asmFileName, "r");   

   while ((EOF != ReadProtoMesg_AS(infp, &pmesg))) {
      SnapUnitigMesg    *utg     = NULL;
      SnapConConMesg    *ctg     = NULL;
      SnapScaffoldMesg  *scf     = NULL;
      uint32             count   = 0;
      uint32             forward = TRUE;
      uint32             lastCtg = 0;

      switch(pmesg->t) {
         case MESG_UTG:
            utg = (SnapUnitigMesg*)(pmesg->m);
            Setint32(unitigLength, utg->iaccession, &utg->length);

            if (utg->length >= minLength && (utg->status == AS_NOTREZ || utg->status == AS_SEP)) {               
               // store the mapping for this unitig's UID to IID and initialize it's instance counter at 0
               count = 0;
               InsertInHashTable_AS(UIDtoIID, AS_UID_toInteger(utg->eaccession), 0, (uint64)utg->iaccession, 0);               
               Setuint32(surrogateCount, utg->iaccession, &count);
            }
            break;    

         case MESG_CCO:
            ctg = (SnapConConMesg *)(pmesg->m);
            
            for (int32 i = 0; i < ctg->num_unitigs; i++) {
               // increment the surrogate unitigs instance counter
               if (ExistsInHashTable_AS(UIDtoIID, AS_UID_toInteger(ctg->unitigs[i].eident), 0)) {
                  uint32 *ret = Getuint32(surrogateCount, (uint32) LookupValueInHashTable_AS(UIDtoIID, AS_UID_toInteger(ctg->unitigs[i].eident), 0));
                  assert(ret != NULL);
                  (*ret)++;

                  // store first surrogate in a contig
                  if (!ExistsInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(ctg->eaccession), 0) && 
                        MIN(ctg->unitigs[i].position.bgn, ctg->unitigs[i].position.end) < distanceToEnds) {
                     InsertInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(ctg->eaccession), 0, LookupValueInHashTable_AS(UIDtoIID, AS_UID_toInteger(ctg->unitigs[i].eident), 0), 0); 
                  }

                  // also store the last
                  if ((ctg->length - MAX(ctg->unitigs[i].position.bgn, ctg->unitigs[i].position.end)) < distanceToEnds) {
                     ReplaceInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(ctg->eaccession), 0, LookupValueInHashTable_AS(UIDtoIID, AS_UID_toInteger(ctg->unitigs[i].eident), 0), 0);
                  }
               }
            }
            break;

         case MESG_SCF:
            scf = (SnapScaffoldMesg *)(pmesg->m);
            
            count = scf->iaccession;
            if (scf->contig_pairs[0].orient.isAnti() || scf->contig_pairs[0].orient.isOuttie()) {
               forward = FALSE;
            }
            lastCtg = MAX(scf->num_contig_pairs - 1, 0);
            
            // All four cases below follow the same pattern
            // The first time a surrogate is found at the end of a scaffold, we record the scaffold ID
            // When the surrogate is seen at the end of a second scaffold, we record that it has been found at the ends of two scaffolds (UINT32_MAX)
            // If the surrogate is seen more than once in a single scaffold, it is eliminated (it can't connect two scaffolds)
            // If the surrogate is only seen once at the end of a scaffold (and again in the middle), it is eliminated
            // 1. Contig is first in scaffold and is forward, take the surrogate from the beginning of contig, if it exists                        
            if (ExistsInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(scf->contig_pairs[0].econtig1), 0) && forward) {
               uint32 *myval = Getuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(scf->contig_pairs[0].econtig1), 0));
               if (myval != NULL && (*myval) == scf->iaccession) {
                  count = 0;
               } else if (myval != NULL && (*myval) != 0 && (*myval) != scf->iaccession) {
                  count = UINT32_MAX;
               }
               Setuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(scf->contig_pairs[0].econtig1), 0), &count);
               count = scf->iaccession;
            }
            // 2. Contig is last in scaffold and is reversed, take the surrogate from the beginning of the contig, if it exists
            if (ExistsInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(scf->contig_pairs[lastCtg].econtig2), 0) && !forward) {
               uint32 *myval = Getuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(scf->contig_pairs[lastCtg].econtig2), 0));
               if (myval != NULL && (*myval) == scf->iaccession) {
                  count = 0;
               } else if (myval != NULL && (*myval) != 0 && (*myval) != scf->iaccession) {
                  count = UINT32_MAX;
               }
               Setuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoFirstUTG, AS_UID_toInteger(scf->contig_pairs[lastCtg].econtig2), 0), &count);
               count = scf->iaccession;
            }
            // 3. Contig is first in scaffold and is reversed, take the surrogate from the end of the contig, if it exists            
            if (ExistsInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(scf->contig_pairs[0].econtig1), 0) && !forward) {
               uint32 *myval = Getuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(scf->contig_pairs[0].econtig1), 0));
               if (myval != NULL && (*myval) == scf->iaccession) {
                  count = 0;
               } else if (myval != NULL && (*myval) != 0 && (*myval) != scf->iaccession) {
                  count = UINT32_MAX;
               }
               Setuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(scf->contig_pairs[0].econtig1), 0), &count);
               count = scf->iaccession;
            }
            // 4. Contig is last in scaffold and is forward, take the surrogate from the end of the contig, if it exists
            if (ExistsInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(scf->contig_pairs[lastCtg].econtig2), 0) && forward) {
               uint32 *myval = Getuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(scf->contig_pairs[lastCtg].econtig2), 0));
               if (myval != NULL && (*myval) == scf->iaccession) {
                  count = 0;
               } else if (myval != NULL && (*myval) != 0 && (*myval) != scf->iaccession) {
                  count = UINT32_MAX;
               }
               Setuint32(surrogateAtScaffoldEnds, (uint32) LookupValueInHashTable_AS(CTGtoLastUTG, AS_UID_toInteger(scf->contig_pairs[lastCtg].econtig2), 0), &count);
               count = scf->iaccession;
            }
            break;
         default:
            break;
      }
   }
   fclose(infp);
  



   uint32 *ret       = NULL;
   uint32 *atScfEnd  = NULL;

   // open the tig store for in-place writing (we don't increment the version since CGW always reads a fixed version initially)
   // this also removes any partitioning

   MultiAlignStore *tigStore = new MultiAlignStore(tigStoreName, tigStoreVers, 0, 0, TRUE, TRUE);

   for (uint32 i = 0; i < tigStore->numUnitigs(); i++) {
      uint32 *ret      = Getuint32(surrogateCount, i);
      uint32 *atScfEnd = Getuint32(surrogateAtScaffoldEnds, i);
      uint32 *length   = Getuint32(unitigLength, i);

      bool toggled = false;
                     
      if (ret != NULL && (*ret) == (uint32)numInstances && numInstances != 0) {
         toggled = TRUE;
      } 

      // if we find a surrogate that has two instances and it is at scaffold ends mark toggle it as well
      else if (ret != NULL && (*ret) == NUM_INSTANCES_AT_SCAFFOLD_ENDS && atScfEnd != NULL && (*atScfEnd) == UINT32_MAX) {
         toggled = TRUE;
      }   

      // special case, mark non-singleton unitigs as unique if we are given no instances
      else if (numInstances == 0 && (length != NULL && (*length) >= minLength) && tigStore->getNumFrags(i, TRUE) > 1) {
         toggled = TRUE;
      }
      
      if (toggled) {
         tigStore->setUnitigFUR(i, AS_FORCED_UNIQUE);
         numToggled++;
      }      
   }
   
   DeleteHashTable_AS(UIDtoIID);
   DeleteHashTable_AS(CTGtoFirstUTG);
   DeleteHashTable_AS(CTGtoLastUTG);

   delete tigStore;
   
   fprintf(stderr, "Toggled %d\n", numToggled);
   
   return 0;
}