Ejemplo n.º 1
0
//  Load every read in [G->bgnID, G->endID] from gkpStore, apply the corrections stored in the
//  file G->correctionsName, and save the results in G:
//
//    G->bases    - one allocation holding the corrected sequence of every read; each read is
//                  given its corrected length plus one byte.
//    G->adjusts  - one allocation holding the indel adjustments of every read.
//    G->reads    - one Frag_Info_t per read, pointing into the two arrays above and carrying
//                  the keep_left / keep_right flags from the read's IDENT record.
//
//  On return G->basesLen, G->adjustsLen and G->readsLen describe how much of each array is in
//  use.  The three arrays are allocated here with new[] and are NOT released by this function.
//
//  A summary of the work done is written to stderr.  If the IDENT record of a read cannot be
//  found in the corrections file, a diagnostic is printed and an assert fires.
//
void
Correct_Frags(coParameters *G,
              gkStore      *gkpStore) {

  //  The original converted to lowercase, and made non-acgt be 'a'.

  for (uint32 i=0; i<256; i++)
    filter[i] = 'a';

  filter['A'] = filter['a'] = 'a';
  filter['C'] = filter['c'] = 'c';
  filter['G'] = filter['g'] = 'g';
  filter['T'] = filter['t'] = 't';

  //  Open the corrections, as an array.

  memoryMappedFile     *Cfile = new memoryMappedFile(G->correctionsName);
  Correction_Output_t  *C     = (Correction_Output_t *)Cfile->get();
  uint64                Cpos  = 0;
  uint64                Clen  = Cfile->length() / sizeof(Correction_Output_t);

  fprintf(stderr, "Reading " F_U64 " corrections from '%s'.\n", Clen, G->correctionsName);

  //  Count the number of bases, so we can do two gigantic allocations for bases and adjustments.
  //  Adjustments are always less than the number of corrections; we could also count exactly.

  G->basesLen   = 0;
  G->adjustsLen = 0;

  for (uint32 curID=G->bgnID; curID<=G->endID; curID++) {
    gkRead *read = gkpStore->gkStore_getRead(curID);

    G->basesLen += read->gkRead_sequenceLength() + 1;
  }

  //  Only indel corrections are counted; they are the ones that get an adjustment slot.

  for (uint64 c=0; c<Clen; c++) {
    switch (C[c].type) {
      case DELETE:
      case A_INSERT:
      case C_INSERT:
      case G_INSERT:
      case T_INSERT:
        G->adjustsLen++;
        break;
      default:
        break;
    }
  }

  fprintf(stderr, "Correcting " F_U64 " bases with " F_U64 " indel adjustments.\n", G->basesLen, G->adjustsLen);

  G->bases        = new char          [G->basesLen];
  G->adjusts      = new Adjust_t      [G->adjustsLen];
  G->reads        = new Frag_Info_t   [G->endID - G->bgnID + 1];
  G->readsLen     = 0;

  //  The lengths are reset and now track how much of each array has been filled.

  G->basesLen   = 0;
  G->adjustsLen = 0;

  //  Per-type tally of applied corrections, indexed by correction type; filled by correctRead().

  uint64   changes[12] = {0};

  //  Load reads and apply corrections for each one.

  gkReadData *readData = new gkReadData;

  for (uint32 curID=G->bgnID; curID<=G->endID; curID++) {
    gkRead *read       = gkpStore->gkStore_getRead(curID);

    gkpStore->gkStore_loadReadData(read, readData);

    uint32  readLength = read->gkRead_sequenceLength();
    char   *readBases  = readData->gkReadData_getSequence();

    //  Save pointers to the bases and adjustments.

    Frag_Info_t  *frag = G->reads + G->readsLen;

    frag->bases       = G->bases   + G->basesLen;
    frag->basesLen    = 0;
    frag->adjusts     = G->adjusts + G->adjustsLen;
    frag->adjustsLen  = 0;

    //  Find the correct corrections.

    while ((Cpos < Clen) && (C[Cpos].readID < curID))
      Cpos++;

    //  We should be at the IDENT message.  Never touch C[Cpos] if we ran off the end of the
    //  corrections; report that case separately.

    if (Cpos >= Clen) {
      fprintf(stderr, "ERROR: ran out of corrections (Cpos=" F_U64 " Clen=" F_U64 ") looking for read %u\n",
              Cpos, Clen, curID);

    } else if (C[Cpos].type != IDENT) {
      fprintf(stderr, "ERROR: didn't find IDENT at Cpos=" F_U64 " for read %u\n", Cpos, curID);
      fprintf(stderr, "       C[Cpos] = keep_left=%u keep_right=%u type=%u pos=%u readID=%u\n",
              C[Cpos].keep_left,
              C[Cpos].keep_right,
              C[Cpos].type,
              C[Cpos].pos,
              C[Cpos].readID);
    }

    assert(Cpos < Clen);
    assert(C[Cpos].type == IDENT);

    frag->keep_left  = C[Cpos].keep_left;
    frag->keep_right = C[Cpos].keep_right;

    //  Cpos is deliberately left on the IDENT record; it is handed to correctRead() as is.

    //  Now do the corrections.

    correctRead(curID,
                frag->bases,
                frag->basesLen,
                frag->adjusts,
                frag->adjustsLen,
                readBases,
                readLength,
                C,
                Cpos,
                Clen,
                changes);

    //  Update the lengths in the globals.  The +1 matches the extra byte per read that was
    //  counted when sizing G->bases.

    G->basesLen   += frag->basesLen   + 1;
    G->adjustsLen += frag->adjustsLen;
    G->readsLen   += 1;
  }

  delete readData;
  delete Cfile;

  fprintf(stderr, "Corrected " F_U64 " bases with " F_U64 " substitutions, " F_U64 " deletions and " F_U64 " insertions.\n",
          G->basesLen,
          changes[A_SUBST] + changes[C_SUBST] + changes[G_SUBST] + changes[T_SUBST],
          changes[DELETE],
          changes[A_INSERT] + changes[C_INSERT] + changes[G_INSERT] + changes[T_INSERT]);
}
//  Recompute the error of every overlap in G->olaps.
//
//  The overlaps are walked in order of their B read: for each B read ID between the first and
//  last overlap's b_iid, the read is loaded from gkpStore, corrected with the corrections in
//  G->correctionsName, and reverse-complemented.  Every overlap with that B read is then
//  realigned against the already-corrected A read held in G->reads (indexed by
//  a_iid - G->bgnID), and on success olaps[].evalue is replaced with the encoded revised
//  error.  Overlaps whose realignment fails keep their existing evalue.
//
//  Side effects:
//    - G->bases, G->adjusts and G->reads are released and set to NULL before returning.
//    - Progress and summary statistics are written to stderr.
//
//  NOTE(review): the single forward pass over G->olaps appears to rely on the overlaps being
//  sorted by b_iid - confirm against the loader.
//
void
Redo_Olaps(coParameters *G, gkStore *gkpStore) {

  //  With no overlaps there is nothing to recompute, and 'olapsLen - 1' below would wrap around.
  //  Still release the per-read storage, as the normal path does.

  if (G->olapsLen == 0) {
    fprintf(stderr, "--  No overlaps to recompute.\n");
    fprintf(stderr, "--  Release bases, adjusts and reads.\n");

    delete [] G->bases;     G->bases   = NULL;
    delete [] G->adjusts;   G->adjusts = NULL;
    delete [] G->reads;     G->reads   = NULL;

    return;
  }

  //  Figure out the range of B reads we care about.  We probably could just loop over every read in
  //  the store with minimal penalty.

  uint64     thisOvl = 0;
  uint64     lastOvl = G->olapsLen - 1;

  uint32     loBid   = G->olaps[thisOvl].b_iid;
  uint32     hiBid   = G->olaps[lastOvl].b_iid;

  //  Open all the corrections.

  memoryMappedFile     *Cfile = new memoryMappedFile(G->correctionsName);
  Correction_Output_t  *C     = (Correction_Output_t *)Cfile->get();
  uint64                Cpos  = 0;
  uint64                Clen  = Cfile->length() / sizeof(Correction_Output_t);

  //  Allocate some temporary work space for the forward and reverse corrected B reads.
  //  The sizeof() expressions are size_t; cast them so they match the F_U64 format.

  fprintf(stderr, "--Allocate " F_U64 " MB for fseq and rseq.\n", (uint64)((2 * sizeof(char) * 2 * (AS_MAX_READLEN + 1)) >> 20));
  char          *fseq    = new char     [AS_MAX_READLEN + 1 + AS_MAX_READLEN + 1];
  uint32         fseqLen = 0;

  char          *rseq    = new char     [AS_MAX_READLEN + 1 + AS_MAX_READLEN + 1];

  fprintf(stderr, "--Allocate " F_U64 " MB for fadj and radj.\n", (uint64)((2 * sizeof(Adjust_t) * (AS_MAX_READLEN + 1)) >> 20));
  Adjust_t      *fadj    = new Adjust_t [AS_MAX_READLEN + 1];
  Adjust_t      *radj    = new Adjust_t [AS_MAX_READLEN + 1];
  uint32         fadjLen  = 0;  //  radj is the same length

  fprintf(stderr, "--Allocate " F_U64 " MB for pedWorkArea_t.\n", (uint64)(sizeof(pedWorkArea_t) >> 20));
  gkReadData    *readData = new gkReadData;
  pedWorkArea_t *ped      = new pedWorkArea_t;

  uint64         Total_Alignments_Ct           = 0;

  uint64         Failed_Alignments_Ct          = 0;   //  failed either test below
  uint64         Failed_Alignments_Both_Ct     = 0;   //  failed both tests below
  uint64         Failed_Alignments_End_Ct      = 0;   //  match_to_end was false
  uint64         Failed_Alignments_Length_Ct   = 0;   //  olapLen was not positive

  //  Pass/fail counts for alignments where the B read start was hang-adjusted (a_hang < 0).

  uint32         rhaFail = 0;
  uint32         rhaPass = 0;

  uint64         olapsFwd = 0;
  uint64         olapsRev = 0;

  ped->initialize(G, G->errorRate);

  //  Process overlaps.  Loop over the B reads, and recompute each overlap.

  for (uint32 curID=loBid; curID<=hiBid; curID++) {
    if (((curID - loBid) % 1024) == 0)
      fprintf(stderr, "Recomputing overlaps - %9u - %9u - %9u\r", loBid, curID, hiBid);

    //  Skip reads that are not the B read of the next overlap.

    if (curID < G->olaps[thisOvl].b_iid)
      continue;

    gkRead *read = gkpStore->gkStore_getRead(curID);

    gkpStore->gkStore_loadReadData(read, readData);

    //  Apply corrections to the B read (also converts to lower case, reverses it, etc)

    //fprintf(stderr, "Correcting B read %u at Cpos=%u\n", curID, Cpos);

    fseqLen = 0;
    fadjLen = 0;

    correctRead(curID,
                fseq, fseqLen, fadj, fadjLen,
                readData->gkReadData_getSequence(),
                read->gkRead_sequenceLength(),
                C, Cpos, Clen);

    //  Create copies of the sequence for forward and reverse.  There isn't a need for the forward copy (except that
    //  we mutate it with corrections), and the reverse copy could be deferred until it is needed.

    memcpy(rseq, fseq, sizeof(char) * (fseqLen + 1));

    reverseComplementSequence(rseq, fseqLen);

    Make_Rev_Adjust(radj, fadj, fadjLen, fseqLen);

    //  Recompute alignments for all overlaps involving the B read.

    for (; ((thisOvl <= lastOvl) &&
            (G->olaps[thisOvl].b_iid == curID)); thisOvl++) {
      Olap_Info_t  *olap = G->olaps + thisOvl;

      //fprintf(stderr, "processing overlap %u - %u\n", olap->a_iid, olap->b_iid);

      //  Find the A segment.  It's always forward.  It's already been corrected.

      char *a_part = G->reads[olap->a_iid - G->bgnID].bases;

      if (olap->a_hang > 0) {
        int32 ha = Hang_Adjust(olap->a_hang,
                               G->reads[olap->a_iid - G->bgnID].adjusts,
                               G->reads[olap->a_iid - G->bgnID].adjustsLen);
        a_part += ha;
        //fprintf(stderr, "offset a_part by ha=%d\n", ha);
      }

      //  Find the B segment.

      char *b_part = (olap->normal == true) ? fseq : rseq;

      //if (olap->normal == true)
      //  fprintf(stderr, "b_part = fseq %40.40s\n", fseq);
      //else
      //  fprintf(stderr, "b_part = rseq %40.40s\n", rseq);

      if (olap->normal == true)
        olapsFwd++;
      else
        olapsRev++;

      bool rha=false;
      if (olap->a_hang < 0) {
        int32 ha = (olap->normal == true) ? Hang_Adjust(-olap->a_hang, fadj, fadjLen) :
                                            Hang_Adjust(-olap->a_hang, radj, fadjLen);
        b_part += ha;
        //fprintf(stderr, "offset b_part by ha=%d normal=%d\n", ha, olap->normal);
        rha=true;
      }

      //  Compute the alignment.

      int32   a_part_len  = strlen(a_part);
      int32   b_part_len  = strlen(b_part);
      int32   olap_len    = min(a_part_len, b_part_len);

      int32   a_end        = 0;
      int32   b_end        = 0;
      bool    match_to_end = false;

      //fprintf(stderr, ">A\n%s\n", a_part);
      //fprintf(stderr, ">B\n%s\n", b_part);

      int32 errors = Prefix_Edit_Dist(a_part, a_part_len,
                                      b_part, b_part_len,
                                      G->Error_Bound[olap_len],
                                      a_end,
                                      b_end,
                                      match_to_end,
                                      ped);

      //  If the alignment opens with a run of identical delta entries pointing the same way as
      //  the hang, drop that run (at most |a_hang| entries) and shift the start of the matching
      //  sequence past it, discounting those entries from the error count.

      //  ??  These both occur, but the first is much much more common.

      if ((ped->deltaLen > 0) && (ped->delta[0] == 1) && (0 < olap->a_hang)) {
        int32  stop = min(ped->deltaLen, (int32)olap->a_hang);  //  a_hang is int32:31!
        int32  i = 0;

        for  (i=0; (i < stop) && (ped->delta[i] == 1); i++)
          ;

        //fprintf(stderr, "RESET 1 i=%d delta=%d\n", i, ped->delta[i]);
        assert((i == stop) || (ped->delta[i] != -1));

        ped->deltaLen -= i;

        memmove(ped->delta, ped->delta + i, ped->deltaLen * sizeof (int));

        a_part     += i;
        a_end      -= i;
        a_part_len -= i;
        errors     -= i;

      } else if ((ped->deltaLen > 0) && (ped->delta[0] == -1) && (olap->a_hang < 0)) {
        int32  stop = min(ped->deltaLen, - olap->a_hang);
        int32  i = 0;

        for  (i=0; (i < stop) && (ped->delta[i] == -1); i++)
          ;

        //fprintf(stderr, "RESET 2 i=%d delta=%d\n", i, ped->delta[i]);
        assert((i == stop) || (ped->delta[i] != 1));

        ped->deltaLen -= i;

        memmove(ped->delta, ped->delta + i, ped->deltaLen * sizeof (int));

        b_part     += i;
        b_end      -= i;
        b_part_len -= i;
        errors     -= i;
      }


      Total_Alignments_Ct++;


      int32  olapLen = min(a_end, b_end);

      if ((match_to_end == false) && (olapLen <= 0))
        Failed_Alignments_Both_Ct++;

      if (match_to_end == false)
        Failed_Alignments_End_Ct++;

      if (olapLen <= 0)
        Failed_Alignments_Length_Ct++;

      if ((match_to_end == false) || (olapLen <= 0)) {
        Failed_Alignments_Ct++;

#if 0
        //  I can't find any patterns in these errors.  I thought that it was caused by the corrections, but I
        //  found a case where no corrections were made and the alignment still failed.  Perhaps it is differences
        //  in the alignment code (the forward vs reverse prefix distance in overlapper vs only the forward here)?
        //
        //  NOTE(review): this disabled block indexes G->reads with raw read IDs, while the live
        //  code above uses 'a_iid - G->bgnID'; fix the indexing before enabling it.

        fprintf(stderr, "Redo_Olaps()--\n");
        fprintf(stderr, "Redo_Olaps()--\n");
        fprintf(stderr, "Redo_Olaps()--  Bad alignment  errors %d  a_end %d  b_end %d  match_to_end %d  olapLen %d\n",
                errors, a_end, b_end, match_to_end, olapLen);
        fprintf(stderr, "Redo_Olaps()--  Overlap        a_hang %d b_hang %d innie %d\n",
                olap->a_hang, olap->b_hang, olap->innie);
        fprintf(stderr, "Redo_Olaps()--  Reads          a_id %u a_length %d b_id %u b_length %d\n",
                G->olaps[thisOvl].a_iid,
                G->reads[ G->olaps[thisOvl].a_iid ].basesLen,
                G->olaps[thisOvl].b_iid,
                G->reads[ G->olaps[thisOvl].b_iid ].basesLen);
        fprintf(stderr, "Redo_Olaps()--  A %s\n", a_part);
        fprintf(stderr, "Redo_Olaps()--  B %s\n", b_part);

        Display_Alignment(a_part, a_part_len, b_part, b_part_len, ped->delta, ped->deltaLen);

        fprintf(stderr, "\n");
#endif

        if (rha)
          rhaFail++;

        //  Failed alignments leave the overlap's evalue as it was.

        continue;
      }

      if (rha)
        rhaPass++;

      //  olapLen is strictly positive here, so the division is safe.

      olap->evalue = AS_OVS_encodeEvalue((double)errors / olapLen);

      //fprintf(stderr, "REDO - errors = %u / olapLep = %u -- %f\n", errors, olapLen, AS_OVS_decodeEvalue(G->olaps[thisOvl].evalue));
    }
  }

  fprintf(stderr, "\n");

  delete    ped;
  delete    readData;
  delete [] radj;
  delete [] fadj;
  delete [] rseq;
  delete [] fseq;
  delete    Cfile;

  fprintf(stderr, "--  Release bases, adjusts and reads.\n");

  delete [] G->bases;     G->bases   = NULL;
  delete [] G->adjusts;   G->adjusts = NULL;
  delete [] G->reads;     G->reads   = NULL;

  fprintf(stderr, "Olaps Fwd " F_U64 "\n", olapsFwd);
  fprintf(stderr, "Olaps Rev " F_U64 "\n", olapsRev);

  fprintf(stderr, "Total:  " F_U64 "\n", Total_Alignments_Ct);
  fprintf(stderr, "Failed: " F_U64 " (both)\n", Failed_Alignments_Both_Ct);
  fprintf(stderr, "Failed: " F_U64 " (either)\n", Failed_Alignments_Ct);
  fprintf(stderr, "Failed: " F_U64 " (match to end)\n", Failed_Alignments_End_Ct);
  fprintf(stderr, "Failed: " F_U64 " (negative length)\n", Failed_Alignments_Length_Ct);

  fprintf(stderr, "rhaFail %u rhaPass %u\n", rhaFail, rhaPass);
}