Пример #1
0
//  Align 'a' against 'b' (or against the reverse complement of 'b') with
//  alignLinker() and package the result as an ALNoverlap.
//
//  Parameters:
//    a, b          Sequences to align.  'b' must be NUL-terminated (strlen()
//                  is applied to it).  When 'opposite' is set, 'b' is
//                  reverse-complemented in place for the duration of the
//                  alignment and is restored on every return path.
//    begUNUSED, endUNUSED, thresh, minlen, what
//                  Not referenced by this function.
//    ahang, bhang  Forwarded unchanged to alignLinker().
//    opposite      Non-zero to align against the reverse complement of 'b'.
//    erate         Maximum accepted value of diffs / alignment length.
//
//  Returns NULL if alignLinker() reports an empty alignment, if the alignment
//  starts in the interior of both sequences, or if the fraction of differing
//  columns exceeds 'erate'.  Otherwise returns a pointer to the overlap.
//
//  The overlap, its trace and the aligned strings live in a function-static
//  Optimal_Overlap_Data that is allocated on first use and reused by every
//  call, so a returned pointer is only meaningful until the next call.
//
ALNoverlap *
Optimal_Overlap_AS_forCNS(char *a, char *b,
                          int begUNUSED, int endUNUSED,
                          int ahang, int bhang,
                          int opposite,
                          double erate, double thresh, int minlen,
                          CompareOptions what) {
    static Optimal_Overlap_Data  *ood = NULL;

    if (ood == NULL)
        ood = new Optimal_Overlap_Data;

    //  Clear the scratch buffers left over from the previous call.
    memset(ood->h_alignA, 0, sizeof(char) * (AS_MAX_READLEN + AS_MAX_READLEN + 2));
    memset(ood->h_alignB, 0, sizeof(char) * (AS_MAX_READLEN + AS_MAX_READLEN + 2));
    memset(ood->h_trace,  0, sizeof(int)  * (AS_MAX_READLEN + AS_MAX_READLEN + 2));

    alignLinker_s   al;

    //  Clamping of erate is intentionally compiled out.
#if 0
    if (erate > AS_MAX_ERROR_RATE) {
        //fprintf(stderr, "Optimal_Overlap_AS_forCNS()--  erate=%f >= AS_MAX_ERROR_RATE=%f, reset to max\n", erate, (double)AS_MAX_ERROR_RATE);
        erate = AS_MAX_ERROR_RATE;
    }
    assert((0.0 <= erate) && (erate <= AS_MAX_ERROR_RATE));
#endif

    //  Temporarily flip 'b' in the caller's buffer; it is flipped back below.
    if (opposite)
        reverseComplementSequence(b, strlen(b));

    alignLinker(ood->h_alignA,
                ood->h_alignB,
                a,
                b,
                &al,
                true,   //  Looking for global end-to-end alignments
                false,  //  Count matches to N as matches
                ahang, bhang);

    if (al.alignLen == 0) {
        //  Nothing aligned.  Undo the in-place reverse complement before
        //  bailing out so the caller's 'b' is never left modified.
        if (opposite)
            reverseComplementSequence(b, strlen(b));
        return NULL;
    }

    if (opposite) {
        //  Restore 'b', and express the alignment in the original orientation
        //  of 'b': flip both aligned strings and mirror the B coordinates.
        reverseComplementSequence(b, strlen(b));

        reverseComplementSequence(ood->h_alignA, al.alignLen);
        reverseComplementSequence(ood->h_alignB, al.alignLen);

        int x = al.begJ;
        al.begJ = al.lenB - al.endJ;
        al.endJ = al.lenB - x;
    }

    //  We don't expect partial overlaps here.  At least one fragment
    //  must have an alignment to the very start.
    //
    //  ECR depends on this return value; it is allowed to fail
    //  when building a new unitig multialign.  For example:
    //
    //  <-----------------------
    //        ------>
    //
    //  When ECR tries to extend the second fragment, it checks that
    //  the extended fragment overlaps the next contig.  It does not
    //  check that the extended bits agree with the first fragment,
    //  leaving that up to "does the unitig rebuild".
    //
    if ((al.begJ != 0) && (al.begI != 0))
        return(NULL);

    //  begpos/endpos are positive when taken from A's start offset / B's
    //  unaligned tail, and negative when taken from B's start offset / A's
    //  unaligned tail.
    ood->o.begpos  = (al.begI           > 0) ? (al.begI)           : -(al.begJ);
    ood->o.endpos  = (al.lenB - al.endJ > 0) ? (al.lenB - al.endJ) : -(al.lenA - al.endI);
    ood->o.length  = al.alignLen;
    ood->o.diffs   = 0;
    ood->o.comp    = opposite;
    ood->o.trace   = ood->h_trace;

    {
        int x=0;

        int tp = 0;          //  Next free slot in the trace.
        int ap = al.begI;    //  Current position in A.
        int bp = al.begJ;    //  Current position in B.

        for (x=0; x<al.alignLen; x++) {
            //  A gap in A is recorded as a negative (1-based) A position, a
            //  gap in B as a positive (1-based) B position.  The gapped
            //  sequence's position is decremented here so that the
            //  unconditional increment at the bottom leaves it unchanged.
            if (ood->h_alignA[x] == '-') {
                ood->h_trace[tp++] = -(ap + 1);
                ap--;
            }
            if (ood->h_alignB[x] == '-') {
                ood->h_trace[tp++] = bp + 1;
                bp--;
            }

            //  Count the differences.
            //
            //  A column is a difference when the two letters differ ignoring
            //  case, unless either letter is N/n, or A is a lowercase letter
            //  opposite a gap in B.
            //
            //  The <ctype.h> functions require a value representable as
            //  unsigned char, so convert before calling them.
            //
            unsigned char  ca = (unsigned char)ood->h_alignA[x];
            unsigned char  cb = (unsigned char)ood->h_alignB[x];

            bool  diff   = false;
            bool  ignore = false;

            if (toupper(ca) != toupper(cb))
                diff = true;

            if ((ood->h_alignA[x] == 'N') || (ood->h_alignA[x] == 'n') ||
                    (ood->h_alignB[x] == 'N') || (ood->h_alignB[x] == 'n'))
                ignore = true;

            if (islower(ca) && (ood->h_alignB[x] == '-'))
                ignore = true;

            if ((diff == true) && (ignore == false))
                ood->o.diffs++;

            bp++;
            ap++;
        }

        //  The trace is terminated by a zero entry.
        ood->h_trace[tp] = 0;
    }

    //  Accept the overlap only if its error fraction is within 'erate'.
    //  length is non-zero here: the empty alignment was rejected above.
    if ((double)ood->o.diffs / ood->o.length <= erate)
        return(&ood->o);

    return(NULL);
}
Пример #2
0
/*
 * Decide whether either of two linker sequences occurs in seq.
 *
 * Both link1 and link2 are aligned to seq with alignLinker().  A linker
 * qualifies when its identity fraction (id / alnlen) is at least `identity`
 * and its covered span (end - start + 1) is at least `matchLen`; when
 * allowTail is non-zero, a span of at least `minLengthToCallLinker` also
 * qualifies provided strlen(seq) - end <= 2.
 *
 * If both linkers qualify, the one with the strictly higher score wins and
 * ties go to link2.
 *
 * On success *st, *en and *pId receive the winner's start, end and identity
 * fraction and 1 is returned.  Otherwise 0 is returned and the output
 * parameters are left untouched.  Diagnostics go to stderr depending on the
 * `verbose` level.
 */
int foundLinker(char *seq,char *link1,char *link2,int *st,int *en,float *pId,
                int allowTail) {
  /* Per-linker results: slot 0 belongs to link1, slot 1 to link2. */
  char *linker[2];
  int   score[2], alnlen[2], id[2], start[2], end[2], coverage[2];
  float pct[2];
  int   hit[2] = { 0, 0 };
  int   best = -1;              /* index of the chosen linker, -1 for none */
  int   k;

  linker[0] = link1;
  linker[1] = link2;

  /* Run both alignments first, then derive identity and coverage. */
  for (k = 0; k < 2; k++)
    score[k] = alignLinker(seq,linker[k],&alnlen[k],&id[k],&start[k],&end[k]);

  for (k = 0; k < 2; k++) {
    pct[k]      = ((float) id[k]) / ((float) alnlen[k]);
    coverage[k] = end[k]-start[k]+1;
  }

  for (k = 0; k < 2; k++) {
    if (pct[k] >= identity) {
      if (coverage[k] >= matchLen) {
        hit[k] = 1;
      } else if (allowTail && strlen(seq)-end[k] <= 2 &&
                 coverage[k] >= minLengthToCallLinker) {
        /* Shorter match accepted because it sits at the tail of seq. */
        hit[k] = 1;
      }
    }
  }

  if (verbose > 1) {
    if (hit[0]) {
      fprintf(stderr,"found1: %s score=%d id=%d alnlen=%d\n",seq,score[0],id[0],alnlen[0]);
      fprintf(stderr,"        %*s%s\n",start[0],"",link1);
    }
    if (hit[1]) {
      fprintf(stderr,"found2: %s score=%d id=%d alnlen=%d\n",seq,score[1],id[1],alnlen[1]);
      fprintf(stderr,"        %*s%s\n",start[1],"",link2);
    }
  }

  if (hit[0] && (!hit[1] || score[0] > score[1])) {
    best = 0;
  } else if (hit[1] && (!hit[0] || score[1] >= score[0])) {
    best = 1;
  } else if (hit[0] || hit[1]) {
    /*
     * Defensive fallback: with integer scores one of the two branches above
     * always fires when a linker qualified, so this is not expected to run.
     */
    fprintf(stderr,"ERROR: ambiguous case in foundLinker\n");
    fprintf(stderr,"  seq:  %s\n",seq);
    fprintf(stderr,"  lnk1: %s score1=%d id1=%d start1=%d end1=%d alnlen1=%d\n",
            link1,score[0],id[0],start[0],end[0],alnlen[0]);
    fprintf(stderr,"  lnk2: %s score2=%d id2=%d start2=%d end2=%d alnlen2=%d\n",
            link2,score[1],id[1],start[1],end[1],alnlen[1]);
  }

  if (best < 0) {
    if (verbose) fprintf(stderr,"no match %s\n",seq);
    return 0;
  }

  *pId = pct[best];
  *st = start[best];
  *en = end[best];
  if (verbose) {
    fprintf(stderr,"final: %s score=%d id=%d alnlen=%d\n",seq,score[best],id[best],alnlen[best]);
    fprintf(stderr,"       %*s%s\n",start[best],"",linker[best]);
  }
  return 1;
}