static
int
Rev_Prefix_Edit_Dist (char A[], int m, char T[], int n, int Error_Limit,
                      int * A_End, int * T_End, int * Leftover, int * Match_To_End,
                      Work_Area_t * WA) {

  double  Score, Max_Score;
  int  Max_Score_Len = 0, Max_Score_Best_d = 0, Max_Score_Best_e = 0;
  int  Tail_Len;
  int  Best_d, Best_e, From, Last, Longest, Max, Row;
  int  Left, Right;
  int  d, e, j, k;


  assert (m <= n);
  Best_d = Best_e = Longest = 0;
  WA->Left_Delta_Len = 0;

  for  (Row = 0;  Row < m
          && (A[- Row] == T[- Row]
              || A[- Row] == DONT_KNOW_CHAR
              || T[- Row] == DONT_KNOW_CHAR);  Row++)
    ;

  if (WA->Edit_Array_Lazy[0] == NULL)
    Allocate_More_Edit_Space(WA);

  WA->Edit_Array_Lazy[0][0] = Row;

  if  (Row == m) {
    (* A_End) = (* T_End) = - m;
    (* Leftover) = m;
    (* Match_To_End) = TRUE;
    return  0;
  }

  Left = Right = 0;
  Max_Score = 0.0;
  for  (e = 1;  e <= Error_Limit;  e++) {
    if (WA->Edit_Array_Lazy[e] == NULL)
      Allocate_More_Edit_Space(WA);

    Left = MAX (Left - 1, -e);
    Right = MIN (Right + 1, e);
    WA->Edit_Array_Lazy[e - 1][Left] = -2;
    WA->Edit_Array_Lazy[e - 1][Left - 1] = -2;
    WA->Edit_Array_Lazy[e - 1][Right] = -2;
    WA->Edit_Array_Lazy[e - 1][Right + 1] = -2;

    for  (d = Left;  d <= Right;  d++) {
      Row = 1 + WA->Edit_Array_Lazy[e - 1][d];
      if  ((j = WA->Edit_Array_Lazy[e - 1][d - 1]) > Row)
        Row = j;
      if  ((j = 1 + WA->Edit_Array_Lazy[e - 1][d + 1]) > Row)
        Row = j;
      while  (Row < m && Row + d < n
              && (A[- Row] == T[- Row - d]
                  || A[- Row] == DONT_KNOW_CHAR
                  || T[- Row - d] == DONT_KNOW_CHAR))
        Row++;

      WA->Edit_Array_Lazy[e][d] = Row;

      if  (Row == m || Row + d == n) {

        //  Check for branch point here caused by uneven
        //  distribution of errors

        Score = Row * Branch_Match_Value - e;
        // Assumes  Branch_Match_Value
        //             - Branch_Error_Value == 1.0
        Tail_Len = Row - Max_Score_Len;
        if  ((Doing_Partial_Overlaps && Score < Max_Score)
             || (e > MIN_BRANCH_END_DIST / 2
                 && Tail_Len >= MIN_BRANCH_END_DIST
                 && (Max_Score - Score) / Tail_Len >= MIN_BRANCH_TAIL_SLOPE)) {
          (* A_End) = - Max_Score_Len;
          (* T_End) = - Max_Score_Len - Max_Score_Best_d;
          Set_Left_Delta (Max_Score_Best_e, Max_Score_Best_d,
                          Leftover, T_End, n, WA);
          (* Match_To_End) = FALSE;
          return  Max_Score_Best_e;
        }

        (* A_End) = - Row;           // One past last align position
        (* T_End) = - Row - d;
        Set_Left_Delta (e, d, Leftover, T_End, n, WA);
        (* Match_To_End) = TRUE;
        return  e;
      }
    }

    while  (Left <= Right && Left < 0
            && WA->Edit_Array_Lazy[e][Left] < WA->Edit_Match_Limit[e])
      Left++;

    if  (Left >= 0)
      while  (Left <= Right
              && WA->Edit_Array_Lazy[e][Left] + Left < WA->Edit_Match_Limit[e])
        Left++;

    if  (Left > Right)
      break;

    while  (Right > 0
            && WA->Edit_Array_Lazy[e][Right] + Right < WA->Edit_Match_Limit[e])
      Right--;

    if  (Right <= 0)
      while  (WA->Edit_Array_Lazy[e][Right] < WA->Edit_Match_Limit[e])
        Right--;

    assert (Left <= Right);

    for  (d = Left;  d <= Right;  d++)
      if  (WA->Edit_Array_Lazy[e][d] > Longest) {
        Best_d = d;
        Best_e = e;
        Longest = WA->Edit_Array_Lazy[e][d];
      }

    Score = Longest * Branch_Match_Value - e;

    // Assumes  Branch_Match_Value - Branch_Error_Value == 1.0
    if  (Score > Max_Score) {
      Max_Score = Score;
      Max_Score_Len = Longest;
      Max_Score_Best_d = Best_d;
      Max_Score_Best_e = Best_e;
    }
  }

  (* A_End) = - Max_Score_Len;
  (* T_End) = - Max_Score_Len - Max_Score_Best_d;
  Set_Left_Delta (Max_Score_Best_e, Max_Score_Best_d, Leftover, T_End, n, WA);
  (* Match_To_End) = FALSE;
  return  Max_Score_Best_e;
}
예제 #2
0
//  Score-tracking banded alignment of A against T in the REVERSE direction:
//  position r of A is read as A[-r] and position r + d of T as T[-r - d],
//  where d is the diagonal.
//
//  Every cell of Edit_Array_Lazy[e][d] stores the furthest row reached
//  (.row), the accumulated .dist and .errs counters, the alignment .score
//  and the diagonal it was reached from (.fromd).
//
//  Parameters
//    A, Alen       first sequence and length; Alen <= Tlen is asserted
//    T, Tlen       second sequence and length
//    A_End, T_End  receive the (non-positive) offsets one past the last
//                  aligned position in A and T
//    Leftover      set to Alen on an exact match; otherwise handed to
//                  Set_Left_Delta (presumably filled in there -- confirm)
//    Match_To_End  true if the alignment reached the end of A or T,
//                  false if the search stopped first
//
//  Left_Score and Left_Delta_Len are written directly only on the
//  exact-match path; on the other paths they are presumably maintained by
//  Set_Left_Delta (TODO confirm).
//
//  NOTE(review): if allocateMoreEditSpace() fails inside the main loop the
//  function returns WITHOUT writing any of the output parameters; callers
//  must not rely on them in that case.
void
NDalgorithm::reverse(char    *A,   int32 Alen,   //  first sequence and length
                     char    *T,   int32 Tlen,   //  second sequence and length
                     int32   &A_End,
                     int32   &T_End,
                     int32   &Leftover,      //  <- novel
                     bool    &Match_To_End) {

    assert (Alen <= Tlen);

    int32  Best_d      = 0;
    int32  Best_e      = 0;
    int32  Best_row    = 0;
    int32  Best_score  = 0;


    int32  Row = 0;
    int32  Dst = 0;
    int32  Err = 0;
    int32  Sco = 0;
    int32  fromd = 0;

    //  Skip ahead over matches.  The original used to also skip if either sequence was N.
    while ((Row < Alen) && (isMatch(A[-Row], T[-Row]))) {
        Sco += matchScore(A[-Row], T[-Row]);
        Row++;
    }

    //  NOTE(review): the result of this allocation is not checked, unlike the
    //  one inside the main loop below.
    if (Edit_Array_Lazy[0] == NULL)
        allocateMoreEditSpace();

    Edit_Array_Lazy[0][0].row    = Row;
    Edit_Array_Lazy[0][0].dist   = Dst;
    Edit_Array_Lazy[0][0].errs   = 0;
    Edit_Array_Lazy[0][0].score  = Sco;
    Edit_Array_Lazy[0][0].fromd  = INT32_MAX;

    //  Exact match?

    if (Row == Alen) {
        A_End        = -Alen;
        T_End        = -Alen;
        Leftover     =  Alen;
        Match_To_End = true;
#ifdef DEBUG
        fprintf(stderr, "NDalgorithm::reverse()-- exact match\n");
#endif

        Left_Score       = Sco;
        Left_Delta_Len   = 0;

        return;
    }

    //  Current band of diagonals, inclusive.
    int32  Left  = 0;
    int32  Right = 0;

    //  Best-scoring band cell seen so far; reported if no diagonal ever
    //  reaches the end of A or T.
    int32  Max_Score         = PEDMINSCORE;
    int32  Max_Score_Len     = 0;
    int32  Max_Score_Best_d  = 0;
    int32  Max_Score_Best_e  = 0;

    for (int32 ei=1; ei <= Edit_Space_Max; ei++) {
        if (Edit_Array_Lazy[ei] == NULL)
            if (allocateMoreEditSpace() == false) {
                //  FAIL -- outputs are left untouched (see NOTE in the header).
                return;
            }

        //  Widen the band by one diagonal on each side, never beyond +-ei.
        Left  = MAX (Left  - 1, -ei);
        Right = MIN (Right + 1,  ei);

        //fprintf(stderr, "REVERSE ei=%d Left=%d Right=%d\n", ei, Left, Right);

        //  Reset the cells of the previous row at and just outside the new
        //  band edges, so the three-way choice below does not pick up stale
        //  data.
        Edit_Array_Lazy[ei-1][Left  - 1].init();
        Edit_Array_Lazy[ei-1][Left     ].init();
        //  Of note, [0][0] on the first iteration is not reset here.
        Edit_Array_Lazy[ei-1][Right    ].init();
        Edit_Array_Lazy[ei-1][Right + 1].init();

        for (int32 d = Left;  d <= Right;  d++) {

            //  A mismatch.
            {
                int32  aPos         = -(1 + Edit_Array_Lazy[ei-1][d].row)     + 1;  //  +1 because we need to compare the base we are at,
                int32  tPos         = -(1 + Edit_Array_Lazy[ei-1][d].row) - d + 1;  //  not the base we will be at after the mismatch

                Row   = 1 + Edit_Array_Lazy[ei-1][d].row;
                Dst   =     Edit_Array_Lazy[ei-1][d].dist  + 1;
                Err   =     Edit_Array_Lazy[ei-1][d].errs  + 1;
                fromd =     d;

                //fprintf(stderr, "aPos %d tPos %d\n", aPos, tPos);

                //  If negative, we have a pointer into valid sequence.  If not, this mismatch
                //  doesn't make sense, and the row/score are set to bogus values.

                if ((aPos <= 0) && (tPos <= 0)) {
                    assert(-aPos <= Alen);
                    assert(-tPos <= Tlen);

                    assert(A[aPos] != T[tPos]);

                    Sco = Edit_Array_Lazy[ei-1][d].score + mismatchScore(A[aPos], T[tPos]);

                } else {
                    Sco = PEDMINSCORE;
                }
            }

            //  Insert a gap in A.  Check the other sequence to see if this is a zero-cost gap.  Note
            //  agreement with future value of Row and what is used in isMatch() below.
            //  Testcase test-st-ts shows this works.

            {
                int32  tPos     = -(0 + Edit_Array_Lazy[ei-1][d-1].row) - d;

                //assert( tPos <= 0);     Not true at the lower end; we'll just skip the cell since it's invalid
                //assert(-tPos < Tlen);

                if ((tPos <= 0) && (-tPos < Tlen)) {
                    int32  gapCost = isFreeGap( T[tPos] ) ? PEDFREEGAP : PEDGAP;

                    //if (gapCost == 0)
                    //  fprintf(stderr, "NDalgorithm::reverse()--  free A gap for aPos=%d tPos=%d t=%c/%d\n", tPos + d, tPos, T[tPos], T[tPos]);

                    if (Edit_Array_Lazy[ei-1][d-1].score + gapCost > Sco) {
                        //  The counters are carried over unchanged.  The original
                        //  wrote `x + (gapCost == PEDFREEGAP) ? 0 : 0`, which parses
                        //  as `(x + (...)) ? 0 : 0` and always stored 0 instead of x.
                        //  NOTE(review): both arms of that conditional were 0, so a
                        //  paid gap in A adds nothing to dist/errs -- confirm intent.
                        Row   =     Edit_Array_Lazy[ei-1][d-1].row;
                        Dst   =     Edit_Array_Lazy[ei-1][d-1].dist;
                        Err   =     Edit_Array_Lazy[ei-1][d-1].errs;
                        Sco   =     Edit_Array_Lazy[ei-1][d-1].score +  gapCost;
                        fromd =     d-1;
                    }
                }
            }

            //  Insert a gap in T.

            {
                int32  aPos = -(1 + Edit_Array_Lazy[ei-1][d+1].row);

                //assert( aPos <= 0);
                //assert(-aPos < Tlen);

                if ((aPos <= 0) && (-aPos <= Alen)) {
                    //  NOTE(review): the free-gap cost is the literal 0 here, whereas
                    //  the gap-in-A case above uses PEDFREEGAP -- confirm they agree.
                    const bool   freeGap = isFreeGap( A[aPos] );
                    const int32  gapCost = freeGap ? 0 : PEDGAP;

                    //if (gapCost == 0)
                    //  fprintf(stderr, "NDalgorithm::reverse()--  free T gap for aPos=%d tPos=%d t=%c/%d\n", aPos, aPos - d, A[aPos], A[aPos]);

                    if (Edit_Array_Lazy[ei-1][d+1].score + gapCost > Sco) {
                        //  A free gap adds nothing to the counters, a paid gap adds
                        //  one.  The original wrote `x + (cond) ? 0 : 1`, which parses
                        //  as `(x + cond) ? 0 : 1` and stored a bare 0 or 1 instead of
                        //  x plus the increment.
                        const int32  step = freeGap ? 0 : 1;

                        Row   = 1 + Edit_Array_Lazy[ei-1][d+1].row;
                        Dst   =     Edit_Array_Lazy[ei-1][d+1].dist  + step;
                        Err   =     Edit_Array_Lazy[ei-1][d+1].errs  + step;
                        Sco   =     Edit_Array_Lazy[ei-1][d+1].score +  gapCost;
                        fromd =     d+1;
                    }
                }
            }

            //  If A or B is N, that isn't a mismatch.
            //  If A is lowercase and T is uppercase, it's a match.
            //  If A is lowercase and T doesn't match, ignore the cost of the gap in B

            //  Slide along the diagonal for as long as the bases match.
            while ((Row < Alen) && (Row + d < Tlen) && (isMatch(A[-Row], T[-Row - d]))) {
                Sco += matchScore(A[-Row], T[-Row - d]);
                Row += 1;
                Dst += 1;
                Err += 0;
            }

            Edit_Array_Lazy[ei][d].row   = Row;
            Edit_Array_Lazy[ei][d].dist  = Dst;
            Edit_Array_Lazy[ei][d].errs  = Err;
            Edit_Array_Lazy[ei][d].score = Sco;
            Edit_Array_Lazy[ei][d].fromd = fromd;


            //  Reached the end of one of the sequences: report this alignment.
            if (Row == Alen || Row + d == Tlen) {
                A_End = - Row;           // One past last align position
                T_End = - Row - d;

                Set_Left_Delta(A, T, ei, d, Leftover, T_End, Tlen);

                Match_To_End = true;

                return;  //return(ei);
            }
        }  //  Over all diagonals.

        //  Reset the band
        //
        //  The .dist used to be .row.
        //
        //  Diagonals whose .dist falls short of Edit_Match_Limit[] for their own
        //  .errs count are trimmed from both ends of the band.  Negative
        //  diagonals are tested on .dist alone, non-negative ones on
        //  .dist + diagonal.

        while  ((Left <= Right) && (Left < 0) && (Edit_Array_Lazy[ei][Left].dist < Edit_Match_Limit[ Edit_Array_Lazy[ei][Left].errs ]))
            Left++;

        if (Left >= 0)
            while  ((Left <= Right) && (Edit_Array_Lazy[ei][Left].dist + Left < Edit_Match_Limit[ Edit_Array_Lazy[ei][Left].errs ]))
                Left++;

        //  Nothing left in the band; give up extending.
        if (Left > Right)
            break;

        while  ((Right > 0) && (Edit_Array_Lazy[ei][Right].dist + Right < Edit_Match_Limit[ Edit_Array_Lazy[ei][Right].errs ]))
            Right--;

        if (Right <= 0)
            while  (Edit_Array_Lazy[ei][Right].dist < Edit_Match_Limit[ Edit_Array_Lazy[ei][Right].errs ])
                Right--;

        assert (Left <= Right);

        //  Track the best-scoring cell that survived the trimming.
        for (int32 d = Left;  d <= Right;  d++)
            if (Edit_Array_Lazy[ei][d].score > Best_score) {
                Best_d      = d;
                Best_e      = ei;
                Best_row    = Edit_Array_Lazy[ei][d].row;
                Best_score  = Edit_Array_Lazy[ei][d].score;
            }

        if (Best_score > Max_Score) {
            Max_Score_Best_d = Best_d;
            Max_Score_Best_e = Best_e;
            Max_Score        = Best_score;
            Max_Score_Len    = Best_row;
        }
    }  //  Over all possible number of errors

#ifdef DEBUG
    fprintf(stderr, "NDalgorithm::reverse()-- iterated over all errors, return best found\n");
#endif

    //  Neither sequence end was reached: report the best-scoring cell.
    A_End = - Max_Score_Len;
    T_End = - Max_Score_Len - Max_Score_Best_d;

    Set_Left_Delta(A, T, Max_Score_Best_e, Max_Score_Best_d, Leftover, T_End, Tlen);

    Match_To_End = false;

    return;  //return(Max_Score_Best_e);
}