コード例 #1
0
//  Prefix_Edit_Dist
//
//  Aligns a prefix of A[0 .. m-1] against a prefix of T[0 .. n-1], allowing
//  at most Error_Limit errors, and reports where the alignment stops.
//  DONT_KNOW_CHAR in either string is treated as matching anything.
//
//  WA->Edit_Array_Lazy[e][d] holds the furthest row (position in A) reached
//  on diagonal d (T position minus A position) using e errors.  Rows of that
//  table are allocated on demand through Allocate_More_Edit_Space().
//
//  Outputs:
//    *A_End, *T_End   one past the last aligned position in A and T
//    *Match_To_End    TRUE if the alignment reached the end of A or of T,
//                     FALSE if only the best-scoring prefix is reported
//
//  Returns the number of errors in the reported alignment.  Requires m <= n.
//  Set_Right_Delta() is called for every non-exact result; presumably it
//  records the alignment path in WA -- defined elsewhere, confirm there.
static
int
Prefix_Edit_Dist(char A[], int m, char T[], int n, int Error_Limit,
                 int * A_End, int * T_End, int * Match_To_End, Work_Area_t * WA) {
  //  Best score seen at the end of any completed error level, and the
  //  cell (length, diagonal, error count) that produced it.
  double  top_score = 0.0;
  int     top_len = 0, top_d = 0, top_e = 0;

  //  Longest row seen inside the trimmed band over all error levels so far.
  int     longest = 0, longest_d = 0, longest_e = 0;

  int     lo = 0, hi = 0;     //  current band of diagonals, inclusive
  int     row = 0;
  int     d, e;

  assert (m <= n);
  WA->Right_Delta_Len = 0;

  //  Zero-error stretch along the main diagonal.
  while  (row < m
          && (A[row] == T[row]
              || A[row] == DONT_KNOW_CHAR
              || T[row] == DONT_KNOW_CHAR)) {
    row++;
  }

  if (WA->Edit_Array_Lazy[0] == NULL) {
    Allocate_More_Edit_Space(WA);
  }

  WA->Edit_Array_Lazy[0][0] = row;

  if  (row == m) {
    //  All of A matched without error.
    (* T_End) = m;
    (* A_End) = m;
    (* Match_To_End) = TRUE;
    return  0;
  }

  for  (e = 1;  e <= Error_Limit;  e++) {
    if (WA->Edit_Array_Lazy[e] == NULL) {
      Allocate_More_Edit_Space(WA);
    }

    //  Widen the band by one diagonal on each side, and mark the cells of
    //  the previous level that lie on or just outside the new edges with -2
    //  so they cannot win the maximum taken below.
    lo = MAX (lo - 1, -e);
    hi = MIN (hi + 1, e);
    WA->Edit_Array_Lazy[e - 1][lo] = -2;
    WA->Edit_Array_Lazy[e - 1][lo - 1] = -2;
    WA->Edit_Array_Lazy[e - 1][hi] = -2;
    WA->Edit_Array_Lazy[e - 1][hi + 1] = -2;

    for  (d = lo;  d <= hi;  d++) {
      double  end_score;
      int     tail_len;

      //  Furthest row reachable with one more error, coming from the same
      //  diagonal or from either neighbouring diagonal of level e-1.
      int  from_same  = 1 + WA->Edit_Array_Lazy[e - 1][d];
      int  from_below =     WA->Edit_Array_Lazy[e - 1][d - 1];
      int  from_above = 1 + WA->Edit_Array_Lazy[e - 1][d + 1];

      row = from_same;
      if  (from_below > row) {
        row = from_below;
      }
      if  (from_above > row) {
        row = from_above;
      }

      //  Slide along the diagonal while the characters agree.
      while  (row < m && row + d < n
              && (A[row] == T[row + d]
                  || A[row] == DONT_KNOW_CHAR
                  || T[row + d] == DONT_KNOW_CHAR)) {
        row++;
      }

      WA->Edit_Array_Lazy[e][d] = row;

      //  Nothing more to do for this diagonal unless an end was reached.
      if  (row != m && row + d != n) {
        continue;
      }

      //  An end of A or T was reached.  First decide whether the tail of
      //  this alignment looks like a branch point (errors bunched up after
      //  the best-scoring prefix); if so, report that prefix instead.
      //  Assumes  Branch_Match_Value - Branch_Error_Value == 1.0
      end_score = row * Branch_Match_Value - e;
      tail_len  = row - top_len;

      if  ((Doing_Partial_Overlaps && end_score < top_score)
           ||  (e > MIN_BRANCH_END_DIST / 2
                && tail_len >= MIN_BRANCH_END_DIST
                && (top_score - end_score) / tail_len >= MIN_BRANCH_TAIL_SLOPE)) {
        goto  report_best;
      }

      //  Force the last error to be a mismatch rather than an insertion:
      //  when the end of A was reached via diagonal d+1, report on d+1.
      if  (row == m
           && 1 + WA->Edit_Array_Lazy[e - 1][d + 1] == WA->Edit_Array_Lazy[e][d]
           && d < hi) {
        d++;
        WA->Edit_Array_Lazy[e][d] = WA->Edit_Array_Lazy[e][d - 1];
      }

      (* A_End) = row;           //  one past the last aligned position
      (* T_End) = row + d;
      Set_Right_Delta (e, d, WA);
      (* Match_To_End) = TRUE;
      return  e;
    }

    //  Trim diagonals from the left edge whose progress falls below the
    //  limit for this error level.  Negative diagonals are judged by row
    //  alone, non-negative ones by row + diagonal.
    while  (lo <= hi && lo < 0
            && WA->Edit_Array_Lazy[e][lo] < WA->Edit_Match_Limit[e]) {
      lo++;
    }

    if  (lo >= 0) {
      while  (lo <= hi
              && WA->Edit_Array_Lazy[e][lo] + lo < WA->Edit_Match_Limit[e]) {
        lo++;
      }
    }

    //  The whole band was trimmed away; fall back to the best prefix.
    if  (lo > hi) {
      break;
    }

    //  Same trimming from the right edge.  No lower bound is needed here:
    //  the cell at lo already passed the corresponding test above.
    while  (hi > 0
            && WA->Edit_Array_Lazy[e][hi] + hi < WA->Edit_Match_Limit[e]) {
      hi--;
    }

    if  (hi <= 0) {
      while  (WA->Edit_Array_Lazy[e][hi] < WA->Edit_Match_Limit[e]) {
        hi--;
      }
    }

    assert (lo <= hi);

    //  Track the longest extension that survived trimming.
    for  (d = lo;  d <= hi;  d++) {
      if  (WA->Edit_Array_Lazy[e][d] > longest) {
        longest_d = d;
        longest_e = e;
        longest   = WA->Edit_Array_Lazy[e][d];
      }
    }

    //  Assumes  Branch_Match_Value - Branch_Error_Value == 1.0
    {
      double  level_score = longest * Branch_Match_Value - e;

      if  (level_score > top_score) {
        top_score = level_score;
        top_len   = longest;
        top_d     = longest_d;
        top_e     = longest_e;
      }
    }
  }

  //  Reached when the error budget is exhausted, the band is empty, or a
  //  branch point was detected: report the best-scoring prefix found.
 report_best:
  (* A_End) = top_len;
  (* T_End) = top_len + top_d;
  Set_Right_Delta (top_e, top_d, WA);
  (* Match_To_End) = FALSE;
  return  top_e;
}
コード例 #2
0
//  NDalgorithm::forward
//
//  Aligns a prefix of A[0 .. Alen-1] against a prefix of T[0 .. Tlen-1],
//  working one "error level" (ei) at a time over a band of diagonals
//  d = (T position) - (A position).  Each cell Edit_Array_Lazy[ei][d] records
//  the furthest row reached (.row), plus .dist, .errs, .score and the
//  diagonal it was reached from (.fromd).
//
//  Outputs:
//    A_End, T_End   one past the last aligned position in A and T
//    Match_To_End   true if the alignment reached the end of A or of T;
//                   false if only the best-scoring prefix is reported
//
//  Requires Alen <= Tlen.
//
//  NOTE(review): if allocateMoreEditSpace() fails inside the main loop the
//  function returns without assigning A_End, T_End or Match_To_End; callers
//  must not rely on them in that case.  The return value of the first
//  allocateMoreEditSpace() call (for level 0) is not checked.
void
NDalgorithm::forward(char    *A,   int32 Alen,
                     char    *T,   int32 Tlen,
                     int32   &A_End,
                     int32   &T_End,
                     bool    &Match_To_End) {

  assert (Alen <= Tlen);

  int32  Best_d      = 0;
  int32  Best_e      = 0;
  int32  Best_row    = 0;
  int32  Best_score  = 0;


  int32  Row = 0;
  int32  Dst = 0;
  int32  Err = 0;
  int32  Sco = 0;

  int32  fromd = 0;

  //  Skip ahead over matches.  The original used to also skip if either sequence was N.
  while ((Row < Alen) && (isMatch(A[Row], T[Row]))) {
    Sco += matchScore(A[Row], T[Row]);
    Row++;
  }

  if (Edit_Array_Lazy[0] == NULL)
    allocateMoreEditSpace();

  Edit_Array_Lazy[0][0].row    = Row;
  Edit_Array_Lazy[0][0].dist   = Dst;
  Edit_Array_Lazy[0][0].errs   = 0;
  Edit_Array_Lazy[0][0].score  = Sco;
  Edit_Array_Lazy[0][0].fromd  = INT32_MAX;

  // Exact match?

  if (Row == Alen) {
    A_End        = Alen;
    T_End        = Alen;
    Match_To_End = true;

    Right_Score       = Sco;
    Right_Delta_Len   = 0;

    return;
  }

  int32  Left  = 0;
  int32  Right = 0;

  int32  Max_Score         = PEDMINSCORE;
  int32  Max_Score_Len     = 0;
  int32  Max_Score_Best_d  = 0;
  int32  Max_Score_Best_e  = 0;

  for (int32 ei=1; ei <= Edit_Space_Max; ei++) {
    if (Edit_Array_Lazy[ei] == NULL)
      if (allocateMoreEditSpace() == false) {
        //  FAIL -- outputs are left unassigned (see NOTE in the header comment).
        return;
      }

    //  Widen the band by one diagonal on each side.
    Left  = MAX (Left  - 1, -ei);
    Right = MIN (Right + 1,  ei);

    //fprintf(stderr, "FORWARD ei=%d Left=%d Right=%d\n", ei, Left, Right);

    //  Reset the cells of the previous level on and just outside the new
    //  band edges, so the neighbour lookups below read initialized values.
    Edit_Array_Lazy[ei-1][Left  - 1].init();
    Edit_Array_Lazy[ei-1][Left     ].init();
    //  Of note, [0][0] on the first iteration is not reset here.
    Edit_Array_Lazy[ei-1][Right    ].init();
    Edit_Array_Lazy[ei-1][Right + 1].init();

    for (int32 d = Left;  d <= Right;  d++) {

      //  A mismatch.  This is the default choice; the two gap cases below
      //  replace it only if they score strictly higher.
      {
        int32  aPos         =  (1 + Edit_Array_Lazy[ei-1][d].row)     - 1;  //  -1 because we need to compare the base we are at,
        int32  tPos         =  (1 + Edit_Array_Lazy[ei-1][d].row) + d - 1;  //  not the base we will be at after the mismatch

        Row   = 1 + Edit_Array_Lazy[ei-1][d].row;
        Dst   =     Edit_Array_Lazy[ei-1][d].dist  + 1;
        Err   =     Edit_Array_Lazy[ei-1][d].errs  + 1;
        fromd =     d;

        //  If positive, we have a pointer into valid sequence.  If not, this mismatch
        //  doesn't make sense, and the row/score are set to bogus values.

        if ((aPos >= 0) && (tPos >= 0)) {
          assert (aPos <= Alen);
          assert( tPos <= Tlen);

          assert(A[aPos] != T[tPos]);

          Sco = Edit_Array_Lazy[ei-1][d].score + mismatchScore(A[aPos], T[tPos]);

        } else {
          Sco = PEDMINSCORE;
        }
      }

      //  Insert a gap in A.  Check the other sequence to see if this is a zero-cost gap.  Note
      //  agreement with future value of Row and what is used in isMatch() below.

      {
        int32  tPos    = 0 + Edit_Array_Lazy[ei-1][d-1].row + d;

        //assert(tPos >= 0);
        //assert(tPos < Tlen);

        if ((tPos >= 0) && (tPos <= Tlen)) {
          int32  gapCost = isFreeGap( T[tPos] ) ? PEDFREEGAP : PEDGAP;

          //if (gapCost == 0)
          //  fprintf(stderr, "NDalgorithm::forward()--  free A gap for aPos=%d tPos=%d t=%c/%d\n", tPos - d, tPos, T[tPos], T[tPos]);

          if (Edit_Array_Lazy[ei-1][d-1].score + gapCost > Sco) {
            //  The conditional must be parenthesized: '+' binds tighter than
            //  '?:', so without the parentheses the accumulated dist/errs
            //  became the *condition* and the result was always the bare arm.
            //
            //  NOTE(review): both arms are 0 here, exactly as in the original
            //  text, so a gap in A leaves dist and errs unchanged.  Confirm
            //  whether a non-free gap was meant to add 1, as the T-gap case does.
            Row   =     Edit_Array_Lazy[ei-1][d-1].row;
            Dst   =     Edit_Array_Lazy[ei-1][d-1].dist  + ((gapCost == PEDFREEGAP) ? 0 : 0);
            Err   =     Edit_Array_Lazy[ei-1][d-1].errs  + ((gapCost == PEDFREEGAP) ? 0 : 0);
            Sco   =     Edit_Array_Lazy[ei-1][d-1].score +  gapCost;
            fromd =     d-1;
          }
        }
      }

      //  Insert a gap in T.
      //  Testcase test-st-ts shows this works.

      {
        int32  aPos    = 1 + Edit_Array_Lazy[ei-1][d+1].row;

        //assert(aPos >= 0);
        //assert(aPos < Alen);

        if ((aPos >= 0) && (aPos <= Alen)) {
          //  NOTE(review): the free cost is the literal 0 here but PEDFREEGAP
          //  in the A-gap case and in the comparisons below; these agree only
          //  if PEDFREEGAP is 0 -- confirm against its definition.
          int32  gapCost = isFreeGap( A[aPos] ) ? 0 : PEDGAP;

          //if (gapCost == 0)
          //  fprintf(stderr, "NDalgorithm::forward()--  free T gap for aPos=%d tPos=%d a=%c/%d\n", aPos, aPos + d, A[aPos], A[aPos]);

          if (Edit_Array_Lazy[ei-1][d+1].score + gapCost > Sco) {
            //  Parenthesized for the same precedence reason as above: a
            //  non-free gap adds one to the accumulated dist and errs.
            Row   = 1 + Edit_Array_Lazy[ei-1][d+1].row;
            Dst   =     Edit_Array_Lazy[ei-1][d+1].dist  + ((gapCost == PEDFREEGAP) ? 0 : 1);
            Err   =     Edit_Array_Lazy[ei-1][d+1].errs  + ((gapCost == PEDFREEGAP) ? 0 : 1);
            Sco   =     Edit_Array_Lazy[ei-1][d+1].score +  gapCost;
            fromd =     d+1;
          }
        }
      }

      //  Extend along the diagonal for as long as isMatch() accepts the pair.
      //  Per the original notes (isMatch() is defined elsewhere):
      //    If A or T is N, that isn't a mismatch.
      //    If A is lowercase and T is uppercase, it's a match.
      //    If A is lowercase and T doesn't match, ignore the cost of the gap in T.

      while ((Row < Alen) && (Row + d < Tlen) && (isMatch(A[Row], T[Row + d]))) {
        Sco += matchScore(A[Row], T[Row + d]);
        Row += 1;
        Dst += 1;
        Err += 0;
      }

      Edit_Array_Lazy[ei][d].row   = Row;
      Edit_Array_Lazy[ei][d].dist  = Dst;
      Edit_Array_Lazy[ei][d].errs  = Err;
      Edit_Array_Lazy[ei][d].score = Sco;
      Edit_Array_Lazy[ei][d].fromd = fromd;

      //fprintf(stderr, "SET ei=%d d=%d -- row=%d dist=%d errs=%d score=%d fromd=%d\n", ei, d, Row, Dst, Err, Sco, fromd);

      //  Reached the end of either sequence: report this alignment.
      if (Row == Alen || Row + d == Tlen) {
        A_End = Row;           // One past last align position
        T_End = Row + d;

        Set_Right_Delta(ei, d);

        Match_To_End = true;

        return;  //return(ei);
      }
    }  //  Over all diagonals.

    //  Reset the band: drop diagonals at either edge whose .dist (plus the
    //  diagonal, for non-negative diagonals) is below the limit for the
    //  number of errors recorded in that cell.
    //
    //  The .dist used to be .row.

    while  ((Left <= Right) && (Left < 0) && (Edit_Array_Lazy[ei][Left].dist < Edit_Match_Limit[ Edit_Array_Lazy[ei][Left].errs ]))
      Left++;

    if (Left >= 0)
      while  ((Left <= Right) && (Edit_Array_Lazy[ei][Left].dist + Left < Edit_Match_Limit[ Edit_Array_Lazy[ei][Left].errs ]))
        Left++;

    //  Nothing left in the band; fall back to the best prefix found so far.
    if (Left > Right)
      break;

    while  ((Right > 0) && (Edit_Array_Lazy[ei][Right].dist + Right < Edit_Match_Limit[ Edit_Array_Lazy[ei][Right].errs ]))
      Right--;

    if (Right <= 0)
      while  (Edit_Array_Lazy[ei][Right].dist < Edit_Match_Limit[ Edit_Array_Lazy[ei][Right].errs ])
        Right--;

    assert (Left <= Right);

    //  Remember the best-scoring surviving cell over all levels so far.
    for (int32 d = Left;  d <= Right;  d++)
      if (Edit_Array_Lazy[ei][d].score > Best_score) {
        Best_d      = d;
        Best_e      = ei;
        Best_row    = Edit_Array_Lazy[ei][d].row;
        Best_score  = Edit_Array_Lazy[ei][d].score;
      }

    if (Best_score > Max_Score) {
      Max_Score_Best_d = Best_d;
      Max_Score_Best_e = Best_e;
      Max_Score        = Best_score;
      Max_Score_Len    = Best_row;
    }
  }  //  Over all possible number of errors

  //fprintf(stderr, "NDalgorithm::forward()- iterated over all errors, return best found\n");

  //  Neither sequence end was reached: report the best-scoring prefix.
  A_End = Max_Score_Len;
  T_End = Max_Score_Len + Max_Score_Best_d;

  Set_Right_Delta(Max_Score_Best_e, Max_Score_Best_d);

  Match_To_End = false;

  return;  //return(Max_Score_Best_e);
}