// Aligns the query sequence (s2) to the reference sequence (s1) using the
// Smith-Waterman-Gotoh local alignment algorithm and reports the result as a
// reference offset plus a CIGAR string.
//
// Parameters:
//   referenceAl - output; set to the 0-based index in s1 at which the traceback
//                 stopped (i.e. the first aligned reference base), then shifted
//                 by the offset returned from stablyLeftAlign when the entropy
//                 or repeat penalties are enabled
//   cigarAl     - output; receives the CIGAR string (M/I/D operations, with S
//                 soft clips for the unaligned query prefix/suffix)
//   s1          - the reference sequence
//   s2          - the query sequence
//
// The scoring matrix is indexed with (base - 'A'), so the sequences are assumed
// to consist of upper-case letters -- TODO confirm against mScoringMatrix's size.
//
// The function prints an error to stdout and calls exit(1) when a sequence is
// empty, when the dynamic programming matrix would be too large to index, when
// memory cannot be allocated, or when the traceback yields gapped sequences of
// different lengths.
void CSmithWatermanGotoh::Align(unsigned int& referenceAl, string& cigarAl, const string& s1, const string& s2) {

  if((s1.length() == 0) || (s2.length() == 0)) {
    cout << "ERROR: Found a read with a zero length." << endl;
    exit(1);
  }

  // the matrices carry one extra row and column for the empty-prefix boundary
  unsigned int referenceLen      = s1.length() + 1;
  unsigned int queryLen          = s2.length() + 1;
  unsigned int sequenceSumLength = s1.length() + s2.length();

  // The matrices are addressed with 32-bit indices (signed ones during the
  // traceback), so refuse inputs whose cell count would wrap around instead of
  // silently allocating an undersized matrix and writing past its end.
  const unsigned int maxMatrixCells = static_cast<unsigned int>(-1) / 2;
  if(queryLen > maxMatrixCells / referenceLen) {
    cout << "ERROR: The sequences are too long for the Smith-Waterman algorithm." << endl;
    exit(1);
  }

  // reinitialize our matrices (they only ever grow and are reused between calls)

  if((referenceLen * queryLen) > mCurrentMatrixSize) {

    // calculate the new matrix size
    mCurrentMatrixSize = referenceLen * queryLen;

    // delete the old arrays
    if(mPointers)              delete [] mPointers;
    if(mSizesOfVerticalGaps)   delete [] mSizesOfVerticalGaps;
    if(mSizesOfHorizontalGaps) delete [] mSizesOfHorizontalGaps;

    try {

      // initialize the arrays
      mPointers              = new char[mCurrentMatrixSize];
      mSizesOfVerticalGaps   = new short[mCurrentMatrixSize];
      mSizesOfHorizontalGaps = new short[mCurrentMatrixSize];

    } catch(const bad_alloc&) {
      cout << "ERROR: Unable to allocate enough memory for the Smith-Waterman algorithm." << endl;
      exit(1);
    }
  }

  // initialize the traceback matrix to STOP: only the first row and the first
  // column are cleared here, every other cell is written by the fill loop below
  memset((char*)mPointers, 0, SIZEOF_CHAR * queryLen);
  for(unsigned int i = 1; i < referenceLen; i++) mPointers[i * queryLen] = 0;

  // initialize the gap matrices to 1
  uninitialized_fill(mSizesOfVerticalGaps, mSizesOfVerticalGaps + mCurrentMatrixSize, 1);
  uninitialized_fill(mSizesOfHorizontalGaps, mSizesOfHorizontalGaps + mCurrentMatrixSize, 1);


  // initialize our repeat counts if they are needed
  vector<map<string, int> > referenceRepeats;
  vector<map<string, int> > queryRepeats;

  if (mUseRepeatGapExtensionPenalty) {
    for (unsigned int i = 0; i < queryLen; ++i)
      queryRepeats.push_back(repeatCounts(i, s2, repeat_size_max));
    for (unsigned int i = 0; i < referenceLen; ++i)
      referenceRepeats.push_back(repeatCounts(i, s1, repeat_size_max));

    // keep only the biggest repeat (the entry with the longest repeat unit)
    vector<map<string, int> >::iterator q = queryRepeats.begin();
    for (; q != queryRepeats.end(); ++q) {
      map<string, int>::iterator biggest = q->begin();
      map<string, int>::iterator z = q->begin();
      for (; z != q->end(); ++z)
        if (z->first.size() > biggest->first.size()) biggest = z;
      z = q->begin();
      while (z != q->end()) {
        // post-increment keeps z valid while the element it pointed to is erased
        if (z != biggest)
          q->erase(z++);
        else ++z;
      }
    }

    q = referenceRepeats.begin();
    for (; q != referenceRepeats.end(); ++q) {
      map<string, int>::iterator biggest = q->begin();
      map<string, int>::iterator z = q->begin();
      for (; z != q->end(); ++z)
        if (z->first.size() > biggest->first.size()) biggest = z;
      z = q->begin();
      while (z != q->end()) {
        if (z != biggest)
          q->erase(z++);
        else ++z;
      }
    }

    // remove repeat information from ends of queries
    // this results in the addition of spurious flanking deletions in repeats
    // NOTE(review): at() throws std::out_of_range if the computed repeat run is
    // longer than the number of preceding entries; nothing here catches that.
    map<string, int>& qrend = queryRepeats.at(queryRepeats.size() - 2);
    if (!qrend.empty()) {
      int queryEndRepeatBases = qrend.begin()->first.size() * qrend.begin()->second;
      for (int i = 0; i < queryEndRepeatBases; ++i)
        queryRepeats.at(queryRepeats.size() - 2 - i).clear();
    }

    map<string, int>& qrbegin = queryRepeats.front();
    if (!qrbegin.empty()) {
      int queryBeginRepeatBases = qrbegin.begin()->first.size() * qrbegin.begin()->second;
      for (int i = 0; i < queryBeginRepeatBases; ++i)
        queryRepeats.at(i).clear();
    }

  }

  // per-position entropies, computed over a window centred on the position
  // where possible and shifted inwards at the sequence ends
  int entropyWindowSize = 8;
  vector<float> referenceEntropies;
  vector<float> queryEntropies;
  if (mUseEntropyGapOpenPenalty) {
    // shrink the window for sequences shorter than it, so that shannon_H is
    // never asked to read past the end of the sequence
    const int queryWindow     = min(entropyWindowSize, (int) s2.length());
    const int referenceWindow = min(entropyWindowSize, (int) s1.length());
    for (unsigned int i = 0; i < queryLen; ++i)
      queryEntropies.push_back(
        shannon_H((char*) &s2[max(0, min((int) i - queryWindow / 2, (int) queryLen - queryWindow - 1))],
              queryWindow));
    for (unsigned int i = 0; i < referenceLen; ++i)
      referenceEntropies.push_back(
        shannon_H((char*) &s1[max(0, min((int) i - referenceWindow / 2, (int) referenceLen - referenceWindow - 1))],
              referenceWindow));
  }

  // normalize entropies (disabled)
  /*
  float qsum = 0;
  float qnorm = 0;
  float qmax = 0;
  for (vector<float>::iterator q = queryEntropies.begin(); q != queryEntropies.end(); ++q) {
    qsum += *q;
    if (*q > qmax) qmax = *q;
  }
  qnorm = qsum / queryEntropies.size();
  for (vector<float>::iterator q = queryEntropies.begin(); q != queryEntropies.end(); ++q)
    *q = *q / qsum + qmax;

  float rsum = 0;
  float rnorm = 0;
  float rmax = 0;
  for (vector<float>::iterator r = referenceEntropies.begin(); r != referenceEntropies.end(); ++r) {
    rsum += *r;
    if (*r > rmax) rmax = *r;
  }
  rnorm = rsum / referenceEntropies.size();
  for (vector<float>::iterator r = referenceEntropies.begin(); r != referenceEntropies.end(); ++r)
    *r = *r / rsum + rmax;
  */

  //
  // construct
  //

  // reinitialize our query-dependent arrays
  if(s2.length() > mCurrentQuerySize) {

    // calculate the new query array size
    mCurrentQuerySize = s2.length();

    // delete the old arrays
    if(mQueryGapScores) delete [] mQueryGapScores;
    if(mBestScores)     delete [] mBestScores;

    // initialize the arrays
    try {

      mQueryGapScores = new float[mCurrentQuerySize + 1];
      mBestScores     = new float[mCurrentQuerySize + 1];

    } catch(const bad_alloc&) {
      cout << "ERROR: Unable to allocate enough memory for the Smith-Waterman algorithm." << endl;
      exit(1);
    }
  }

  // reinitialize our reference+query-dependent arrays
  if(sequenceSumLength > mCurrentAQSumSize) {

    // calculate the new reference array size
    mCurrentAQSumSize = sequenceSumLength;

    // delete the old arrays
    if(mReversedAnchor) delete [] mReversedAnchor;
    if(mReversedQuery)  delete [] mReversedQuery;

    // initialize the arrays
    try {

      mReversedAnchor = new char[mCurrentAQSumSize + 1];  // reversed sequence #1
      mReversedQuery  = new char[mCurrentAQSumSize + 1];  // reversed sequence #2

    } catch(const bad_alloc&) {
      cout << "ERROR: Unable to allocate enough memory for the Smith-Waterman algorithm." << endl;
      exit(1);
    }
  }

  // initialize the gap score and score vectors
  uninitialized_fill(mQueryGapScores, mQueryGapScores + queryLen, FLOAT_NEGATIVE_INFINITY);
  memset((char*)mBestScores, 0, SIZEOF_FLOAT * queryLen);

  float similarityScore, totalSimilarityScore, bestScoreDiagonal;
  float queryGapExtendScore, queryGapOpenScore;
  float referenceGapExtendScore, referenceGapOpenScore, currentAnchorGapScore;

  unsigned int BestColumn = 0;
  unsigned int BestRow    = 0;
  BestScore               = FLOAT_NEGATIVE_INFINITY;

  // i walks the reference (rows), j walks the query (columns);
  // k is the flat index of the first cell of row i, l the flat index of cell (i, j)
  for(unsigned int i = 1, k = queryLen; i < referenceLen; i++, k += queryLen) {

    currentAnchorGapScore = FLOAT_NEGATIVE_INFINITY;
    bestScoreDiagonal = mBestScores[0];

    for(unsigned int j = 1, l = k + 1; j < queryLen; j++, l++) {

      // calculate our similarity score
      similarityScore = mScoringMatrix[s1[i - 1] - 'A'][s2[j - 1] - 'A'];

      // fill the matrices
      totalSimilarityScore = bestScoreDiagonal + similarityScore;
      
      //cerr << "i: " << i << ", j: " << j << ", totalSimilarityScore: " << totalSimilarityScore << endl;

      // mBestScores[j] and mQueryGapScores[j] still hold the previous row's values here
      queryGapExtendScore = mQueryGapScores[j] - mGapExtendPenalty;
      queryGapOpenScore   = mBestScores[j] - mGapOpenPenalty;
      
      // compute the homo-polymer gap score if enabled
      if(mUseHomoPolymerGapOpenPenalty)
        if((j > 1) && (s2[j - 1] == s2[j - 2]))
          queryGapOpenScore = mBestScores[j] - mHomoPolymerGapOpenPenalty;
      
      // compute the entropy gap score if enabled
      // (this overrides the homo-polymer open score computed above)
      if (mUseEntropyGapOpenPenalty) {
        queryGapOpenScore = 
          mBestScores[j] - mGapOpenPenalty 
          * max(queryEntropies.at(j), referenceEntropies.at(i))
          * mEntropyGapOpenPenalty;
      }

      // length the vertical gap would have if the gap from the cell above is extended
      int gaplen = mSizesOfVerticalGaps[l - queryLen] + 1;

      if (mUseRepeatGapExtensionPenalty) {
        map<string, int>& repeats = queryRepeats[j];
        // does the sequence which would be inserted or deleted in this gap match the repeat structure which it is embedded in?
        if (!repeats.empty()) {

          const pair<string, int>& repeat = *repeats.begin();
          int repeatsize = repeat.first.size();
          if (gaplen != repeatsize && gaplen % repeatsize != 0) {
            gaplen = gaplen / repeatsize + repeatsize;
          }

          if ((repeat.first.size() * repeat.second) > 3 && gaplen + i < s1.length()) {
            string gapseq = string(&s1[i], gaplen);
            if (gapseq == repeat.first || isRepeatUnit(gapseq, repeat.first)) {
              queryGapExtendScore = mQueryGapScores[j]
                + mRepeatGapExtensionPenalty / (float) gaplen;
                //    mMaxRepeatGapExtensionPenalty)
            } else {
              queryGapExtendScore = mQueryGapScores[j] - mGapExtendPenalty;
            }
          }
        } else {
          queryGapExtendScore = mQueryGapScores[j] - mGapExtendPenalty;
        }
      }
          
      if(queryGapExtendScore > queryGapOpenScore) {
        mQueryGapScores[j] = queryGapExtendScore;
        mSizesOfVerticalGaps[l] = gaplen;
      } else mQueryGapScores[j] = queryGapOpenScore;
      
      // mBestScores[j - 1] already holds the current row's value for the cell to the left
      referenceGapExtendScore = currentAnchorGapScore - mGapExtendPenalty;
      referenceGapOpenScore   = mBestScores[j - 1] - mGapOpenPenalty;
          
      // compute the homo-polymer gap score if enabled
      if(mUseHomoPolymerGapOpenPenalty)
        if((i > 1) && (s1[i - 1] == s1[i - 2]))
          referenceGapOpenScore = mBestScores[j - 1] - mHomoPolymerGapOpenPenalty;
          
      // compute the entropy gap score if enabled
      if (mUseEntropyGapOpenPenalty) {
        referenceGapOpenScore = 
          mBestScores[j - 1] - mGapOpenPenalty 
          * max(queryEntropies.at(j), referenceEntropies.at(i))
          * mEntropyGapOpenPenalty;
      }

      // length the horizontal gap would have if the gap from the cell to the left is extended
      gaplen = mSizesOfHorizontalGaps[l - 1] + 1;

      if (mUseRepeatGapExtensionPenalty) {
        map<string, int>& repeats = referenceRepeats[i];
        // does the sequence which would be inserted or deleted in this gap match the repeat structure which it is embedded in?
        if (!repeats.empty()) {

          const pair<string, int>& repeat = *repeats.begin();
          int repeatsize = repeat.first.size();
          if (gaplen != repeatsize && gaplen % repeatsize != 0) {
            gaplen = gaplen / repeatsize + repeatsize;
          }

          if ((repeat.first.size() * repeat.second) > 3 && gaplen + j < s2.length()) {
            string gapseq = string(&s2[j], gaplen);
            if (gapseq == repeat.first || isRepeatUnit(gapseq, repeat.first)) {
              referenceGapExtendScore = currentAnchorGapScore
                + mRepeatGapExtensionPenalty / (float) gaplen;
                //mMaxRepeatGapExtensionPenalty)
            } else {
              referenceGapExtendScore = currentAnchorGapScore - mGapExtendPenalty;
            }
          }
        } else {
          referenceGapExtendScore = currentAnchorGapScore - mGapExtendPenalty;
        }
      }

      if(referenceGapExtendScore > referenceGapOpenScore) {
        currentAnchorGapScore = referenceGapExtendScore;
        mSizesOfHorizontalGaps[l] = gaplen;
      } else currentAnchorGapScore = referenceGapOpenScore;
          
      // remember the previous row's score before overwriting it: it is the
      // diagonal predecessor of the next column
      bestScoreDiagonal = mBestScores[j];
      mBestScores[j] = MaxFloats(totalSimilarityScore, mQueryGapScores[j], currentAnchorGapScore);
          
          
      // determine the traceback direction
      // diagonal (445364713) > stop (238960195) > up (214378647) > left (166504495)
      if(mBestScores[j] == 0)                         mPointers[l] = Directions_STOP;
      else if(mBestScores[j] == totalSimilarityScore) mPointers[l] = Directions_DIAGONAL;
      else if(mBestScores[j] == mQueryGapScores[j])   mPointers[l] = Directions_UP;
      else                                            mPointers[l] = Directions_LEFT;
          
      // set the traceback start at the current cell i, j and score
      if(mBestScores[j] > BestScore) {
        BestRow    = i;
        BestColumn = j;
        BestScore  = mBestScores[j];
      }
    }
  }

  //
  // traceback
  //

  // aligned sequences
  int gappedAnchorLen  = 0;   // length of sequence #1 after alignment
  int gappedQueryLen   = 0;   // length of sequence #2 after alignment
  int numMismatches    = 0;   // the mismatched nucleotide count

  char c1, c2;

  // (ci, cj) is the current cell, ck the flat index of the start of row ci
  int ci = BestRow;
  int cj = BestColumn;
  int ck = ci * queryLen;

  // traceback flag
  bool keepProcessing = true;

  while(keepProcessing) {
    //cerr << ci << " " << cj << " " << ck << "  ... " << gappedAnchorLen << " " << gappedQueryLen <<  endl;

    // diagonal (445364713) > stop (238960195) > up (214378647) > left (166504495)
    switch(mPointers[ck + cj]) {

    case Directions_DIAGONAL:
      c1 = s1[--ci];
      c2 = s2[--cj];
      ck -= queryLen;

      mReversedAnchor[gappedAnchorLen++] = c1;
      mReversedQuery[gappedQueryLen++]   = c2;

      // increment our mismatch counter
      if(mScoringMatrix[c1 - 'A'][c2 - 'A'] == mMismatchScore) numMismatches++;   
      break;

    case Directions_STOP:
      keepProcessing = false;
      break;

    case Directions_UP:
      // gap in the query: consume reference bases only
      for(unsigned int l = 0, len = mSizesOfVerticalGaps[ck + cj]; l < len; l++) {
        // stop instead of walking above the first row
        if (ci <= 0) {
          keepProcessing = false;
          break;
        }
        mReversedAnchor[gappedAnchorLen++] = s1[--ci];
        mReversedQuery[gappedQueryLen++]   = GAP;
        ck -= queryLen;
        numMismatches++;
      }
      break;

    case Directions_LEFT:
      // gap in the reference: consume query bases only
      for(unsigned int l = 0, len = mSizesOfHorizontalGaps[ck + cj]; l < len; l++) {
        // stop instead of walking left of the first column
        if (cj <= 0) {
          keepProcessing = false;
          break;
        }
        mReversedAnchor[gappedAnchorLen++] = GAP;
        mReversedQuery[gappedQueryLen++]   = s2[--cj];
        numMismatches++;
      }
      break;
    }
  }

  // define the reference and query sequences
  mReversedAnchor[gappedAnchorLen] = 0;
  mReversedQuery[gappedQueryLen]   = 0;

  // catch sequences with different lengths
  if(gappedAnchorLen != gappedQueryLen) {
    cout << "ERROR: The aligned sequences have different lengths after Smith-Waterman-Gotoh algorithm." << endl;
    exit(1);
  }

  // reverse the strings and assign them to our alignment structure
  reverse(mReversedAnchor, mReversedAnchor + gappedAnchorLen);
  reverse(mReversedQuery,  mReversedQuery  + gappedQueryLen);

  //alignment.Reference = mReversedAnchor;
  //alignment.Query     = mReversedQuery;

  // set the reference endpoints
  //alignment.ReferenceBegin = ci;
  //alignment.ReferenceEnd   = BestRow - 1;
  referenceAl = ci;

  // set the query endpoints
  /*  
    if(alignment.IsReverseComplement) {
    alignment.QueryBegin = s2Length - BestColumn;
    alignment.QueryEnd   = s2Length - cj - 1;
    // alignment.QueryLength= alignment.QueryBegin - alignment.QueryEnd + 1;
    } else {
    alignment.QueryBegin = cj;
    alignment.QueryEnd   = BestColumn - 1;
    // alignment.QueryLength= alignment.QueryEnd - alignment.QueryBegin + 1;
    }
  */

  // set the query length and number of mismatches
  //alignment.QueryLength = alignment.QueryEnd - alignment.QueryBegin + 1;
  //alignment.NumMismatches  = numMismatches;

  //
  // build the CIGAR string from the gapped sequences
  //

  unsigned int alLength = strlen(mReversedAnchor);
  unsigned int m = 0, d = 0, i = 0;   // lengths of the current match / deletion / insertion run
  bool dashRegion = false;            // true while inside a run of gap columns
  ostringstream oCigar (ostringstream::out);
  int insertedBases = 0;              // total number of bases emitted as insertions

  // query bases before the alignment start are soft clipped
  if ( cj != 0 ) {
    if ( cj > 0 ) {
      oCigar << cj << 'S';
    } else { // how do we get negative cj's?
      referenceAl -= cj;
      alLength += cj;
    }
  }
    
  for ( unsigned int j = 0; j < alLength; j++ ) {
    // m
    if ( ( mReversedAnchor[j] != GAP ) && ( mReversedQuery[j] != GAP ) ) {
      // a gap run just ended: flush it
      if ( dashRegion ) {
        if ( d != 0 ) oCigar << d << 'D';
        else          { oCigar << i << 'I'; insertedBases += i; }
      }
      dashRegion = false;
      m++;
      d = 0;
      i = 0;
    }
    else {
      // a match run just ended: flush it
      if ( !dashRegion && m )
        oCigar << m << 'M';
      dashRegion = true;
      m = 0;
      if ( mReversedAnchor[j] == GAP ) {
        // gap in the reference = insertion; flush a directly preceding deletion
        if ( d != 0 ) oCigar << d << 'D';
        i++;
        d = 0;
      }
      else {
        // gap in the query = deletion; flush a directly preceding insertion
        if ( i != 0) { oCigar << i << 'I'; insertedBases += i; }
        d++;
        i = 0;
      }
    }
  }
  // flush whichever run was still open at the end of the alignment
  if      ( m != 0 ) oCigar << m << 'M';
  else if ( d != 0 ) oCigar << d << 'D';
  // a trailing insertion must be counted too, otherwise the reference span
  // handed to stablyLeftAlign below is too long by that many bases
  else if ( i != 0 ) { oCigar << i << 'I'; insertedBases += i; }

  // query bases after the alignment end are soft clipped
  if ( BestColumn != s2.length() )
    oCigar << s2.length() - BestColumn << 'S';

  cigarAl = oCigar.str();

  // fix the gap order
  CorrectHomopolymerGapOrder(alLength, numMismatches);

  if (mUseEntropyGapOpenPenalty || mUseRepeatGapExtensionPenalty) {
    int offset = 0;
    string oldCigar;
    try {
      oldCigar = cigarAl;
      // alLength - insertedBases is the number of reference bases the alignment covers
      stablyLeftAlign(s2, cigarAl, s1.substr(referenceAl, alLength - insertedBases), offset);
    } catch (...) {
      cerr << "an exception occurred when left-aligning " << s1 << " " << s2 << endl;
      cigarAl = oldCigar; // undo the failed left-realignment attempt
      offset = 0;
    }
    referenceAl += offset;
  }

}
// Example no. 2
// 0
// Aligns the query sequence (s2) to the reference sequence (s1) using the
// Smith-Waterman-Gotoh local alignment algorithm and stores the result in the
// supplied alignment structure.
//
// Parameters:
//   alignment - in/out; IsReverseStrand is read to decide how the query
//               endpoints are reported. SwScore, Reference, Query,
//               ReferenceBegin, ReferenceEnd, QueryBegin, QueryEnd,
//               QueryLength, NumMismatches and NumLongestMatchs are written.
//   s1        - the reference sequence (s1Length characters are used)
//   s1Length  - number of reference bases
//   s2        - the query sequence (s2Length characters are used)
//   s2Length  - number of query bases
//
// The scoring matrix is indexed with (base - 'A'), so the sequences are assumed
// to consist of upper-case letters -- TODO confirm against mScoringMatrix's size.
//
// The function prints an error to stdout and calls exit(1) when a sequence is
// empty, when the dynamic programming matrix would be too large to index, when
// memory cannot be allocated, or when the traceback yields gapped sequences of
// different lengths.
void CSmithWatermanGotoh::Align(Alignment& alignment, const char* s1, const unsigned int s1Length, const char* s2, const unsigned int s2Length) {

	if((s1Length == 0) || (s2Length == 0)) {
		cout << "ERROR: Found a read with a zero length." << endl;
		exit(1);
	}

	// the matrices carry one extra row and column for the empty-prefix boundary
	unsigned int referenceLen      = s1Length + 1;
	unsigned int queryLen          = s2Length + 1;
	unsigned int sequenceSumLength = s1Length + s2Length;

	// The matrices are addressed with 32-bit indices (signed ones during the
	// traceback), so refuse inputs whose cell count would wrap around instead of
	// silently allocating an undersized matrix and writing past its end.
	const unsigned int maxMatrixCells = static_cast<unsigned int>(-1) / 2;
	if(queryLen > maxMatrixCells / referenceLen) {
		cout << "ERROR: The sequences are too long for the Smith-Waterman algorithm." << endl;
		exit(1);
	}

	// reinitialize our matrices (they only ever grow and are reused between calls)

	if((referenceLen * queryLen) > mCurrentMatrixSize) {

		// calculate the new matrix size
		mCurrentMatrixSize = referenceLen * queryLen;

		// delete the old arrays
		if(mPointers)              delete [] mPointers;
		if(mSizesOfVerticalGaps)   delete [] mSizesOfVerticalGaps;
		if(mSizesOfHorizontalGaps) delete [] mSizesOfHorizontalGaps;

		try {

			// initialize the arrays
			mPointers              = new char[mCurrentMatrixSize];
			mSizesOfVerticalGaps   = new short[mCurrentMatrixSize];
			mSizesOfHorizontalGaps = new short[mCurrentMatrixSize];

		} catch(const bad_alloc&) {
			cout << "ERROR: Unable to allocate enough memory for the Smith-Waterman algorithm." << endl;
			exit(1);
		}
	}

	// initialize the traceback matrix to STOP: only the first row and the first
	// column are cleared here, every other cell is written by the fill loop below
	memset((char*)mPointers, 0, SIZEOF_CHAR * queryLen);
	for(unsigned int i = 1; i < referenceLen; i++) mPointers[i * queryLen] = 0;

	// initialize the gap matrices to 1
	uninitialized_fill(mSizesOfVerticalGaps, mSizesOfVerticalGaps + mCurrentMatrixSize, 1);
	uninitialized_fill(mSizesOfHorizontalGaps, mSizesOfHorizontalGaps + mCurrentMatrixSize, 1);

	//
	// construct
	//

	// reinitialize our query-dependent arrays
	if(s2Length > mCurrentQuerySize) {

		// calculate the new query array size
		mCurrentQuerySize = s2Length;

		// delete the old arrays
		if(mQueryGapScores) delete [] mQueryGapScores;
		if(mBestScores)     delete [] mBestScores;

		// initialize the arrays
		try {

			mQueryGapScores = new float[mCurrentQuerySize + 1];
			mBestScores     = new float[mCurrentQuerySize + 1];

		} catch(const bad_alloc&) {
			cout << "ERROR: Unable to allocate enough memory for the Smith-Waterman algorithm." << endl;
			exit(1);
		}
	}

	// reinitialize our reference+query-dependent arrays
	if(sequenceSumLength > mCurrentAQSumSize) {

		// calculate the new reference array size
		mCurrentAQSumSize = sequenceSumLength;

		// delete the old arrays
		if(mReversedAnchor) delete [] mReversedAnchor;
		if(mReversedQuery)  delete [] mReversedQuery;

		// initialize the arrays
		try {

			mReversedAnchor = new char[mCurrentAQSumSize + 1];	// reversed sequence #1
			mReversedQuery  = new char[mCurrentAQSumSize + 1];	// reversed sequence #2

		} catch(const bad_alloc&) {
			cout << "ERROR: Unable to allocate enough memory for the Smith-Waterman algorithm." << endl;
			exit(1);
		}
	}

	// initialize the gap score and score vectors
	uninitialized_fill(mQueryGapScores, mQueryGapScores + queryLen, FLOAT_NEGATIVE_INFINITY);
	memset((char*)mBestScores, 0, SIZEOF_FLOAT * queryLen);

	float similarityScore, totalSimilarityScore, bestScoreDiagonal;
	float queryGapExtendScore, queryGapOpenScore;
	float referenceGapExtendScore, referenceGapOpenScore, currentAnchorGapScore;

	unsigned int BestColumn = 0;
	unsigned int BestRow    = 0;
	float BestScore         = FLOAT_NEGATIVE_INFINITY;

	// i walks the reference (rows), j walks the query (columns);
	// k is the flat index of the first cell of row i, l the flat index of cell (i, j)
	for(unsigned int i = 1, k = queryLen; i < referenceLen; i++, k += queryLen) {

		currentAnchorGapScore = FLOAT_NEGATIVE_INFINITY;
		bestScoreDiagonal = mBestScores[0];

		for(unsigned int j = 1, l = k + 1; j < queryLen; j++, l++) {

			// calculate our similarity score
			similarityScore = mScoringMatrix[s1[i - 1] - 'A'][s2[j - 1] - 'A'];

			// fill the matrices
			totalSimilarityScore = bestScoreDiagonal + similarityScore;

			//cout << "i: " << i << ", j: " << j << ", totalSimilarityScore: " << totalSimilarityScore << endl;

			// mBestScores[j] and mQueryGapScores[j] still hold the previous row's values here
			queryGapExtendScore = mQueryGapScores[j] - mGapExtendPenalty;
			queryGapOpenScore   = mBestScores[j] - mGapOpenPenalty;

			// compute the homo-polymer gap score if enabled
			if(mUseHomoPolymerGapOpenPenalty)
				if((j > 1) && (s2[j - 1] == s2[j - 2]))
					queryGapOpenScore = mBestScores[j] - mHomoPolymerGapOpenPenalty;

			if(queryGapExtendScore > queryGapOpenScore) {
				mQueryGapScores[j] = queryGapExtendScore;
				mSizesOfVerticalGaps[l] = (short)(mSizesOfVerticalGaps[l - queryLen] + 1);
			} else mQueryGapScores[j] = queryGapOpenScore;

			// mBestScores[j - 1] already holds the current row's value for the cell to the left
			referenceGapExtendScore = currentAnchorGapScore - mGapExtendPenalty;
			referenceGapOpenScore   = mBestScores[j - 1] - mGapOpenPenalty;

			// compute the homo-polymer gap score if enabled
			if(mUseHomoPolymerGapOpenPenalty)
				if((i > 1) && (s1[i - 1] == s1[i - 2]))
					referenceGapOpenScore = mBestScores[j - 1] - mHomoPolymerGapOpenPenalty;

			if(referenceGapExtendScore > referenceGapOpenScore) {
				currentAnchorGapScore = referenceGapExtendScore;
				mSizesOfHorizontalGaps[l] = (short)(mSizesOfHorizontalGaps[l - 1] + 1);
			} else currentAnchorGapScore = referenceGapOpenScore;

			// remember the previous row's score before overwriting it: it is the
			// diagonal predecessor of the next column
			bestScoreDiagonal = mBestScores[j];
			mBestScores[j] = MaxFloats(totalSimilarityScore, mQueryGapScores[j], currentAnchorGapScore);

			// determine the traceback direction
			// diagonal (445364713) > stop (238960195) > up (214378647) > left (166504495)
			if(mBestScores[j] == 0)                         mPointers[l] = Directions_STOP;
			else if(mBestScores[j] == totalSimilarityScore) mPointers[l] = Directions_DIAGONAL;
			else if(mBestScores[j] == mQueryGapScores[j])   mPointers[l] = Directions_UP;
			else                                            mPointers[l] = Directions_LEFT;

			// set the traceback start at the current cell i, j and score
			if(mBestScores[j] > BestScore) {
				BestRow    = i;
				BestColumn = j;
				BestScore  = mBestScores[j];
			}
		}
	}

	//
	// traceback
	//

	alignment.SwScore = BestScore;
	// aligned sequences
	int gappedAnchorLen  = 0;   // length of sequence #1 after alignment
	int gappedQueryLen   = 0;   // length of sequence #2 after alignment
	int numMismatches    = 0;   // the mismatched nucleotide count

	char c1, c2;

	// (ci, cj) is the current cell, ck the flat index of the start of row ci
	int ci = BestRow;
	int cj = BestColumn;
	int ck = ci * queryLen;

	// traceback flag
	bool keepProcessing = true;
	bool hasGap = false;   // only consumed by the disabled CorrectHomopolymerGapOrder call below

	// bookkeeping for the longest run of consecutive identical bases
	bool matchRegion = false;
	unsigned short longestMatch       = 0;
	unsigned short currentMatchLength = 0;

	while(keepProcessing) {

		// diagonal (445364713) > stop (238960195) > up (214378647) > left (166504495)
		switch(mPointers[ck + cj]) {

			case Directions_DIAGONAL:
				c1 = s1[--ci];
				c2 = s2[--cj];
				ck -= queryLen;

				// identical bases extend the current run, anything else closes it
				if ( s1[ci] == s2[cj] ) {
					matchRegion = true;
					++currentMatchLength;
				} else {
					matchRegion = false;
					longestMatch = ( currentMatchLength > longestMatch ) ? currentMatchLength : longestMatch;
					currentMatchLength = 0;
				}

				mReversedAnchor[gappedAnchorLen++] = c1;
				mReversedQuery[gappedQueryLen++]   = c2;

				// increment our mismatch counter
				if(mScoringMatrix[c1 - 'A'][c2 - 'A'] == mMismatchScore) numMismatches++;
				break;

			case Directions_STOP:
				// close a run that reaches the start of the alignment
				if ( matchRegion )
					longestMatch = ( currentMatchLength > longestMatch ) ? currentMatchLength : longestMatch;

				keepProcessing = false;
				break;

			case Directions_UP:
				if ( matchRegion ) {
					matchRegion = false;
					longestMatch = ( currentMatchLength > longestMatch ) ? currentMatchLength : longestMatch;
					currentMatchLength = 0;
				}

				// gap in the query: consume reference bases only
				for(unsigned int l = 0, len = mSizesOfVerticalGaps[ck + cj]; l < len; l++) {
					mReversedAnchor[gappedAnchorLen++] = s1[--ci];
					mReversedQuery[gappedQueryLen++]   = GAP;
					ck -= queryLen;
					numMismatches++;
				}
				hasGap = true;
				break;

			case Directions_LEFT:
				if ( matchRegion ) {
					matchRegion = false;
					longestMatch = ( currentMatchLength > longestMatch ) ? currentMatchLength : longestMatch;
					currentMatchLength = 0;
				}

				// gap in the reference: consume query bases only
				for(unsigned int l = 0, len = mSizesOfHorizontalGaps[ck + cj]; l < len; l++) {
					mReversedAnchor[gappedAnchorLen++] = GAP;
					mReversedQuery[gappedQueryLen++]   = s2[--cj];
					numMismatches++;
				}
				hasGap = true;
				break;
		}
	}

	// define the reference and query sequences
	mReversedAnchor[gappedAnchorLen] = 0;
	mReversedQuery[gappedQueryLen]   = 0;

	// catch sequences with different lengths
	if(gappedAnchorLen != gappedQueryLen) {
		cout << "ERROR: The aligned sequences have different lengths after Smith-Waterman-Gotoh algorithm." << endl;
		exit(1);
	}

	// reverse the strings and assign them to our alignment structure
	reverse(mReversedAnchor, mReversedAnchor + gappedAnchorLen);
	reverse(mReversedQuery,  mReversedQuery  + gappedQueryLen);

	alignment.Reference = mReversedAnchor;
	alignment.Query     = mReversedQuery;

	// set the reference endpoints (0-based, inclusive)
	alignment.ReferenceBegin = ci;
	alignment.ReferenceEnd   = BestRow - 1;

	// set the query endpoints; for reverse-strand alignments the coordinates
	// are mirrored around the query length
	if(alignment.IsReverseStrand) {
		alignment.QueryBegin = s2Length - BestColumn;
		alignment.QueryEnd   = s2Length - cj - 1;
	} else {
		alignment.QueryBegin = cj;
		alignment.QueryEnd   = BestColumn - 1;
	}

	// set the query length and number of mismatches
	alignment.QueryLength      = alignment.QueryEnd - alignment.QueryBegin + 1;
	alignment.NumMismatches    = numMismatches;
	alignment.NumLongestMatchs = longestMatch;


	// fix the gap order
	//if(hasGap) CorrectHomopolymerGapOrder(alignment);
}
// Example no. 3
// 0
// Scores a single cell of the banded matrix during the forward pass and records
// its traceback information.
//
// Parameters:
//   s1                   - sequence indexed by columnNum
//   s2                   - sequence indexed by rowNum
//   rowNum, columnNum    - position of the cell in sequence coordinates
//   currentQueryGapScore - in/out; running score of the gap that is compared
//                          against the cell stored at column - 1
//   rowOffset            - added to rowNum to obtain the stored row
//   columnOffset         - combined with rowNum and columnNum to obtain the
//                          stored column
//
// Returns the best score of the cell, which is also left in mBestScores.
float CBandedSmithWaterman::CalculateScore(const char* s1, const char* s2, const unsigned int rowNum, const unsigned int columnNum, float& currentQueryGapScore, const unsigned int rowOffset, const unsigned int columnOffset) {

	// translate the sequence coordinates into the banded storage layout
	const unsigned int bandRow    = rowNum + rowOffset;
	const unsigned int bandColumn = columnOffset - rowNum + columnNum;
	const unsigned int cell       = bandRow * (mBandwidth + 2) + bandColumn;

	// score of aligning the two bases on top of the score already stored in this column
	const float substitutionScore = mScoringMatrix[s1[columnNum] - 'A'][s2[rowNum] - 'A'];
	const float alignedScore      = mBestScores[bandColumn] + substitutionScore;

	// --------------------------------
	// gap in the query sequence
	// --------------------------------

	// NOTE(review): the guard is rowNum > 1 although s2[rowNum - 1] is already
	// valid for rowNum == 1 -- confirm whether skipping row 1 is intended.
	const bool queryInHomoPolymer = mUseHomoPolymerGapOpenPenalty && (rowNum > 1) && (s2[rowNum] == s2[rowNum - 1]);

	const float extendQueryGap = currentQueryGapScore - mGapExtendPenalty;
	float openQueryGap;
	if(queryInHomoPolymer) {
		openQueryGap = mBestScores[bandColumn - 1] - mHomoPolymerGapOpenPenalty;
	} else {
		openQueryGap = mBestScores[bandColumn - 1] - mGapOpenPenalty;
	}

	if(extendQueryGap > openQueryGap) {
		// extending wins: the gap grows by one relative to the cell on the left
		currentQueryGapScore = extendQueryGap;
		mPointers[cell].mSizeOfHorizontalGaps = mPointers[cell - 1].mSizeOfHorizontalGaps + 1;
	} else {
		currentQueryGapScore = openQueryGap;
	}

	// --------------------------------
	// gap in the reference sequence
	// --------------------------------

	// NOTE(review): same question as above for the columnNum > 1 guard.
	const bool anchorInHomoPolymer = mUseHomoPolymerGapOpenPenalty && (columnNum > 1) && (s1[columnNum] == s1[columnNum - 1]);

	const float extendAnchorGap = mAnchorGapScores[bandColumn + 1] - mGapExtendPenalty;
	float openAnchorGap;
	if(anchorInHomoPolymer) {
		openAnchorGap = mBestScores[bandColumn + 1] - mHomoPolymerGapOpenPenalty;
	} else {
		openAnchorGap = mBestScores[bandColumn + 1] - mGapOpenPenalty;
	}

	if(extendAnchorGap > openAnchorGap) {
		// extending wins: the gap grows by one relative to the cell
		// mBandwidth + 1 entries earlier in the pointer array
		mAnchorGapScores[bandColumn] = extendAnchorGap;
		mPointers[cell].mSizeOfVerticalGaps = mPointers[cell - mBandwidth - 1].mSizeOfVerticalGaps + 1;
	} else {
		mAnchorGapScores[bandColumn] = openAnchorGap;
	}

	// --------------------------------
	// best score and traceback direction
	// --------------------------------

	mBestScores[bandColumn] = MaxFloats(alignedScore, currentQueryGapScore, mAnchorGapScores[bandColumn]);

	// Precedence: stop, then the aligned pair, then the query gap, then the
	// reference gap. Note that the aligned pair is stored as Directions_UP and
	// the reference gap as Directions_DIAGONAL -- presumably a consequence of
	// the skewed banded layout; confirm against the traceback routine.
	if(mBestScores[bandColumn] == 0) {
		mPointers[cell].Direction = Directions_STOP;
	} else if(mBestScores[bandColumn] == alignedScore) {
		mPointers[cell].Direction = Directions_UP;
	} else if(mBestScores[bandColumn] == currentQueryGapScore) {
		mPointers[cell].Direction = Directions_LEFT;
	} else {
		mPointers[cell].Direction = Directions_DIAGONAL;
	}

	return mBestScores[bandColumn];
}