// aligns the query sequence to the reference using the Smith Waterman Gotoh algorithm void CSmithWatermanGotoh::Align(unsigned int& referenceAl, string& cigarAl, const string& s1, const string& s2) { if((s1.length() == 0) || (s2.length() == 0)) { cout << "ERROR: Found a read with a zero length." << endl; exit(1); } unsigned int referenceLen = s1.length() + 1; unsigned int queryLen = s2.length() + 1; unsigned int sequenceSumLength = s1.length() + s2.length(); // reinitialize our matrices if((referenceLen * queryLen) > mCurrentMatrixSize) { // calculate the new matrix size mCurrentMatrixSize = referenceLen * queryLen; // delete the old arrays if(mPointers) delete [] mPointers; if(mSizesOfVerticalGaps) delete [] mSizesOfVerticalGaps; if(mSizesOfHorizontalGaps) delete [] mSizesOfHorizontalGaps; try { // initialize the arrays mPointers = new char[mCurrentMatrixSize]; mSizesOfVerticalGaps = new short[mCurrentMatrixSize]; mSizesOfHorizontalGaps = new short[mCurrentMatrixSize]; } catch(bad_alloc) { cout << "ERROR: Unable to allocate enough memory for the Smith-Waterman algorithm." << endl; exit(1); } } // initialize the traceback matrix to STOP memset((char*)mPointers, 0, SIZEOF_CHAR * queryLen); for(unsigned int i = 1; i < referenceLen; i++) mPointers[i * queryLen] = 0; // initialize the gap matrices to 1 uninitialized_fill(mSizesOfVerticalGaps, mSizesOfVerticalGaps + mCurrentMatrixSize, 1); uninitialized_fill(mSizesOfHorizontalGaps, mSizesOfHorizontalGaps + mCurrentMatrixSize, 1); // initialize our repeat counts if they are needed vector<map<string, int> > referenceRepeats; vector<map<string, int> > queryRepeats; if (mUseRepeatGapExtensionPenalty) { for (unsigned int i = 0; i < queryLen; ++i) queryRepeats.push_back(repeatCounts(i, s2, repeat_size_max)); for (unsigned int i = 0; i < referenceLen; ++i) referenceRepeats.push_back(repeatCounts(i, s1, repeat_size_max)); // keep only the biggest repeat vector<map<string, int> >::iterator q = queryRepeats.begin(); for (; q != queryRepeats.end(); ++q) { map<string, int>::iterator biggest = q->begin(); map<string, int>::iterator z = q->begin(); for (; z != q->end(); ++z) if (z->first.size() > biggest->first.size()) biggest = z; z = q->begin(); while (z != q->end()) { if (z != biggest) q->erase(z++); else ++z; } } q = referenceRepeats.begin(); for (; q != referenceRepeats.end(); ++q) { map<string, int>::iterator biggest = q->begin(); map<string, int>::iterator z = q->begin(); for (; z != q->end(); ++z) if (z->first.size() > biggest->first.size()) biggest = z; z = q->begin(); while (z != q->end()) { if (z != biggest) q->erase(z++); else ++z; } } // remove repeat information from ends of queries // this results in the addition of spurious flanking deletions in repeats map<string, int>& qrend = queryRepeats.at(queryRepeats.size() - 2); if (!qrend.empty()) { int queryEndRepeatBases = qrend.begin()->first.size() * qrend.begin()->second; for (int i = 0; i < queryEndRepeatBases; ++i) queryRepeats.at(queryRepeats.size() - 2 - i).clear(); } map<string, int>& qrbegin = queryRepeats.front(); if (!qrbegin.empty()) { int queryBeginRepeatBases = qrbegin.begin()->first.size() * qrbegin.begin()->second; for (int i = 0; i < queryBeginRepeatBases; ++i) queryRepeats.at(i).clear(); } } int entropyWindowSize = 8; vector<float> referenceEntropies; vector<float> queryEntropies; if (mUseEntropyGapOpenPenalty) { for (unsigned int i = 0; i < queryLen; ++i) queryEntropies.push_back( shannon_H((char*) &s2[max(0, min((int) i - entropyWindowSize / 2, (int) queryLen - entropyWindowSize - 1))], entropyWindowSize)); for (unsigned int i = 0; i < referenceLen; ++i) referenceEntropies.push_back( shannon_H((char*) &s1[max(0, min((int) i - entropyWindowSize / 2, (int) referenceLen - entropyWindowSize - 1))], entropyWindowSize)); } // normalize entropies /* float qsum = 0; float qnorm = 0; float qmax = 0; for (vector<float>::iterator q = queryEntropies.begin(); q != queryEntropies.end(); ++q) { qsum += *q; if (*q > qmax) qmax = *q; } qnorm = qsum / queryEntropies.size(); for (vector<float>::iterator q = queryEntropies.begin(); q != queryEntropies.end(); ++q) *q = *q / qsum + qmax; float rsum = 0; float rnorm = 0; float rmax = 0; for (vector<float>::iterator r = referenceEntropies.begin(); r != referenceEntropies.end(); ++r) { rsum += *r; if (*r > rmax) rmax = *r; } rnorm = rsum / referenceEntropies.size(); for (vector<float>::iterator r = referenceEntropies.begin(); r != referenceEntropies.end(); ++r) *r = *r / rsum + rmax; */ // // construct // // reinitialize our query-dependent arrays if(s2.length() > mCurrentQuerySize) { // calculate the new query array size mCurrentQuerySize = s2.length(); // delete the old arrays if(mQueryGapScores) delete [] mQueryGapScores; if(mBestScores) delete [] mBestScores; // initialize the arrays try { mQueryGapScores = new float[mCurrentQuerySize + 1]; mBestScores = new float[mCurrentQuerySize + 1]; } catch(bad_alloc) { cout << "ERROR: Unable to allocate enough memory for the Smith-Waterman algorithm." << endl; exit(1); } } // reinitialize our reference+query-dependent arrays if(sequenceSumLength > mCurrentAQSumSize) { // calculate the new reference array size mCurrentAQSumSize = sequenceSumLength; // delete the old arrays if(mReversedAnchor) delete [] mReversedAnchor; if(mReversedQuery) delete [] mReversedQuery; // initialize the arrays try { mReversedAnchor = new char[mCurrentAQSumSize + 1]; // reversed sequence #1 mReversedQuery = new char[mCurrentAQSumSize + 1]; // reversed sequence #2 } catch(bad_alloc) { cout << "ERROR: Unable to allocate enough memory for the Smith-Waterman algorithm." << endl; exit(1); } } // initialize the gap score and score vectors uninitialized_fill(mQueryGapScores, mQueryGapScores + queryLen, FLOAT_NEGATIVE_INFINITY); memset((char*)mBestScores, 0, SIZEOF_FLOAT * queryLen); float similarityScore, totalSimilarityScore, bestScoreDiagonal; float queryGapExtendScore, queryGapOpenScore; float referenceGapExtendScore, referenceGapOpenScore, currentAnchorGapScore; unsigned int BestColumn = 0; unsigned int BestRow = 0; BestScore = FLOAT_NEGATIVE_INFINITY; for(unsigned int i = 1, k = queryLen; i < referenceLen; i++, k += queryLen) { currentAnchorGapScore = FLOAT_NEGATIVE_INFINITY; bestScoreDiagonal = mBestScores[0]; for(unsigned int j = 1, l = k + 1; j < queryLen; j++, l++) { // calculate our similarity score similarityScore = mScoringMatrix[s1[i - 1] - 'A'][s2[j - 1] - 'A']; // fill the matrices totalSimilarityScore = bestScoreDiagonal + similarityScore; //cerr << "i: " << i << ", j: " << j << ", totalSimilarityScore: " << totalSimilarityScore << endl; queryGapExtendScore = mQueryGapScores[j] - mGapExtendPenalty; queryGapOpenScore = mBestScores[j] - mGapOpenPenalty; // compute the h**o-polymer gap score if enabled if(mUseHomoPolymerGapOpenPenalty) if((j > 1) && (s2[j - 1] == s2[j - 2])) queryGapOpenScore = mBestScores[j] - mHomoPolymerGapOpenPenalty; // compute the entropy gap score if enabled if (mUseEntropyGapOpenPenalty) { queryGapOpenScore = mBestScores[j] - mGapOpenPenalty * max(queryEntropies.at(j), referenceEntropies.at(i)) * mEntropyGapOpenPenalty; } int gaplen = mSizesOfVerticalGaps[l - queryLen] + 1; if (mUseRepeatGapExtensionPenalty) { map<string, int>& repeats = queryRepeats[j]; // does the sequence which would be inserted or deleted in this gap match the repeat structure which it is embedded in? if (!repeats.empty()) { const pair<string, int>& repeat = *repeats.begin(); int repeatsize = repeat.first.size(); if (gaplen != repeatsize && gaplen % repeatsize != 0) { gaplen = gaplen / repeatsize + repeatsize; } if ((repeat.first.size() * repeat.second) > 3 && gaplen + i < s1.length()) { string gapseq = string(&s1[i], gaplen); if (gapseq == repeat.first || isRepeatUnit(gapseq, repeat.first)) { queryGapExtendScore = mQueryGapScores[j] + mRepeatGapExtensionPenalty / (float) gaplen; // mMaxRepeatGapExtensionPenalty) } else { queryGapExtendScore = mQueryGapScores[j] - mGapExtendPenalty; } } } else { queryGapExtendScore = mQueryGapScores[j] - mGapExtendPenalty; } } if(queryGapExtendScore > queryGapOpenScore) { mQueryGapScores[j] = queryGapExtendScore; mSizesOfVerticalGaps[l] = gaplen; } else mQueryGapScores[j] = queryGapOpenScore; referenceGapExtendScore = currentAnchorGapScore - mGapExtendPenalty; referenceGapOpenScore = mBestScores[j - 1] - mGapOpenPenalty; // compute the h**o-polymer gap score if enabled if(mUseHomoPolymerGapOpenPenalty) if((i > 1) && (s1[i - 1] == s1[i - 2])) referenceGapOpenScore = mBestScores[j - 1] - mHomoPolymerGapOpenPenalty; // compute the entropy gap score if enabled if (mUseEntropyGapOpenPenalty) { referenceGapOpenScore = mBestScores[j - 1] - mGapOpenPenalty * max(queryEntropies.at(j), referenceEntropies.at(i)) * mEntropyGapOpenPenalty; } gaplen = mSizesOfHorizontalGaps[l - 1] + 1; if (mUseRepeatGapExtensionPenalty) { map<string, int>& repeats = referenceRepeats[i]; // does the sequence which would be inserted or deleted in this gap match the repeat structure which it is embedded in? if (!repeats.empty()) { const pair<string, int>& repeat = *repeats.begin(); int repeatsize = repeat.first.size(); if (gaplen != repeatsize && gaplen % repeatsize != 0) { gaplen = gaplen / repeatsize + repeatsize; } if ((repeat.first.size() * repeat.second) > 3 && gaplen + j < s2.length()) { string gapseq = string(&s2[j], gaplen); if (gapseq == repeat.first || isRepeatUnit(gapseq, repeat.first)) { referenceGapExtendScore = currentAnchorGapScore + mRepeatGapExtensionPenalty / (float) gaplen; //mMaxRepeatGapExtensionPenalty) } else { referenceGapExtendScore = currentAnchorGapScore - mGapExtendPenalty; } } } else { referenceGapExtendScore = currentAnchorGapScore - mGapExtendPenalty; } } if(referenceGapExtendScore > referenceGapOpenScore) { currentAnchorGapScore = referenceGapExtendScore; mSizesOfHorizontalGaps[l] = gaplen; } else currentAnchorGapScore = referenceGapOpenScore; bestScoreDiagonal = mBestScores[j]; mBestScores[j] = MaxFloats(totalSimilarityScore, mQueryGapScores[j], currentAnchorGapScore); // determine the traceback direction // diagonal (445364713) > stop (238960195) > up (214378647) > left (166504495) if(mBestScores[j] == 0) mPointers[l] = Directions_STOP; else if(mBestScores[j] == totalSimilarityScore) mPointers[l] = Directions_DIAGONAL; else if(mBestScores[j] == mQueryGapScores[j]) mPointers[l] = Directions_UP; else mPointers[l] = Directions_LEFT; // set the traceback start at the current cell i, j and score if(mBestScores[j] > BestScore) { BestRow = i; BestColumn = j; BestScore = mBestScores[j]; } } } // // traceback // // aligned sequences int gappedAnchorLen = 0; // length of sequence #1 after alignment int gappedQueryLen = 0; // length of sequence #2 after alignment int numMismatches = 0; // the mismatched nucleotide count char c1, c2; int ci = BestRow; int cj = BestColumn; int ck = ci * queryLen; // traceback flag bool keepProcessing = true; while(keepProcessing) { //cerr << ci << " " << cj << " " << ck << " ... " << gappedAnchorLen << " " << gappedQueryLen << endl; // diagonal (445364713) > stop (238960195) > up (214378647) > left (166504495) switch(mPointers[ck + cj]) { case Directions_DIAGONAL: c1 = s1[--ci]; c2 = s2[--cj]; ck -= queryLen; mReversedAnchor[gappedAnchorLen++] = c1; mReversedQuery[gappedQueryLen++] = c2; // increment our mismatch counter if(mScoringMatrix[c1 - 'A'][c2 - 'A'] == mMismatchScore) numMismatches++; break; case Directions_STOP: keepProcessing = false; break; case Directions_UP: for(unsigned int l = 0, len = mSizesOfVerticalGaps[ck + cj]; l < len; l++) { if (ci <= 0) { keepProcessing = false; break; } mReversedAnchor[gappedAnchorLen++] = s1[--ci]; mReversedQuery[gappedQueryLen++] = GAP; ck -= queryLen; numMismatches++; } break; case Directions_LEFT: for(unsigned int l = 0, len = mSizesOfHorizontalGaps[ck + cj]; l < len; l++) { if (cj <= 0) { keepProcessing = false; break; } mReversedAnchor[gappedAnchorLen++] = GAP; mReversedQuery[gappedQueryLen++] = s2[--cj]; numMismatches++; } break; } } // define the reference and query sequences mReversedAnchor[gappedAnchorLen] = 0; mReversedQuery[gappedQueryLen] = 0; // catch sequences with different lengths if(gappedAnchorLen != gappedQueryLen) { cout << "ERROR: The aligned sequences have different lengths after Smith-Waterman-Gotoh algorithm." << endl; exit(1); } // reverse the strings and assign them to our alignment structure reverse(mReversedAnchor, mReversedAnchor + gappedAnchorLen); reverse(mReversedQuery, mReversedQuery + gappedQueryLen); //alignment.Reference = mReversedAnchor; //alignment.Query = mReversedQuery; // set the reference endpoints //alignment.ReferenceBegin = ci; //alignment.ReferenceEnd = BestRow - 1; referenceAl = ci; // set the query endpoints /* if(alignment.IsReverseComplement) { alignment.QueryBegin = s2Length - BestColumn; alignment.QueryEnd = s2Length - cj - 1; // alignment.QueryLength= alignment.QueryBegin - alignment.QueryEnd + 1; } else { alignment.QueryBegin = cj; alignment.QueryEnd = BestColumn - 1; // alignment.QueryLength= alignment.QueryEnd - alignment.QueryBegin + 1; } */ // set the query length and number of mismatches //alignment.QueryLength = alignment.QueryEnd - alignment.QueryBegin + 1; //alignment.NumMismatches = numMismatches; unsigned int alLength = strlen(mReversedAnchor); unsigned int m = 0, d = 0, i = 0; bool dashRegion = false; ostringstream oCigar (ostringstream::out); int insertedBases = 0; if ( cj != 0 ) { if ( cj > 0 ) { oCigar << cj << 'S'; } else { // how do we get negative cj's? referenceAl -= cj; alLength += cj; } } for ( unsigned int j = 0; j < alLength; j++ ) { // m if ( ( mReversedAnchor[j] != GAP ) && ( mReversedQuery[j] != GAP ) ) { if ( dashRegion ) { if ( d != 0 ) oCigar << d << 'D'; else { oCigar << i << 'I'; insertedBases += i; } } dashRegion = false; m++; d = 0; i = 0; } else { if ( !dashRegion && m ) oCigar << m << 'M'; dashRegion = true; m = 0; if ( mReversedAnchor[j] == GAP ) { if ( d != 0 ) oCigar << d << 'D'; i++; d = 0; } else { if ( i != 0) { oCigar << i << 'I'; insertedBases += i; } d++; i = 0; } } } if ( m != 0 ) oCigar << m << 'M'; else if ( d != 0 ) oCigar << d << 'D'; else if ( i != 0 ) oCigar << i << 'I'; if ( BestColumn != s2.length() ) oCigar << s2.length() - BestColumn << 'S'; cigarAl = oCigar.str(); // fix the gap order CorrectHomopolymerGapOrder(alLength, numMismatches); if (mUseEntropyGapOpenPenalty || mUseRepeatGapExtensionPenalty) { int offset = 0; string oldCigar; try { oldCigar = cigarAl; stablyLeftAlign(s2, cigarAl, s1.substr(referenceAl, alLength - insertedBases), offset); } catch (...) { cerr << "an exception occurred when left-aligning " << s1 << " " << s2 << endl; cigarAl = oldCigar; // undo the failed left-realignment attempt offset = 0; } referenceAl += offset; } }
// aligns the query sequence to the reference using the Smith Waterman Gotoh algorithm void CSmithWatermanGotoh::Align(Alignment& alignment, const char* s1, const unsigned int s1Length, const char* s2, const unsigned int s2Length) { if((s1Length == 0) || (s2Length == 0)) { cout << "ERROR: Found a read with a zero length." << endl; exit(1); } unsigned int referenceLen = s1Length + 1; unsigned int queryLen = s2Length + 1; unsigned int sequenceSumLength = s1Length + s2Length; // reinitialize our matrices if((referenceLen * queryLen) > mCurrentMatrixSize) { // calculate the new matrix size mCurrentMatrixSize = referenceLen * queryLen; // delete the old arrays if(mPointers) delete [] mPointers; if(mSizesOfVerticalGaps) delete [] mSizesOfVerticalGaps; if(mSizesOfHorizontalGaps) delete [] mSizesOfHorizontalGaps; try { // initialize the arrays mPointers = new char[mCurrentMatrixSize]; mSizesOfVerticalGaps = new short[mCurrentMatrixSize]; mSizesOfHorizontalGaps = new short[mCurrentMatrixSize]; } catch(bad_alloc) { cout << "ERROR: Unable to allocate enough memory for the Smith-Waterman algorithm." << endl; exit(1); } } // initialize the traceback matrix to STOP memset((char*)mPointers, 0, SIZEOF_CHAR * queryLen); for(unsigned int i = 1; i < referenceLen; i++) mPointers[i * queryLen] = 0; // initialize the gap matrices to 1 uninitialized_fill(mSizesOfVerticalGaps, mSizesOfVerticalGaps + mCurrentMatrixSize, 1); uninitialized_fill(mSizesOfHorizontalGaps, mSizesOfHorizontalGaps + mCurrentMatrixSize, 1); // // construct // // reinitialize our query-dependent arrays if(s2Length > mCurrentQuerySize) { // calculate the new query array size mCurrentQuerySize = s2Length; // delete the old arrays if(mQueryGapScores) delete [] mQueryGapScores; if(mBestScores) delete [] mBestScores; // initialize the arrays try { mQueryGapScores = new float[mCurrentQuerySize + 1]; mBestScores = new float[mCurrentQuerySize + 1]; } catch(bad_alloc) { cout << "ERROR: Unable to allocate enough memory for the Smith-Waterman algorithm." << endl; exit(1); } } // reinitialize our reference+query-dependent arrays if(sequenceSumLength > mCurrentAQSumSize) { // calculate the new reference array size mCurrentAQSumSize = sequenceSumLength; // delete the old arrays if(mReversedAnchor) delete [] mReversedAnchor; if(mReversedQuery) delete [] mReversedQuery; // initialize the arrays try { mReversedAnchor = new char[mCurrentAQSumSize + 1]; // reversed sequence #1 mReversedQuery = new char[mCurrentAQSumSize + 1]; // reversed sequence #2 } catch(bad_alloc) { cout << "ERROR: Unable to allocate enough memory for the Smith-Waterman algorithm." << endl; exit(1); } } // initialize the gap score and score vectors uninitialized_fill(mQueryGapScores, mQueryGapScores + queryLen, FLOAT_NEGATIVE_INFINITY); memset((char*)mBestScores, 0, SIZEOF_FLOAT * queryLen); float similarityScore, totalSimilarityScore, bestScoreDiagonal; float queryGapExtendScore, queryGapOpenScore; float referenceGapExtendScore, referenceGapOpenScore, currentAnchorGapScore; unsigned int BestColumn = 0; unsigned int BestRow = 0; float BestScore = FLOAT_NEGATIVE_INFINITY; for(unsigned int i = 1, k = queryLen; i < referenceLen; i++, k += queryLen) { currentAnchorGapScore = FLOAT_NEGATIVE_INFINITY; bestScoreDiagonal = mBestScores[0]; for(unsigned int j = 1, l = k + 1; j < queryLen; j++, l++) { // calculate our similarity score similarityScore = mScoringMatrix[s1[i - 1] - 'A'][s2[j - 1] - 'A']; // fill the matrices totalSimilarityScore = bestScoreDiagonal + similarityScore; //cout << "i: " << i << ", j: " << j << ", totalSimilarityScore: " << totalSimilarityScore << endl; queryGapExtendScore = mQueryGapScores[j] - mGapExtendPenalty; queryGapOpenScore = mBestScores[j] - mGapOpenPenalty; // compute the h**o-polymer gap score if enabled if(mUseHomoPolymerGapOpenPenalty) if((j > 1) && (s2[j - 1] == s2[j - 2])) queryGapOpenScore = mBestScores[j] - mHomoPolymerGapOpenPenalty; if(queryGapExtendScore > queryGapOpenScore) { mQueryGapScores[j] = queryGapExtendScore; mSizesOfVerticalGaps[l] = (short)(mSizesOfVerticalGaps[l - queryLen] + 1); } else mQueryGapScores[j] = queryGapOpenScore; referenceGapExtendScore = currentAnchorGapScore - mGapExtendPenalty; referenceGapOpenScore = mBestScores[j - 1] - mGapOpenPenalty; // compute the h**o-polymer gap score if enabled if(mUseHomoPolymerGapOpenPenalty) if((i > 1) && (s1[i - 1] == s1[i - 2])) referenceGapOpenScore = mBestScores[j - 1] - mHomoPolymerGapOpenPenalty; if(referenceGapExtendScore > referenceGapOpenScore) { currentAnchorGapScore = referenceGapExtendScore; mSizesOfHorizontalGaps[l] = (short)(mSizesOfHorizontalGaps[l - 1] + 1); } else currentAnchorGapScore = referenceGapOpenScore; bestScoreDiagonal = mBestScores[j]; mBestScores[j] = MaxFloats(totalSimilarityScore, mQueryGapScores[j], currentAnchorGapScore); // determine the traceback direction // diagonal (445364713) > stop (238960195) > up (214378647) > left (166504495) if(mBestScores[j] == 0) mPointers[l] = Directions_STOP; else if(mBestScores[j] == totalSimilarityScore) mPointers[l] = Directions_DIAGONAL; else if(mBestScores[j] == mQueryGapScores[j]) mPointers[l] = Directions_UP; else mPointers[l] = Directions_LEFT; // set the traceback start at the current cell i, j and score if(mBestScores[j] > BestScore) { BestRow = i; BestColumn = j; BestScore = mBestScores[j]; } } } // // traceback // alignment.SwScore = BestScore; // aligned sequences int gappedAnchorLen = 0; // length of sequence #1 after alignment int gappedQueryLen = 0; // length of sequence #2 after alignment int numMismatches = 0; // the mismatched nucleotide count char c1, c2; int ci = BestRow; int cj = BestColumn; int ck = ci * queryLen; // traceback flag bool keepProcessing = true; bool hasGap = false; bool matchRegion = false; unsigned short longestMatch = 0; unsigned short currentMatchLength = 0; while(keepProcessing) { // diagonal (445364713) > stop (238960195) > up (214378647) > left (166504495) switch(mPointers[ck + cj]) { case Directions_DIAGONAL: c1 = s1[--ci]; c2 = s2[--cj]; ck -= queryLen; if ( s1[ci] == s2[cj] ) { matchRegion = true; ++currentMatchLength; } else { matchRegion = false; longestMatch = ( currentMatchLength > longestMatch ) ? currentMatchLength : longestMatch; currentMatchLength = 0; } mReversedAnchor[gappedAnchorLen++] = c1; mReversedQuery[gappedQueryLen++] = c2; // increment our mismatch counter if(mScoringMatrix[c1 - 'A'][c2 - 'A'] == mMismatchScore) numMismatches++; break; case Directions_STOP: if ( matchRegion ) longestMatch = ( currentMatchLength > longestMatch ) ? currentMatchLength : longestMatch; keepProcessing = false; break; case Directions_UP: if ( matchRegion ) { matchRegion = false; longestMatch = ( currentMatchLength > longestMatch ) ? currentMatchLength : longestMatch; currentMatchLength = 0; } for(unsigned int l = 0, len = mSizesOfVerticalGaps[ck + cj]; l < len; l++) { mReversedAnchor[gappedAnchorLen++] = s1[--ci]; mReversedQuery[gappedQueryLen++] = GAP; ck -= queryLen; numMismatches++; } hasGap = true; break; case Directions_LEFT: if ( matchRegion ) { matchRegion = false; longestMatch = ( currentMatchLength > longestMatch ) ? currentMatchLength : longestMatch; currentMatchLength = 0; } for(unsigned int l = 0, len = mSizesOfHorizontalGaps[ck + cj]; l < len; l++) { mReversedAnchor[gappedAnchorLen++] = GAP; mReversedQuery[gappedQueryLen++] = s2[--cj]; numMismatches++; } hasGap = true; break; } } // define the reference and query sequences mReversedAnchor[gappedAnchorLen] = 0; mReversedQuery[gappedQueryLen] = 0; // catch sequences with different lengths if(gappedAnchorLen != gappedQueryLen) { cout << "ERROR: The aligned sequences have different lengths after Smith-Waterman-Gotoh algorithm." << endl; exit(1); } // reverse the strings and assign them to our alignment structure reverse(mReversedAnchor, mReversedAnchor + gappedAnchorLen); reverse(mReversedQuery, mReversedQuery + gappedQueryLen); alignment.Reference = mReversedAnchor; alignment.Query = mReversedQuery; // set the reference endpoints alignment.ReferenceBegin = ci; alignment.ReferenceEnd = BestRow - 1; // set the query endpoints if(alignment.IsReverseStrand) { alignment.QueryBegin = s2Length - BestColumn; alignment.QueryEnd = s2Length - cj - 1; } else { alignment.QueryBegin = cj; alignment.QueryEnd = BestColumn - 1; } // set the query length and number of mismatches alignment.QueryLength = alignment.QueryEnd - alignment.QueryBegin + 1; alignment.NumMismatches = numMismatches; alignment.NumLongestMatchs = longestMatch; // fix the gap order //if(hasGap) CorrectHomopolymerGapOrder(alignment); }
// calculates the score during the forward algorithm float CBandedSmithWaterman::CalculateScore(const char* s1, const char* s2, const unsigned int rowNum, const unsigned int columnNum, float& currentQueryGapScore, const unsigned int rowOffset, const unsigned int columnOffset) { // initialize const unsigned int row = rowNum + rowOffset; const unsigned int column = columnOffset - rowNum + columnNum; const unsigned int position = row * (mBandwidth + 2) + column; // retrieve the similarity scores const float similarityScore = mScoringMatrix[s1[columnNum] - 'A'][s2[rowNum] - 'A']; const float totalSimilarityScore = mBestScores[column] + similarityScore; // ================================ // open a gap in the query sequence // ================================ float queryGapExtendScore = currentQueryGapScore - mGapExtendPenalty; float queryGapOpenScore = mBestScores[column - 1] - mGapOpenPenalty; // compute the h**o-polymer gap score if enabled if(mUseHomoPolymerGapOpenPenalty) if((rowNum > 1) && (s2[rowNum] == s2[rowNum - 1])) queryGapOpenScore = mBestScores[column - 1] - mHomoPolymerGapOpenPenalty; if(queryGapExtendScore > queryGapOpenScore) { currentQueryGapScore = queryGapExtendScore; mPointers[position].mSizeOfHorizontalGaps = mPointers[position - 1].mSizeOfHorizontalGaps + 1; } else currentQueryGapScore = queryGapOpenScore; // ==================================== // open a gap in the reference sequence // ==================================== float anchorGapExtendScore = mAnchorGapScores[column + 1] - mGapExtendPenalty; float anchorGapOpenScore = mBestScores[column + 1] - mGapOpenPenalty; // compute the h**o-polymer gap score if enabled if(mUseHomoPolymerGapOpenPenalty) if((columnNum > 1) && (s1[columnNum] == s1[columnNum - 1])) anchorGapOpenScore = mBestScores[column + 1] - mHomoPolymerGapOpenPenalty; if(anchorGapExtendScore > anchorGapOpenScore) { mAnchorGapScores[column] = anchorGapExtendScore; mPointers[position].mSizeOfVerticalGaps = mPointers[position - mBandwidth - 1].mSizeOfVerticalGaps + 1; } else mAnchorGapScores[column] = anchorGapOpenScore; // ====================================== // calculate the best score and direction // ====================================== //mBestScores[column] = MaxFloats(totalSimilarityScore, mAnchorGapScores[column], currentQueryGapScore); mBestScores[column] = MaxFloats(totalSimilarityScore, currentQueryGapScore, mAnchorGapScores[column]); // determine the traceback direction // diagonal (445364713) > stop (238960195) > up (214378647) > left (166504495) if(mBestScores[column] == 0) mPointers[position].Direction = Directions_STOP; else if(mBestScores[column] == totalSimilarityScore) mPointers[position].Direction = Directions_UP; else if(mBestScores[column] == currentQueryGapScore) mPointers[position].Direction = Directions_LEFT; else mPointers[position].Direction = Directions_DIAGONAL; return mBestScores[column]; }