// aligns the query sequence to the anchor using the Smith Waterman Gotoh algorithm void CBandedSmithWaterman::Align(Alignment& alignment, const char* s1, const unsigned int s1Length, const char* s2, const unsigned int s2Length, HashRegion& hr) { // determine the hash region type unsigned int rowOffset; unsigned int columnOffset; PositionType positionType; if(hr.Begin == 0) { if(hr.QueryBegin == 0) { rowOffset = 1; columnOffset = (mBandwidth / 2) + 1; positionType = Position_REF_AND_QUERY_ZERO; } else { rowOffset = 1 - hr.QueryBegin; columnOffset = (mBandwidth / 2) + 1 + hr.QueryBegin; positionType = Position_REF_ZERO; } } else { if(hr.QueryBegin == 0) { rowOffset = 1; columnOffset = (mBandwidth / 2) + 1 - hr.Begin; positionType = Position_QUERY_ZERO; } else { rowOffset = 1 - hr.QueryBegin; columnOffset = (mBandwidth / 2) + 1 + hr.QueryBegin - hr.Begin; positionType = Position_REF_AND_QUERO_NONZERO; } } // ========================= // Reinitialize the matrices // ========================= ReinitializeMatrices(positionType, s1Length, s2Length, hr); // ======================================= // Banded Smith-Waterman forward algorithm // ======================================= unsigned int bestColumn = 0; unsigned int bestRow = 0; float bestScore = FLOAT_NEGATIVE_INFINITY; float currentQueryGapScore; // rowNum and column indicate the row and column numbers in the Smith-Waterman matrix respectively unsigned int rowNum = hr.QueryBegin; unsigned int columnNum = hr.Begin; // indicates how many rows including blank elements in the Banded SmithWaterman int numBlankElements = (mBandwidth / 2) - columnNum; // upper triangle matrix in Banded Smith-Waterman for( ; numBlankElements > 0; numBlankElements--, rowNum++){ // in the upper triangle matrix, we always start at the 0th column columnNum = 0; // columnEnd indicates how many columns which should be dealt with in the current row unsigned int columnEnd = min((mBandwidth - numBlankElements), (s1Length - columnNum + 1) ); currentQueryGapScore = FLOAT_NEGATIVE_INFINITY; for( unsigned int j = 0; j < columnEnd; j++){ float score = CalculateScore(s1, s2, rowNum, columnNum, currentQueryGapScore, rowOffset, columnOffset); UpdateBestScore(bestRow, bestColumn, bestScore, rowNum, columnNum, score); columnNum++; } // replace the columnNum to the middle column in the Smith-Waterman matrix columnNum = columnNum - (mBandwidth / 2); } // complete matrix in Banded Smith-Waterman unsigned int completeNum = min((s1Length - columnNum - (mBandwidth / 2)), (s2Length - rowNum)); for(unsigned int i = 0; i < completeNum; i++, rowNum++){ columnNum = columnNum - (mBandwidth / 2); // there are mBandwidth columns which should be dealt with in each row currentQueryGapScore = FLOAT_NEGATIVE_INFINITY; for(unsigned int j = 0; j < mBandwidth; j++){ float score = CalculateScore(s1, s2, rowNum, columnNum, currentQueryGapScore, rowOffset, columnOffset); UpdateBestScore(bestRow, bestColumn, bestScore, rowNum, columnNum, score); columnNum++; } // replace the columnNum to the middle column in the Smith-Waterman matrix // because mBandwidth is an odd number, everytime the following equation shifts a column (pluses 1). columnNum = columnNum - (mBandwidth / 2); } // lower triangle matrix numBlankElements = min(mBandwidth, (s2Length - rowNum)); columnNum = columnNum - (mBandwidth / 2); for(unsigned int i = 0; numBlankElements > 0; i++, rowNum++, numBlankElements--) { mBestScores[ mBandwidth - i ] = FLOAT_NEGATIVE_INFINITY;; // columnEnd indicates how many columns which should be dealt with currentQueryGapScore = FLOAT_NEGATIVE_INFINITY; for( unsigned int j = columnNum; j < s1Length; j++){ float score = CalculateScore(s1, s2, rowNum, columnNum, currentQueryGapScore, rowOffset, columnOffset); UpdateBestScore(bestRow, bestColumn, bestScore, rowNum, columnNum, score); columnNum++; } // replace the columnNum to the middle column in the Smith-Waterman matrix columnNum = columnNum - mBandwidth + i + 2; } // ========================================= // Banded Smith-Waterman backtrace algorithm // ========================================= Traceback(alignment, s1, s2, s2Length, bestRow, bestColumn, rowOffset, columnOffset); }
// aligns the query sequence to the anchor using the Smith Waterman Gotoh algorithm void CBandedSmithWaterman::Align(unsigned int& referenceAl, string& cigarAl, const string& s1, const string& s2, pair< pair<unsigned int, unsigned int>, pair<unsigned int, unsigned int> >& hr) { unsigned int rowStart = min(hr.first.first, (unsigned int)hr.second.first); hr.first.first -= rowStart; hr.second.first -= rowStart; //bool isLegalBandWidth = (s2.length() - hr.QueryBegin) > (mBandwidth / 2); // isLegalBandWidth = isLegalBandWidth && ((s1.length() - hr.Begin) > (mBandwidth / 2)); // check the lengths of the input sequences //if( (s1.length() <= 0) || (s2.length() <= 0) || (s1.length() < s2.length()) ) { // printf("ERROR: An unexpected sequence length was encountered during pairwise alignment.\n"); // printf("Sequence lengths are listed as following:\n"); // printf("1. Reference length: %u\n2. Query length: %u\n", s1.length(), s2.length()); //printf("3. Hash region in reference:%4u-%4u\n", hr.Begin + rowStart, hr.End); //printf("4. Hash region in query: %4u-%4u\n", hr.QueryBegin + rowStart, hr.QueryEnd); // exit(1); //} // determine the hash region type unsigned int rowOffset; unsigned int columnOffset; PositionType positionType; if(hr.first.first == 0) { if(hr.second.first == 0) { rowOffset = 1; columnOffset = (mBandwidth / 2) + 1; positionType = Position_REF_AND_QUERY_ZERO; } else { rowOffset = 1 - hr.second.first; columnOffset = (mBandwidth / 2) + 1 + hr.second.first; positionType = Position_REF_ZERO; } } else { if(hr.second.first == 0) { rowOffset = 1; columnOffset = (mBandwidth / 2) + 1 - hr.first.first; positionType = Position_QUERY_ZERO; } else { rowOffset = 1 - hr.second.first; columnOffset = (mBandwidth / 2) + 1 + hr.second.first - hr.first.first; positionType = Position_REF_AND_QUERO_NONZERO; } } // ========================= // Reinitialize the matrices // ========================= ReinitializeMatrices(positionType, s1.length(), s2.length(), hr); // ======================================= // Banded Smith-Waterman forward algorithm // ======================================= unsigned int bestColumn = 0; unsigned int bestRow = 0; float bestScore = FLOAT_NEGATIVE_INFINITY; float currentQueryGapScore; // rowNum and column indicate the row and column numbers in the Smith-Waterman matrix respectively unsigned int rowNum = hr.second.first; unsigned int columnNum = hr.first.first; // indicates how many rows including blank elements in the Banded SmithWaterman int numBlankElements = (mBandwidth / 2) - columnNum; //cout << numBlankElements << endl; // upper triangle matrix in Banded Smith-Waterman for( ; numBlankElements > 0; numBlankElements--, rowNum++){ // in the upper triangle matrix, we always start at the 0th column columnNum = 0; // columnEnd indicates how many columns which should be dealt with in the current row unsigned int columnEnd = min((mBandwidth - numBlankElements), ((unsigned int) s1.length() - columnNum + 1) ); currentQueryGapScore = FLOAT_NEGATIVE_INFINITY; for( unsigned int j = 0; j < columnEnd; j++){ float score = CalculateScore(s1, s2, rowNum, columnNum, currentQueryGapScore, rowOffset, columnOffset); //cout << s1[columnNum] << s2[rowNum] << score << endl; UpdateBestScore(bestRow, bestColumn, bestScore, rowNum, columnNum, score); columnNum++; } // replace the columnNum to the middle column in the Smith-Waterman matrix columnNum = columnNum - (mBandwidth / 2); } // complete matrix in Banded Smith-Waterman unsigned int completeNum = min((s1.length() - columnNum - (mBandwidth / 2)), (s2.length() - rowNum)); //cout << completeNum << endl; for(unsigned int i = 0; i < completeNum; i++, rowNum++){ columnNum = columnNum - (mBandwidth / 2); // there are mBandwidth columns which should be dealt with in each row currentQueryGapScore = FLOAT_NEGATIVE_INFINITY; for(unsigned int j = 0; j < mBandwidth; j++){ float score = CalculateScore(s1, s2, rowNum, columnNum, currentQueryGapScore, rowOffset, columnOffset); UpdateBestScore(bestRow, bestColumn, bestScore, rowNum, columnNum, score); //cout << s1[columnNum] << s2[rowNum] << score << endl; columnNum++; } // replace the columnNum to the middle column in the Smith-Waterman matrix // because mBandwidth is an odd number, everytime the following equation shifts a column (pluses 1). columnNum = columnNum - (mBandwidth / 2); } // lower triangle matrix numBlankElements = min(mBandwidth, ((unsigned int) s2.length() - rowNum)); columnNum = columnNum - (mBandwidth / 2); for(unsigned int i = 0; numBlankElements > 0; i++, rowNum++, numBlankElements--) { mBestScores[ mBandwidth - i ] = FLOAT_NEGATIVE_INFINITY;; // columnEnd indicates how many columns which should be dealt with currentQueryGapScore = FLOAT_NEGATIVE_INFINITY; for( unsigned int j = columnNum; j < s1.length(); j++){ float score = CalculateScore(s1, s2, rowNum, columnNum, currentQueryGapScore, rowOffset, columnOffset); UpdateBestScore(bestRow, bestColumn, bestScore, rowNum, columnNum, score); //cout << s1[columnNum] << s2[rowNum] << score << endl; columnNum++; } // replace the columnNum to the middle column in the Smith-Waterman matrix columnNum = columnNum - mBandwidth + i + 2; } // ========================================= // Banded Smith-Waterman backtrace algorithm // ========================================= Traceback(referenceAl, cigarAl, s1, s2, bestRow, bestColumn, rowOffset, columnOffset); }