// Append msa2 at the end of msa1 void AppendMSA(MSA &msa1, const MSA &msa2) { const unsigned uSeqCount = msa1.GetSeqCount(); const unsigned uColCount1 = msa1.GetColCount(); const unsigned uColCount2 = msa2.GetColCount(); const unsigned uColCountCat = uColCount1 + uColCount2; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { unsigned uId = msa1.GetSeqId(uSeqIndex); unsigned uSeqIndex2; bool bFound = msa2.GetSeqIndex(uId, &uSeqIndex2); if (bFound) { for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex) { const char c = msa2.GetChar(uSeqIndex2, uColIndex); msa1.SetChar(uSeqIndex, uColCount1 + uColIndex, c); } } else { for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex) msa1.SetChar(uSeqIndex, uColCount1 + uColIndex, '-'); } } }
// Align two MSAs profile-to-profile.
//
// A profile is built from each input with ProfileFromMSA, the profiles are
// globally aligned with GlobalAlign (which fills in Path), and msaOut is
// built from the inputs and that path by AlignTwoMSAsGivenPath.
//
//   bLockLeft   set the gap-open score of the first position of both
//               profiles to MINUS_INFINITY before aligning.
//   bLockRight  set the gap-close score of the last position of both
//               profiles to MINUS_INFINITY before aligning.
//
// Returns the score reported by GlobalAlign.
SCORE AlignTwoMSAs(const MSA &msa1, const MSA &msa2, MSA &msaOut, PWPath &Path,
  bool bLockLeft, bool bLockRight)
{
    const unsigned uLengthA = msa1.GetColCount();
    const unsigned uLengthB = msa2.GetColCount();

    ProfPos *PA = ProfileFromMSA(msa1);
    ProfPos *PB = ProfileFromMSA(msa2);

    // The length checks keep us from indexing position 0 (or, through
    // unsigned wraparound, position UINT_MAX) of an empty profile.
    if (bLockLeft)
    {
        if (uLengthA > 0)
            PA[0].m_scoreGapOpen = MINUS_INFINITY;
        if (uLengthB > 0)
            PB[0].m_scoreGapOpen = MINUS_INFINITY;
    }

    if (bLockRight)
    {
        if (uLengthA > 0)
            PA[uLengthA - 1].m_scoreGapClose = MINUS_INFINITY;
        if (uLengthB > 0)
            PB[uLengthB - 1].m_scoreGapClose = MINUS_INFINITY;
    }

    const SCORE Score = GlobalAlign(PA, uLengthA, PB, uLengthB, Path);

    AlignTwoMSAsGivenPath(Path, msa1, msa2, msaOut);

    // The profiles are owned by this function.
    delete[] PA;
    delete[] PB;

    return Score;
}
// Align msa1 against msa2 profile-to-profile and write the merged
// alignment to msaOut.
//
// Profiles are built with ProfileFromMSALocal, aligned with AlignTwoProfs
// (both given weight 1.0), and the resulting path is applied to the two
// input alignments by AlignTwoMSAsGivenPath. The merged profile produced
// by AlignTwoProfs is not used and is freed here together with the inputs.
void ProfileProfile(MSA &msa1, MSA &msa2, MSA &msaOut)
{
    const unsigned uColsFirst = msa1.GetColCount();
    const unsigned uColsSecond = msa2.GetColCount();

    Tree treeFirst;
    Tree treeSecond;
    ProfPos *ptrProfFirst = ProfileFromMSALocal(msa1, treeFirst);
    ProfPos *ptrProfSecond = ProfileFromMSALocal(msa2, treeSecond);

    PWPath Path;
    ProfPos *ptrProfMerged;
    unsigned uColsMerged;

    Progress("Aligning profiles");
    AlignTwoProfs(ptrProfFirst, uColsFirst, 1.0,
      ptrProfSecond, uColsSecond, 1.0,
      Path, &ptrProfMerged, &uColsMerged);

    Progress("Building output");
    AlignTwoMSAsGivenPath(Path, msa1, msa2, msaOut);

    delete[] ptrProfFirst;
    delete[] ptrProfSecond;
    delete[] ptrProfMerged;
}
void FindAnchorColsPP(const MSA &msa1, const MSA &msa2, unsigned AnchorCols[], unsigned *ptruAnchorColCount) { const unsigned uColCount = msa1.GetColCount(); if( uColCount != msa2.GetColCount() ) { *ptruAnchorColCount = 0; return; // the profiles must have equal length to find anchor cols } SCORE *MatchScore = new SCORE[uColCount]; SCORE *SmoothScore = new SCORE[uColCount]; unsigned *BestCols = new unsigned[uColCount]; LetterObjScoreXP(msa1, msa2, MatchScore); g_uSmoothWindowLength.get() = 21; // this is better for DNA g_uAnchorSpacing.get() = 96; WindowSmooth(MatchScore, uColCount, g_uSmoothWindowLength.get(), SmoothScore, g_dSmoothScoreCeil.get()); unsigned uBestColCount; // FindBestColsGrade(SmoothScore,uColCount,.85,BestCols,&uBestColCount); FindBestColsComboPP(uColCount, MatchScore, SmoothScore, g_dMinBestColScore.get(), g_dMinSmoothScore.get(), BestCols, &uBestColCount); /* std::cerr << "found " << uBestColCount << " anchor cols:\n"; for( size_t colI = 0; colI < uBestColCount; colI++ ) { if( colI > 0 ) std::cerr << ", "; std::cerr << BestCols[colI]; } std::cerr << std::endl; */ #if TRACE ListBestCols(msa, MatchScore, SmoothScore, BestCols, uBestColCount); #endif MergeBestCols(MatchScore, BestCols, uBestColCount, g_uAnchorSpacing.get(), AnchorCols, ptruAnchorColCount); /* std::cerr << "\n\nafter merging, have " << *ptruAnchorColCount << " anchor cols:\n"; for( size_t colI = 0; colI < *ptruAnchorColCount; colI++ ) { if( colI > 0 ) std::cerr << ", "; std::cerr << AnchorCols[colI]; } std::cerr << std::endl; */ delete[] MatchScore; delete[] SmoothScore; delete[] BestCols; }
// The XP score is the sum of the score of each pair of // sequences between two profiles which are aligned to // each other. Notice that for two given profiles aligned // in different ways, the difference in XP score must be // the same as the difference in SP score because the // score of a pair of sequences in one profile doesn't // depend on the alignment. SCORE ObjScoreXP(const MSA &msa1, const MSA &msa2) { const unsigned uColCount1 = msa1.GetColCount(); const unsigned uColCount2 = msa2.GetColCount(); if (uColCount1 != uColCount2) Quit("ObjScoreXP, alignment lengths differ %u %u", uColCount1, uColCount2); const unsigned uSeqCount1 = msa1.GetSeqCount(); const unsigned uSeqCount2 = msa2.GetSeqCount(); #if TRACE Log(" Score Weight Weight Total\n"); Log("---------- ------ ------ ----------\n"); #endif SCORE scoreTotal = 0; unsigned uPairCount = 0; for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount1; ++uSeqIndex1) { const WEIGHT w1 = msa1.GetSeqWeight(uSeqIndex1); for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqCount2; ++uSeqIndex2) { const WEIGHT w2 = msa2.GetSeqWeight(uSeqIndex2); const WEIGHT w = w1*w2; SCORE scoreLetters = ScoreSeqPairLetters(msa1, uSeqIndex1, msa2, uSeqIndex2); SCORE scoreGaps = ScoreSeqPairGaps(msa1, uSeqIndex1, msa2, uSeqIndex2); SCORE scorePair = scoreLetters + scoreGaps; scoreTotal += w1*w2*scorePair; ++uPairCount; #if TRACE Log("%10.2f %6.3f %6.3f %10.2f >%s >%s\n", scorePair, w1, w2, scorePair*w1*w2, msa1.GetSeqName(uSeqIndex1), msa2.GetSeqName(uSeqIndex2)); #endif } } if (0 == uPairCount) Quit("0 == uPairCount"); #if TRACE Log("msa1=\n"); msa1.LogMe(); Log("msa2=\n"); msa2.LogMe(); Log("XP=%g\n", scoreTotal); #endif // return scoreTotal / uPairCount; return scoreTotal; }
// Objective score defined as the dynamic programming score. // Input is two alignments, which must be of the same length. // Result is the same profile-profile score that is optimized // by dynamic programming. SCORE ObjScoreDP(const MSA &msa1, const MSA &msa2, SCORE MatchScore[]) { const unsigned uColCount = msa1.GetColCount(); if (msa2.GetColCount() != uColCount) Quit("ObjScoreDP, must be same length"); const unsigned uColCount1 = msa1.GetColCount(); const unsigned uColCount2 = msa2.GetColCount(); const ProfPos *PA = ProfileFromMSA(msa1); const ProfPos *PB = ProfileFromMSA(msa2); return ObjScoreDP_Profs(PA, PB, uColCount1, MatchScore); }
static void AppendUnalignedTerminals(const MSA &msaA, unsigned &uColIndexA, unsigned uColCountA, const MSA &msaB, unsigned &uColIndexB, unsigned uColCountB, unsigned uSeqCountA, unsigned uSeqCountB, MSA &msaCombined, unsigned &uColIndexCombined) { #if TRACE Log("AppendUnalignedTerminals ColIxA=%u ColIxB=%u ColIxCmb=%u\n", uColIndexA, uColIndexB, uColIndexCombined); #endif const unsigned uLengthA = msaA.GetColCount(); const unsigned uLengthB = msaB.GetColCount(); unsigned uNewColCount = uColCountA; if (uColCountB > uNewColCount) uNewColCount = uColCountB; for (unsigned n = 0; n < uColCountA; ++n) { for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA) { char c = msaA.GetChar(uSeqIndexA, uColIndexA + n); c = UnalignChar(c); msaCombined.SetChar(uSeqIndexA, uColIndexCombined + n, c); } } for (unsigned n = uColCountA; n < uNewColCount; ++n) { for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA) msaCombined.SetChar(uSeqIndexA, uColIndexCombined + n, '.'); } for (unsigned n = 0; n < uColCountB; ++n) { for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB) { char c = msaB.GetChar(uSeqIndexB, uColIndexB + n); c = UnalignChar(c); msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined + n, c); } } for (unsigned n = uColCountB; n < uNewColCount; ++n) { for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB) msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined + n, '.'); } uColIndexCombined += uNewColCount; uColIndexA += uColCountA; uColIndexB += uColCountB; }
// Return true if the given column has no gaps and all // its residues are in the same biochemical group. bool MSAColIsConservative(const MSA &msa, unsigned uColIndex) { extern unsigned ResidueGroup[]; const unsigned uSeqCount = msa.GetColCount(); if (0 == uSeqCount) Quit("MSAColIsConservative: empty alignment"); if (msa.IsGap(0, uColIndex)) return false; unsigned uLetter = msa.GetLetterEx(0, uColIndex); // cppcheck-suppress uninitvar const unsigned uGroup = ResidueGroup[uLetter]; for (unsigned uSeqIndex = 1; uSeqIndex < uSeqCount; ++uSeqIndex) { if (msa.IsGap(uSeqIndex, uColIndex)) return false; uLetter = msa.GetLetter(uSeqIndex, uColIndex); if (ResidueGroup[uLetter] != uGroup) return false; } return true; }
void FindAnchorCols(const MSA &msa, unsigned AnchorCols[], unsigned *ptruAnchorColCount) { MuscleContext *ctx = getMuscleContext(); const unsigned uColCount = msa.GetColCount(); if (uColCount < 16) { *ptruAnchorColCount = 0; return; } SCORE *MatchScore = new SCORE[uColCount]; SCORE *SmoothScore = new SCORE[uColCount]; unsigned *BestCols = new unsigned[uColCount]; GetLetterScores(msa, MatchScore); static_WindowSmooth(MatchScore, uColCount, ctx->params.g_uSmoothWindowLength, SmoothScore, ctx->params.g_dSmoothScoreCeil); unsigned uBestColCount; FindBestColsCombo(msa, MatchScore, SmoothScore, ctx->params.g_dMinBestColScore, ctx->params.g_dMinSmoothScore, BestCols, &uBestColCount); #if TRACE ListBestCols(msa, MatchScore, SmoothScore, BestCols, uBestColCount); #endif MergeBestCols(MatchScore, BestCols, uBestColCount, ctx->params.g_uAnchorSpacing, AnchorCols, ptruAnchorColCount); delete[] MatchScore; delete[] SmoothScore; delete[] BestCols; }
// Undo the "M hack" on a finished alignment.
//
// Does nothing unless the alphabet is amino acid and the global flag
// array M is set. For every sequence whose id is flagged in M, the first
// non-gap character of its row is overwritten with 'M'. The flag array is
// then freed and reset to null.
void MHackEnd(MSA &msa)
{
    if (ALPHA_Amino != g_Alpha)
        return;
    if (0 == M)
        return;

    const unsigned nRows = msa.GetSeqCount();
    const unsigned nCols = msa.GetColCount();

    for (unsigned row = 0; row < nRows; ++row)
    {
        const unsigned uId = msa.GetSeqId(row);
        if (!M[uId])
            continue;

        // Skip leading gaps to find the first residue of this row.
        unsigned col = 0;
        while (col < nCols && msa.IsGap(row, col))
            ++col;

        if (col < nCols)
            msa.SetChar(row, col, 'M');
    }

    delete[] M;
    M = 0;
}
// Remove every column for which IsGapColumn is true.
//
// Surviving columns are compacted towards the left in place, then the
// leftover columns at the right-hand end are removed with a single
// DeleteColumns call.
void StripGapColumns( MSA& msa )
{
    const unsigned nCols = msa.GetColCount();
    const unsigned nRows = msa.GetSeqCount();

    // Next free destination column; never ahead of the column being read.
    unsigned uDest = 0;
    for( unsigned uSrc = 0; uSrc < nCols; ++uSrc )
    {
        if( msa.IsGapColumn(uSrc) )
            continue;

        for( unsigned row = 0; row < nRows; ++row )
        {
            const char c = msa.GetChar(row, uSrc);
            msa.SetChar(row, uDest, c);
        }
        ++uDest;
    }

    msa.DeleteColumns(uDest, msa.GetColCount() - uDest);
}
void MSAFromColRange(const MSA &msaIn, unsigned uFromColIndex, unsigned uColCount, MSA &msaOut) { const unsigned uSeqCount = msaIn.GetSeqCount(); const unsigned uInColCount = msaIn.GetColCount(); if (uFromColIndex + uColCount - 1 > uInColCount) Quit("MSAFromColRange, out of bounds"); msaOut.SetSize(uSeqCount, uColCount); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { const char *ptrName = msaIn.GetSeqName(uSeqIndex); unsigned uId = msaIn.GetSeqId(uSeqIndex); msaOut.SetSeqName(uSeqIndex, ptrName); msaOut.SetSeqId(uSeqIndex, uId); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { const char c = msaIn.GetChar(uSeqIndex, uFromColIndex + uColIndex); msaOut.SetChar(uSeqIndex, uColIndex, c); } } }
// Append msa2 at the end of msa1 void MSAAppend(MSA &msa1, const MSA &msa2) { const unsigned uSeqCount = msa1.GetSeqCount(); const unsigned uColCount1 = msa1.GetColCount(); const unsigned uColCount2 = msa2.GetColCount(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { unsigned uId = msa1.GetSeqId(uSeqIndex); unsigned uSeqIndex2 = msa2.GetSeqIndex(uId); for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex) { const char c = msa2.GetChar(uSeqIndex2, uColIndex); msa1.SetChar(uSeqIndex, uColCount1 + uColIndex, c); } } }
// Copy a MUSCLE MSA into a MultipleSequenceAlignment.
//
//   msa  source alignment.
//   al   alphabet assigned to the result.
//   res  destination; asserted to be empty on entry.
//
// Each row of msa is appended to res under its sequence name. In
// parallel, ctx->output_uIds is cleared and refilled with
// ctx->tmp_uIds[id] for the id of every row, in row order.
void convertMSA2MAlignment(MSA& msa, const DNAAlphabet* al, MultipleSequenceAlignment& res) {
    assert(res->isEmpty());

    MuscleContext *ctx = getMuscleContext();
    res->setAlphabet(al);
    ctx->output_uIds.clear();

    const int rowCount = msa.GetSeqCount();
    for (int row = 0; row < rowCount; ++row) {
        QString rowName = msa.GetSeqName(row);

        // Build the row's characters one column at a time.
        QByteArray rowBytes;
        rowBytes.reserve(msa.GetColCount());
        const int colCount = msa.GetColCount();
        for (int col = 0; col < colCount; ++col) {
            char c = msa.GetChar(row, col);
            rowBytes.append(c);
        }

        ctx->output_uIds.append(ctx->tmp_uIds[msa.GetSeqId(row)]);
        res->addRow(rowName, rowBytes);
    }
}
// Objective score defined as the sum of profile-sequence // scores for each sequence in the alignment. The profile // is computed from the entire alignment, so this includes // the score of each sequence against itself. This is to // avoid recomputing the profile each time, so we reduce // complexity but introduce a questionable approximation. // The goal is to see if we can exploit the apparent // improvement in performance of log-expectation score // over the usual sum-of-pairs by optimizing this // objective score in the iterative refinement stage. SCORE ObjScorePS(const MSA &msa, SCORE MatchScore[]) { if (g_PPScore != PPSCORE_LE) Quit("FastScoreMSA_LASimple: LA"); const unsigned uSeqCount = msa.GetSeqCount(); const unsigned uColCount = msa.GetColCount(); const ProfPos *Prof = ProfileFromMSA(msa); if (0 != MatchScore) for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) MatchScore[uColIndex] = 0; SCORE scoreTotal = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { const WEIGHT weightSeq = msa.GetSeqWeight(uSeqIndex); SCORE scoreSeq = 0; for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { const ProfPos &PP = Prof[uColIndex]; if (msa.IsGap(uSeqIndex, uColIndex)) { bool bOpen = (0 == uColIndex || !msa.IsGap(uSeqIndex, uColIndex - 1)); bool bClose = (uColCount - 1 == uColIndex || !msa.IsGap(uSeqIndex, uColIndex + 1)); if (bOpen) scoreSeq += PP.m_scoreGapOpen; if (bClose) scoreSeq += PP.m_scoreGapClose; //if (!bOpen && !bClose) // scoreSeq += PP.m_scoreGapExtend; } else if (msa.IsWildcard(uSeqIndex, uColIndex)) continue; else { unsigned uLetter = msa.GetLetter(uSeqIndex, uColIndex); const SCORE scoreMatch = PP.m_AAScores[uLetter]; if (0 != MatchScore) MatchScore[uColIndex] += weightSeq*scoreMatch; scoreSeq += scoreMatch; } } scoreTotal += weightSeq*scoreSeq; } delete[] Prof; return scoreTotal; }
// Delete every column for which IsGapColumn is true.
//
// The column count is re-read on every pass because it shrinks as columns
// are deleted; the index only advances past columns that are kept.
void DeleteGappedCols(MSA &msa)
{
    unsigned col = 0;
    while (col < msa.GetColCount())
    {
        if (!msa.IsGapColumn(col))
        {
            ++col;
            continue;
        }
        msa.DeleteCol(col);
    }
}
// "Catenate" two MSAs (by bad analogy with UNIX cat command). // msa1 and msa2 must have same sequence names, but possibly // in a different order. // msaCat is the combined alignment produce by appending // sequences in msa2 to sequences in msa1. void MSACat(const MSA &msa1, const MSA &msa2, MSA &msaCat) { const unsigned uSeqCount = msa1.GetSeqCount(); const unsigned uColCount1 = msa1.GetColCount(); const unsigned uColCount2 = msa2.GetColCount(); const unsigned uColCountCat = uColCount1 + uColCount2; msaCat.SetSize(uSeqCount, uColCountCat); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { for (unsigned uColIndex = 0; uColIndex < uColCount1; ++uColIndex) { const char c = msa1.GetChar(uSeqIndex, uColIndex); msaCat.SetChar(uSeqIndex, uColIndex, c); } const char *ptrSeqName = msa1.GetSeqName(uSeqIndex); unsigned uSeqIndex2; msaCat.SetSeqName(uSeqIndex, ptrSeqName); bool bFound = msa2.GetSeqIndex(ptrSeqName, &uSeqIndex2); if (bFound) { for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex) { const char c = msa2.GetChar(uSeqIndex2, uColIndex); msaCat.SetChar(uSeqIndex, uColCount1 + uColIndex, c); } } else { for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex) msaCat.SetChar(uSeqIndex, uColCount1 + uColIndex, '-'); } } }
// Sum ScoreColLetters over a list of columns.
//
//   Edges       column indexes to score; each is asserted to be a valid
//               column of msa.
//   uEdgeCount  number of entries in Edges.
static SCORE ScoreLetters(const MSA &msa, const unsigned Edges[], unsigned uEdgeCount)
{
    const unsigned nCols = msa.GetColCount();

    // Letters
    SCORE Sum = 0;
    unsigned i = 0;
    while (i < uEdgeCount)
    {
        const unsigned col = Edges[i];
        assert(col < nCols);
        Sum += ScoreColLetters(msa, col);
        ++i;
    }
    return Sum;
}
// Build msaOut from uSeqCount consecutive rows of msaIn, starting at row
// uFromSeqIndex. All columns are kept. Names and characters are copied;
// sequence ids are not.
void MSAFromSeqRange(const MSA &msaIn, unsigned uFromSeqIndex, unsigned uSeqCount,
  MSA &msaOut)
{
    const unsigned nCols = msaIn.GetColCount();
    msaOut.SetSize(uSeqCount, nCols);

    for (unsigned rowOut = 0; rowOut < uSeqCount; ++rowOut)
    {
        const unsigned rowIn = uFromSeqIndex + rowOut;

        msaOut.SetSeqName(rowOut, msaIn.GetSeqName(rowIn));

        for (unsigned col = 0; col < nCols; ++col)
        {
            const char ch = msaIn.GetChar(rowIn, col);
            msaOut.SetChar(rowOut, col, ch);
        }
    }
}
void Stabilize(const MSA &msa, MSA &msaStable) { const unsigned uSeqCount = msa.GetSeqCount(); const unsigned uColCount = msa.GetColCount(); msaStable.SetSize(uSeqCount, uColCount); for (unsigned uId = 0; uId < uSeqCount; ++uId) { const unsigned uSeqIndex = msa.GetSeqIndex(uId); msaStable.SetSeqName(uId, msa.GetSeqName(uSeqIndex)); msaStable.SetSeqId(uSeqIndex, uId); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { const char c = msa.GetChar(uSeqIndex, uColIndex); msaStable.SetChar(uId, uColIndex, c); } } }
void MSAFromSeqSubset(const MSA &msaIn, const unsigned uSeqIndexes[], unsigned uSeqCount, MSA &msaOut) { const unsigned uColCount = msaIn.GetColCount(); msaOut.SetSize(uSeqCount, uColCount); for (unsigned uSeqIndexOut = 0; uSeqIndexOut < uSeqCount; ++uSeqIndexOut) { unsigned uSeqIndexIn = uSeqIndexes[uSeqIndexOut]; const char *ptrName = msaIn.GetSeqName(uSeqIndexIn); unsigned uId = msaIn.GetSeqId(uSeqIndexIn); msaOut.SetSeqName(uSeqIndexOut, ptrName); msaOut.SetSeqId(uSeqIndexOut, uId); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { const char c = msaIn.GetChar(uSeqIndexIn, uColIndex); msaOut.SetChar(uSeqIndexOut, uColIndex, c); } } }
// Similarity score static double Sigma(const MSA &msa, unsigned SeqIndex1, unsigned SeqIndex2, unsigned *ptrLength) { unsigned Length = 0; double Score = 0; const unsigned ColCount = msa.GetColCount(); for (unsigned ColIndex = 0; ColIndex < ColCount; ++ColIndex) { unsigned Letter1 = msa.GetLetterEx(SeqIndex1, ColIndex); unsigned Letter2 = msa.GetLetterEx(SeqIndex2, ColIndex); if (Letter1 >= 20 || Letter2 >= 20) continue; ++Length; Score += BLOSUM62[Letter1][Letter2]; } *ptrLength = Length; return Score; }
// Best col only if all following criteria satisfied: // (1) Score >= min // (2) Smoothed score >= min // (3) No gaps. static void FindBestColsCombo(const MSA &msa, const SCORE Score[], const SCORE SmoothScore[], double dMinScore, double dMinSmoothScore, unsigned BestCols[], unsigned *ptruBestColCount) { const unsigned uColCount = msa.GetColCount(); unsigned uBestColCount = 0; for (unsigned uIndex = 0; uIndex < uColCount; ++uIndex) { if (Score[uIndex] < dMinScore) continue; if (SmoothScore[uIndex] < dMinSmoothScore) continue; if (msa.ColumnHasGap(uIndex)) continue; BestCols[uBestColCount] = uIndex; ++uBestColCount; } *ptruBestColCount = uBestColCount; }
void WriteScoreFile(const MSA &msa) { MuscleContext *ctx = getMuscleContext(); FILE *f = fopen(ctx->params.g_pstrScoreFileName, "w"); if (0 == f) Quit("Cannot open score file '%s' errno=%d", ctx->params.g_pstrScoreFileName, errno); const unsigned uColCount = msa.GetColCount(); const unsigned uSeqCount = msa.GetSeqCount(); for (unsigned uCol = 0; uCol < uColCount; ++uCol) { double Score = GetColScore(msa, uCol); fprintf(f, "%10.3f ", Score); for (unsigned uSeq = 0; uSeq < uSeqCount; ++uSeq) { char c = msa.GetChar(uSeq, uCol); fprintf(f, "%c", c); } fprintf(f, "\n"); } fclose(f); }
// TODO: This could be much faster, no need to look // at all columns. static void FindIntersectingGaps(const MSA &msa, unsigned SeqIndex) { MuscleContext *ctx = getMuscleContext(); GAPINFO** &g_Gaps = ctx->scoregaps.g_Gaps; bool* &g_ColDiff = ctx->scoregaps.g_ColDiff; const unsigned ColCount = msa.GetColCount(); bool InGap = false; bool Intersects = false; unsigned Start = uInsane; for (unsigned Col = 0; Col <= ColCount; ++Col) { bool Gap = ((Col != ColCount) && msa.IsGap(SeqIndex, Col)); if (Gap) { if (!InGap) { InGap = true; Start = Col; } if (g_ColDiff[Col]) Intersects = true; } else if (InGap) { InGap = false; if (Intersects) { GAPINFO *GI = NewGapInfo(); GI->Start = Start; GI->End = Col - 1; GI->Next = g_Gaps[SeqIndex]; g_Gaps[SeqIndex] = GI; } Intersects = false; } } }
// Fill Scores[col] with ScoreColLetters(msa, col) for every column of msa.
// Scores must have room for at least GetColCount() entries.
void GetLetterScores(const MSA &msa, SCORE Scores[])
{
    const unsigned nCols = msa.GetColCount();
    unsigned col = 0;
    while (col < nCols)
    {
        Scores[col] = ScoreColLetters(msa, col);
        ++col;
    }
}
SCORE ScoreSeqPairGaps(const MSA &msa1, unsigned uSeqIndex1, const MSA &msa2, unsigned uSeqIndex2) { const unsigned uColCount = msa1.GetColCount(); const unsigned uColCount2 = msa2.GetColCount(); if (uColCount != uColCount2) Quit("ScoreSeqPairGaps, different lengths"); #if TRACE_SEQPAIR { Log("\n"); Log("ScoreSeqPairGaps\n"); MSA msaTmp; msaTmp.SetSize(2, uColCount); msaTmp.CopySeq(0, msa1, uSeqIndex1); msaTmp.CopySeq(1, msa2, uSeqIndex2); msaTmp.LogMe(); } #endif SCORE scoreGaps = 0; bool bGapping1 = false; bool bGapping2 = false; unsigned uColStart = 0; bool bLeftTermGap = false; for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex); bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex); if (!bGap1 || !bGap2) { if (bGap1 || bGap2) bLeftTermGap = true; uColStart = uColIndex; break; } } unsigned uColEnd = uColCount - 1; bool bRightTermGap = false; for (int iColIndex = (int) uColCount - 1; iColIndex >= 0; --iColIndex) { bool bGap1 = msa1.IsGap(uSeqIndex1, iColIndex); bool bGap2 = msa2.IsGap(uSeqIndex2, iColIndex); if (!bGap1 || !bGap2) { if (bGap1 || bGap2) bRightTermGap = true; uColEnd = (unsigned) iColIndex; break; } } #if TRACE_SEQPAIR Log("LeftTermGap=%d RightTermGap=%d\n", bLeftTermGap, bRightTermGap); #endif for (unsigned uColIndex = uColStart; uColIndex <= uColEnd; ++uColIndex) { bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex); bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex); if (bGap1 && bGap2) continue; if (bGap1) { if (!bGapping1) { #if TRACE_SEQPAIR Log("Gap open seq 1 col %d\n", uColIndex); #endif if (uColIndex == uColStart) scoreGaps += TermGapScore(true); else scoreGaps += g_scoreGapOpen; bGapping1 = true; } else scoreGaps += g_scoreGapExtend; continue; } else if (bGap2) { if (!bGapping2) { #if TRACE_SEQPAIR Log("Gap open seq 2 col %d\n", uColIndex); #endif if (uColIndex == uColStart) scoreGaps += TermGapScore(true); else scoreGaps += g_scoreGapOpen; bGapping2 = true; } else scoreGaps += 
g_scoreGapExtend; continue; } bGapping1 = false; bGapping2 = false; } if (bGapping1 || bGapping2) { scoreGaps -= g_scoreGapOpen; scoreGaps += TermGapScore(true); } return scoreGaps; }
SCORE ScoreSeqPairLetters(const MSA &msa1, unsigned uSeqIndex1, const MSA &msa2, unsigned uSeqIndex2) { const unsigned uColCount = msa1.GetColCount(); const unsigned uColCount2 = msa2.GetColCount(); if (uColCount != uColCount2) Quit("ScoreSeqPairLetters, different lengths"); #if TRACE_SEQPAIR { Log("\n"); Log("ScoreSeqPairLetters\n"); MSA msaTmp; msaTmp.SetSize(2, uColCount); msaTmp.CopySeq(0, msa1, uSeqIndex1); msaTmp.CopySeq(1, msa2, uSeqIndex2); msaTmp.LogMe(); } #endif SCORE scoreLetters = 0; SCORE scoreGaps = 0; bool bGapping1 = false; bool bGapping2 = false; unsigned uColStart = 0; bool bLeftTermGap = false; for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex); bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex); if (!bGap1 || !bGap2) { if (bGap1 || bGap2) bLeftTermGap = true; uColStart = uColIndex; break; } } unsigned uColEnd = uColCount - 1; bool bRightTermGap = false; for (int iColIndex = (int) uColCount - 1; iColIndex >= 0; --iColIndex) { bool bGap1 = msa1.IsGap(uSeqIndex1, iColIndex); bool bGap2 = msa2.IsGap(uSeqIndex2, iColIndex); if (!bGap1 || !bGap2) { if (bGap1 || bGap2) bRightTermGap = true; uColEnd = (unsigned) iColIndex; break; } } #if TRACE_SEQPAIR Log("LeftTermGap=%d RightTermGap=%d\n", bLeftTermGap, bRightTermGap); #endif for (unsigned uColIndex = uColStart; uColIndex <= uColEnd; ++uColIndex) { unsigned uLetter1 = msa1.GetLetterEx(uSeqIndex1, uColIndex); if (uLetter1 >= g_AlphaSize) continue; unsigned uLetter2 = msa2.GetLetterEx(uSeqIndex2, uColIndex); if (uLetter2 >= g_AlphaSize) continue; SCORE scoreMatch = (*g_ptrScoreMatrix)[uLetter1][uLetter2]; scoreLetters += scoreMatch; } return scoreLetters; }
// The usual sum-of-pairs objective score: sum the score // of the alignment of each pair of sequences. SCORE ObjScoreSP(const MSA &msa, SCORE MatchScore[]) { #if TRACE Log("==================ObjScoreSP==============\n"); Log("msa=\n"); msa.LogMe(); #endif g_SPScoreLetters = 0; g_SPScoreGaps = 0; if (0 != MatchScore) { const unsigned uColCount = msa.GetColCount(); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) MatchScore[uColIndex] = 0; } const unsigned uSeqCount = msa.GetSeqCount(); SCORE scoreTotal = 0; unsigned uPairCount = 0; #if TRACE Log("Seq1 Seq2 wt1 wt2 Letters Gaps Unwt.Score Wt.Score Total\n"); Log("---- ---- ------ ------ ---------- ---------- ---------- ---------- ----------\n"); #endif for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1) { const WEIGHT w1 = msa.GetSeqWeight(uSeqIndex1); for (unsigned uSeqIndex2 = uSeqIndex1 + 1; uSeqIndex2 < uSeqCount; ++uSeqIndex2) { const WEIGHT w2 = msa.GetSeqWeight(uSeqIndex2); const WEIGHT w = w1*w2; SCORE scoreLetters = ScoreSeqPairLetters(msa, uSeqIndex1, msa, uSeqIndex2); SCORE scoreGaps = ScoreSeqPairGaps(msa, uSeqIndex1, msa, uSeqIndex2); SCORE scorePair = scoreLetters + scoreGaps; ++uPairCount; scoreTotal += w*scorePair; g_SPScoreLetters += w*scoreLetters; g_SPScoreGaps += w*scoreGaps; #if TRACE Log("%4d %4d %6.3f %6.3f %10.2f %10.2f %10.2f %10.2f %10.2f >%s >%s\n", uSeqIndex1, uSeqIndex2, w1, w2, scoreLetters, scoreGaps, scorePair, scorePair*w1*w2, scoreTotal, msa.GetSeqName(uSeqIndex1), msa.GetSeqName(uSeqIndex2)); #endif } } #if TEST_SPFAST { SCORE f = ObjScoreSPFast(msa); Log("Fast = %.6g\n", f); Log("Brute = %.6g\n", scoreTotal); if (BTEq(f, scoreTotal)) Log("Agree\n"); else Log("** DISAGREE **\n"); } #endif // return scoreTotal / uPairCount; return scoreTotal; }
static SCORE ScoreSeqPair(const MSA &msa1, unsigned uSeqIndex1, const MSA &msa2, unsigned uSeqIndex2, SCORE *ptrLetters, SCORE *ptrGaps) { g_ptrMSA1.get() = &msa1; g_ptrMSA2.get() = &msa2; g_uSeqIndex1.get() = uSeqIndex1; g_uSeqIndex2.get() = uSeqIndex2; const unsigned uColCount = msa1.GetColCount(); const unsigned uColCount2 = msa2.GetColCount(); if (uColCount != uColCount2) Quit("ScoreSeqPair, different lengths"); #if TRACE Log("ScoreSeqPair\n"); Log("%16.16s ", msa1.GetSeqName(uSeqIndex1)); for (unsigned i = 0; i < uColCount; ++i) Log("%c", msa1.GetChar(uSeqIndex1, i)); Log("\n"); Log("%16.16s ", msa2.GetSeqName(uSeqIndex2)); for (unsigned i = 0; i < uColCount; ++i) Log("%c", msa1.GetChar(uSeqIndex2, i)); Log("\n"); #endif SCORE scoreTotal = 0; // Substitution scores unsigned uFirstLetter1 = uInsane; unsigned uFirstLetter2 = uInsane; unsigned uLastLetter1 = uInsane; unsigned uLastLetter2 = uInsane; for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex); bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex); bool bWildcard1 = msa1.IsWildcard(uSeqIndex1, uColIndex); bool bWildcard2 = msa2.IsWildcard(uSeqIndex2, uColIndex); if (!bGap1) { if (uInsane == uFirstLetter1) uFirstLetter1 = uColIndex; uLastLetter1 = uColIndex; } if (!bGap2) { if (uInsane == uFirstLetter2) uFirstLetter2 = uColIndex; uLastLetter2 = uColIndex; } if (bGap1 || bGap2 || bWildcard1 || bWildcard2) continue; unsigned uLetter1 = msa1.GetLetter(uSeqIndex1, uColIndex); unsigned uLetter2 = msa2.GetLetter(uSeqIndex2, uColIndex); SCORE scoreMatch = (*g_ptrScoreMatrix.get())[uLetter1][uLetter2]; scoreTotal += scoreMatch; #if TRACE Log("%c <-> %c = %7.1f %10.1f\n", msa1.GetChar(uSeqIndex1, uColIndex), msa2.GetChar(uSeqIndex2, uColIndex), scoreMatch, scoreTotal); #endif } *ptrLetters = scoreTotal; // Gap penalties unsigned uGapLength = uInsane; unsigned uGapStartCol = uInsane; bool bGapping1 = false; bool bGapping2 = false; for (unsigned uColIndex = 0; 
uColIndex < uColCount; ++uColIndex) { bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex); bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex); if (bGap1 && bGap2) continue; if (bGapping1) { if (bGap1) ++uGapLength; else { bGapping1 = false; bool bNTerm = (uFirstLetter2 == uGapStartCol); bool bCTerm = (uLastLetter2 + 1 == uColIndex); SCORE scoreGap = GapPenalty(uGapLength, bNTerm || bCTerm); scoreTotal += scoreGap; #if TRACE LogGap(uGapStartCol, uColIndex - 1, uGapLength, bNTerm, bCTerm); Log("GAP %7.1f %10.1f\n", scoreGap, scoreTotal); #endif } continue; } else { if (bGap1) { uGapStartCol = uColIndex; bGapping1 = true; uGapLength = 1; continue; } } if (bGapping2) { if (bGap2) ++uGapLength; else { bGapping2 = false; bool bNTerm = (uFirstLetter1 == uGapStartCol); bool bCTerm = (uLastLetter1 + 1 == uColIndex); SCORE scoreGap = GapPenalty(uGapLength, bNTerm || bCTerm); scoreTotal += scoreGap; #if TRACE LogGap(uGapStartCol, uColIndex - 1, uGapLength, bNTerm, bCTerm); Log("GAP %7.1f %10.1f\n", scoreGap, scoreTotal); #endif } } else { if (bGap2) { uGapStartCol = uColIndex; bGapping2 = true; uGapLength = 1; } } } if (bGapping1 || bGapping2) { SCORE scoreGap = GapPenalty(uGapLength, true); scoreTotal += scoreGap; #if TRACE LogGap(uGapStartCol, uColCount - 1, uGapLength, false, true); Log("GAP %7.1f %10.1f\n", scoreGap, scoreTotal); #endif } *ptrGaps = scoreTotal - *ptrLetters; return scoreTotal; }