void AlignTwoMSAsGivenPathSW(const PWPath &Path, const MSA &msaA, const MSA &msaB, MSA &msaCombined) { msaCombined.Clear(); #if TRACE Log("AlignTwoMSAsGivenPathSW\n"); Log("Template A:\n"); msaA.LogMe(); Log("Template B:\n"); msaB.LogMe(); #endif const unsigned uColCountA = msaA.GetColCount(); const unsigned uColCountB = msaB.GetColCount(); const unsigned uSeqCountA = msaA.GetSeqCount(); const unsigned uSeqCountB = msaB.GetSeqCount(); msaCombined.SetSeqCount(uSeqCountA + uSeqCountB); // Copy sequence names into combined MSA for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA) { msaCombined.SetSeqName(uSeqIndexA, msaA.GetSeqName(uSeqIndexA)); msaCombined.SetSeqId(uSeqIndexA, msaA.GetSeqId(uSeqIndexA)); } for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB) { msaCombined.SetSeqName(uSeqCountA + uSeqIndexB, msaB.GetSeqName(uSeqIndexB)); msaCombined.SetSeqId(uSeqCountA + uSeqIndexB, msaB.GetSeqId(uSeqIndexB)); } unsigned uColIndexA = 0; unsigned uColIndexB = 0; unsigned uColIndexCombined = 0; const unsigned uEdgeCount = Path.GetEdgeCount(); for (unsigned uEdgeIndex = 0; uEdgeIndex < uEdgeCount; ++uEdgeIndex) { const PWEdge &Edge = Path.GetEdge(uEdgeIndex); #if TRACE Log("\nEdge %u %c%u.%u\n", uEdgeIndex, Edge.cType, Edge.uPrefixLengthA, Edge.uPrefixLengthB); #endif const char cType = Edge.cType; const unsigned uPrefixLengthA = Edge.uPrefixLengthA; unsigned uColCountA = 0; if (uPrefixLengthA > 0) { const unsigned uNodeIndexA = uPrefixLengthA - 1; const unsigned uTplColIndexA = uNodeIndexA; if (uTplColIndexA > uColIndexA) uColCountA = uTplColIndexA - uColIndexA; } const unsigned uPrefixLengthB = Edge.uPrefixLengthB; unsigned uColCountB = 0; if (uPrefixLengthB > 0) { const unsigned uNodeIndexB = uPrefixLengthB - 1; const unsigned uTplColIndexB = uNodeIndexB; if (uTplColIndexB > uColIndexB) uColCountB = uTplColIndexB - uColIndexB; } AppendUnalignedTerminals(msaA, uColIndexA, uColCountA, msaB, uColIndexB, uColCountB, uSeqCountA, uSeqCountB, 
msaCombined, uColIndexCombined); switch (cType) { case 'M': { assert(uPrefixLengthA > 0); assert(uPrefixLengthB > 0); const unsigned uColA = uPrefixLengthA - 1; const unsigned uColB = uPrefixLengthB - 1; assert(uColIndexA == uColA); assert(uColIndexB == uColB); AppendMatch(msaA, uColIndexA, msaB, uColIndexB, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined); break; } case 'D': { assert(uPrefixLengthA > 0); const unsigned uColA = uPrefixLengthA - 1; assert(uColIndexA == uColA); AppendDelete(msaA, uColIndexA, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined); break; } case 'I': { assert(uPrefixLengthB > 0); const unsigned uColB = uPrefixLengthB - 1; assert(uColIndexB == uColB); AppendInsert(msaB, uColIndexB, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined); break; } default: assert(false); } } unsigned uInsertColCountA = uColCountA - uColIndexA; unsigned uInsertColCountB = uColCountB - uColIndexB; AppendUnalignedTerminals(msaA, uColIndexA, uInsertColCountA, msaB, uColIndexB, uInsertColCountB, uSeqCountA, uSeqCountB, msaCombined, uColIndexCombined); }
void SPTest() { SetPPScore(PPSCORE_SV); SetListFileName("c:\\tmp\\muscle.log", false); TextFile file1("c:\\tmp\\msa1.afa"); TextFile file2("c:\\tmp\\msa2.afa"); MSA msa1; MSA msa2; msa1.FromFile(file1); msa2.FromFile(file2); Log("msa1=\n"); msa1.LogMe(); Log("msa2=\n"); msa2.LogMe(); const unsigned uColCount = msa1.GetColCount(); if (msa2.GetColCount() != uColCount) Quit("Different lengths"); const unsigned uSeqCount1 = msa1.GetSeqCount(); const unsigned uSeqCount2 = msa2.GetSeqCount(); const unsigned uSeqCount = uSeqCount1 + uSeqCount2; MSA::SetIdCount(uSeqCount); for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount1; ++uSeqIndex1) { msa1.SetSeqWeight(uSeqIndex1, 1.0); msa1.SetSeqId(uSeqIndex1, uSeqIndex1); } for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqCount2; ++uSeqIndex2) { msa2.SetSeqWeight(uSeqIndex2, 1.0); msa2.SetSeqId(uSeqIndex2, uSeqCount1 + uSeqIndex2); } MSA alnA; MSA alnB; char strPathA[1024]; char strPathB[1024]; MakePath(uColCount, INDELS, strPathA); MakePath(uColCount, INDELS, strPathB); PWPath PathA; PWPath PathB; PathA.FromStr(strPathA); PathB.FromStr(strPathB); Log("PathA=\n"); PathA.LogMe(); Log("PathB=\n"); PathB.LogMe(); AlignTwoMSAsGivenPath(PathA, msa1, msa2, alnA); AlignTwoMSAsGivenPath(PathB, msa1, msa2, alnB); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { alnA.SetSeqWeight(uSeqIndex, 1.0); alnB.SetSeqWeight(uSeqIndex, 1.0); } unsigned Seqs1[1024]; unsigned Seqs2[1024]; for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount1; ++uSeqIndex1) Seqs1[uSeqIndex1] = uSeqIndex1; for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqCount2; ++uSeqIndex2) Seqs2[uSeqIndex2] = uSeqCount1 + uSeqIndex2; MSA msaA1; MSA msaA2; MSA msaB1; MSA msaB2; MSAFromSeqSubset(alnA, Seqs1, uSeqCount1, msaA1); MSAFromSeqSubset(alnB, Seqs1, uSeqCount1, msaB1); MSAFromSeqSubset(alnA, Seqs2, uSeqCount2, msaA2); MSAFromSeqSubset(alnB, Seqs2, uSeqCount2, msaB2); for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount1; ++uSeqIndex1) { 
msaA1.SetSeqWeight(uSeqIndex1, 1.0); msaB1.SetSeqWeight(uSeqIndex1, 1.0); } for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqCount2; ++uSeqIndex2) { msaA2.SetSeqWeight(uSeqIndex2, 1.0); msaB2.SetSeqWeight(uSeqIndex2, 1.0); } Log("msaA1=\n"); msaA1.LogMe(); Log("msaB1=\n"); msaB1.LogMe(); Log("msaA2=\n"); msaA2.LogMe(); Log("msaB2=\n"); msaB2.LogMe(); Log("alnA=\n"); alnA.LogMe(); Log("AlnB=\n"); alnB.LogMe(); Log("\nSPA\n---\n"); SCORE SPA = ObjScoreSP(alnA); Log("\nSPB\n---\n"); SCORE SPB = ObjScoreSP(alnB); Log("\nXPA\n---\n"); SCORE XPA = ObjScoreXP(msaA1, msaA2); Log("\nXPB\n---\n"); SCORE XPB = ObjScoreXP(msaB1, msaB2); Log("SPA=%.4g SPB=%.4g Diff=%.4g\n", SPA, SPB, SPA - SPB); Log("XPA=%.4g XPB=%.4g Diff=%.4g\n", XPA, XPB, XPA - XPB); }
// Weighted sum, over all unordered sequence pairs of msa, of
// ScoreSeqPairGaps for that pair.
//
// Before scoring, the per-context scratch state in ctx->scoregaps is
// refreshed: the column count is recorded, the gap-list array and the
// "column differs" flag array are grown when too small (with 256 entries of
// slack) and zeroed, the columns listed in DiffCols are flagged, and
// FindIntersectingGaps is run for every sequence.
//
//   msa          alignment to score
//   DiffCols     column indexes to flag; each must be < msa.GetColCount()
//   DiffColCount number of entries in DiffCols
SCORE ScoreGaps(const MSA &msa, const unsigned DiffCols[], unsigned DiffColCount)
{
    MuscleContext *ctx = getMuscleContext();
    unsigned &rColCount = ctx->scoregaps.g_ColCount;
    unsigned &rMaxSeqCount = ctx->scoregaps.g_MaxSeqCount;
    unsigned &rMaxColCount = ctx->scoregaps.g_MaxColCount;
    GAPINFO** &rGaps = ctx->scoregaps.g_Gaps;
    bool* &rColDiff = ctx->scoregaps.g_ColDiff;

#if TRACE
    {
    Log("ScoreGaps\n");
    Log("DiffCols ");
    for (unsigned i = 0; i < DiffColCount; ++i)
        Log(" %u", DiffCols[i]);
    Log("\n");
    Log("msa=\n");
    msa.LogMe();
    Log("\n");
    }
#endif

    const unsigned uSeqs = msa.GetSeqCount();
    const unsigned uCols = msa.GetColCount();
    rColCount = uCols;

    // Grow the per-sequence gap-list array if needed, then clear the part in use.
    if (uSeqs > rMaxSeqCount)
    {
        delete[] rGaps;
        rMaxSeqCount = uSeqs + 256;
        rGaps = new GAPINFO *[rMaxSeqCount];
    }
    memset(rGaps, 0, uSeqs*sizeof(GAPINFO *));

    // Same for the per-column flag array.
    if (uCols > rMaxColCount)
    {
        delete[] rColDiff;
        rMaxColCount = uCols + 256;
        rColDiff = new bool[rMaxColCount];
    }
    memset(rColDiff, 0, rColCount*sizeof(bool));

    for (unsigned k = 0; k < DiffColCount; ++k)
    {
        const unsigned uDiffCol = DiffCols[k];
        assert(uDiffCol < uCols);
        rColDiff[uDiffCol] = true;
    }

    for (unsigned s = 0; s < uSeqs; ++s)
        FindIntersectingGaps(msa, s);

#if TRACE
    {
    Log("\n");
    Log("Intersecting gaps:\n");
    Log(" ");
    for (unsigned Col = 0; Col < uCols; ++Col)
        Log("%c", rColDiff[Col] ? '*' : ' ');
    Log("\n");
    Log(" ");
    for (unsigned Col = 0; Col < uCols; ++Col)
        Log("%d", Col%10);
    Log("\n");
    for (unsigned Seq = 0; Seq < uSeqs; ++Seq)
        {
        Log("%3d: ", Seq);
        for (unsigned Col = 0; Col < uCols; ++Col)
            Log("%c", msa.GetChar(Seq, Col));
        Log(" :: ");
        for (GAPINFO *GI = rGaps[Seq]; GI; GI = GI->Next)
            Log(" (%d,%d)", GI->Start, GI->End);
        Log(" >%s\n", msa.GetSeqName(Seq));
        }
    Log("\n");
    }
#endif

    SCORE Total = 0;
    for (unsigned Seq1 = 0; Seq1 < uSeqs; ++Seq1)
    {
        const WEIGHT w1 = msa.GetSeqWeight(Seq1);
        for (unsigned Seq2 = Seq1 + 1; Seq2 < uSeqs; ++Seq2)
        {
            const WEIGHT w2 = msa.GetSeqWeight(Seq2);
            const SCORE Pair = ScoreSeqPairGaps(msa, Seq1, msa, Seq2);
            Total += w1*w2*Pair;
#if TRACE
            Log("Seq1=%u Seq2=%u ScorePair=%.4g w1=%.4g w2=%.4g Sum=%.4g\n",
              Seq1, Seq2, Pair, w1, w2, Total);
#endif
        }
    }

    return Total;
}
// Return true if any changes made bool RefineVert(MSA &msaIn, const Tree &tree, unsigned uIters) { bool bAnyChanges = false; const unsigned uColCountIn = msaIn.GetColCount(); const unsigned uSeqCountIn = msaIn.GetSeqCount(); if (uColCountIn < 3 || uSeqCountIn < 3) return false; unsigned *AnchorCols = new unsigned[uColCountIn]; unsigned uAnchorColCount; SetMSAWeightsMuscle(msaIn); FindAnchorCols(msaIn, AnchorCols, &uAnchorColCount); const unsigned uRangeCount = uAnchorColCount + 1; Range *Ranges = new Range[uRangeCount]; #if TRACE Log("%u ranges\n", uRangeCount); #endif ColsToRanges(AnchorCols, uAnchorColCount, uColCountIn, Ranges); ListVertSavings(uColCountIn, uAnchorColCount, Ranges, uRangeCount); #if TRACE { Log("Anchor cols: "); for (unsigned i = 0; i < uAnchorColCount; ++i) Log(" %u", AnchorCols[i]); Log("\n"); Log("Ranges:\n"); for (unsigned i = 0; i < uRangeCount; ++i) Log("%4u - %4u\n", Ranges[i].m_uBestColLeft, Ranges[i].m_uBestColRight); } #endif delete[] AnchorCols; MSA msaOut; msaOut.SetSize(uSeqCountIn, 0); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCountIn; ++uSeqIndex) { const char *ptrName = msaIn.GetSeqName(uSeqIndex); unsigned uId = msaIn.GetSeqId(uSeqIndex); msaOut.SetSeqName(uSeqIndex, ptrName); msaOut.SetSeqId(uSeqIndex, uId); } for (unsigned uRangeIndex = 0; uRangeIndex < uRangeCount; ++uRangeIndex) { MSA msaRange; const Range &r = Ranges[uRangeIndex]; const unsigned uFromColIndex = r.m_uBestColLeft; const unsigned uRangeColCount = r.m_uBestColRight - uFromColIndex; if (0 == uRangeColCount) continue; else if (1 == uRangeColCount) { MSAFromColRange(msaIn, uFromColIndex, 1, msaRange); MSAAppend(msaOut, msaRange); continue; } MSAFromColRange(msaIn, uFromColIndex, uRangeColCount, msaRange); #if TRACE Log("\n-------------\n"); Log("Range %u - %u count=%u\n", r.m_uBestColLeft, r.m_uBestColRight, uRangeColCount); Log("Before:\n"); msaRange.LogMe(); #endif bool bLockLeft = (0 != uRangeIndex); bool bLockRight = (uRangeCount - 1 != uRangeIndex); bool 
bAnyChangesThisBlock = RefineHoriz(msaRange, tree, uIters, bLockLeft, bLockRight); bAnyChanges = (bAnyChanges || bAnyChangesThisBlock); #if TRACE Log("After:\n"); msaRange.LogMe(); #endif MSAAppend(msaOut, msaRange); #if TRACE Log("msaOut after Cat:\n"); msaOut.LogMe(); #endif } #if DEBUG // Sanity check AssertMSAEqIgnoreCaseAndGaps(msaIn, msaOut); #endif delete[] Ranges; if (bAnyChanges) msaIn.Copy(msaOut); return bAnyChanges; }
SCORE ScoreSeqPairLetters(const MSA &msa1, unsigned uSeqIndex1, const MSA &msa2, unsigned uSeqIndex2) { const unsigned uColCount = msa1.GetColCount(); const unsigned uColCount2 = msa2.GetColCount(); if (uColCount != uColCount2) Quit("ScoreSeqPairLetters, different lengths"); #if TRACE_SEQPAIR { Log("\n"); Log("ScoreSeqPairLetters\n"); MSA msaTmp; msaTmp.SetSize(2, uColCount); msaTmp.CopySeq(0, msa1, uSeqIndex1); msaTmp.CopySeq(1, msa2, uSeqIndex2); msaTmp.LogMe(); } #endif SCORE scoreLetters = 0; SCORE scoreGaps = 0; bool bGapping1 = false; bool bGapping2 = false; unsigned uColStart = 0; bool bLeftTermGap = false; for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex); bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex); if (!bGap1 || !bGap2) { if (bGap1 || bGap2) bLeftTermGap = true; uColStart = uColIndex; break; } } unsigned uColEnd = uColCount - 1; bool bRightTermGap = false; for (int iColIndex = (int) uColCount - 1; iColIndex >= 0; --iColIndex) { bool bGap1 = msa1.IsGap(uSeqIndex1, iColIndex); bool bGap2 = msa2.IsGap(uSeqIndex2, iColIndex); if (!bGap1 || !bGap2) { if (bGap1 || bGap2) bRightTermGap = true; uColEnd = (unsigned) iColIndex; break; } } #if TRACE_SEQPAIR Log("LeftTermGap=%d RightTermGap=%d\n", bLeftTermGap, bRightTermGap); #endif for (unsigned uColIndex = uColStart; uColIndex <= uColEnd; ++uColIndex) { unsigned uLetter1 = msa1.GetLetterEx(uSeqIndex1, uColIndex); if (uLetter1 >= g_AlphaSize) continue; unsigned uLetter2 = msa2.GetLetterEx(uSeqIndex2, uColIndex); if (uLetter2 >= g_AlphaSize) continue; SCORE scoreMatch = (*g_ptrScoreMatrix)[uLetter1][uLetter2]; scoreLetters += scoreMatch; } return scoreLetters; }
static void ProgressiveAlignSubfams(const Tree &tree, const unsigned Subfams[], unsigned uSubfamCount, const MSA SubfamMSAs[], MSA &msa) { const unsigned uNodeCount = tree.GetNodeCount(); bool *Ready = new bool[uNodeCount]; MSA **MSAs = new MSA *[uNodeCount]; for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { Ready[uNodeIndex] = false; MSAs[uNodeIndex] = 0; } for (unsigned uSubfamIndex = 0; uSubfamIndex < uSubfamCount; ++uSubfamIndex) { unsigned uNodeIndex = Subfams[uSubfamIndex]; Ready[uNodeIndex] = true; MSA *ptrMSA = new MSA; // TODO: Wasteful copy, needs re-design ptrMSA->Copy(SubfamMSAs[uSubfamIndex]); MSAs[uNodeIndex] = ptrMSA; } for (unsigned uNodeIndex = tree.FirstDepthFirstNode(); NULL_NEIGHBOR != uNodeIndex; uNodeIndex = tree.NextDepthFirstNode(uNodeIndex)) { if (tree.IsLeaf(uNodeIndex)) continue; unsigned uRight = tree.GetRight(uNodeIndex); unsigned uLeft = tree.GetLeft(uNodeIndex); if (!Ready[uRight] || !Ready[uLeft]) continue; MSA *ptrLeft = MSAs[uLeft]; MSA *ptrRight = MSAs[uRight]; assert(ptrLeft != 0 && ptrRight != 0); MSA *ptrParent = new MSA; PWPath Path; AlignTwoMSAs(*ptrLeft, *ptrRight, *ptrParent, Path); MSAs[uNodeIndex] = ptrParent; Ready[uNodeIndex] = true; Ready[uLeft] = false; Ready[uRight] = false; delete MSAs[uLeft]; delete MSAs[uRight]; MSAs[uLeft] = 0; MSAs[uRight] = 0; } #if DEBUG { unsigned uReadyCount = 0; for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { if (Ready[uNodeIndex]) { assert(tree.IsRoot(uNodeIndex)); ++uReadyCount; assert(0 != MSAs[uNodeIndex]); } else assert(0 == MSAs[uNodeIndex]); } assert(1 == uReadyCount); } #endif const unsigned uRoot = tree.GetRootNodeIndex(); MSA *ptrRootAlignment = MSAs[uRoot]; msa.Copy(*ptrRootAlignment); delete ptrRootAlignment; #if TRACE Log("After refine subfamilies, root alignment=\n"); msa.LogMe(); #endif }
// The usual sum-of-pairs objective score: sum the score // of the alignment of each pair of sequences. SCORE ObjScoreSP(const MSA &msa, SCORE MatchScore[]) { #if TRACE Log("==================ObjScoreSP==============\n"); Log("msa=\n"); msa.LogMe(); #endif g_SPScoreLetters = 0; g_SPScoreGaps = 0; if (0 != MatchScore) { const unsigned uColCount = msa.GetColCount(); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) MatchScore[uColIndex] = 0; } const unsigned uSeqCount = msa.GetSeqCount(); SCORE scoreTotal = 0; unsigned uPairCount = 0; #if TRACE Log("Seq1 Seq2 wt1 wt2 Letters Gaps Unwt.Score Wt.Score Total\n"); Log("---- ---- ------ ------ ---------- ---------- ---------- ---------- ----------\n"); #endif for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1) { const WEIGHT w1 = msa.GetSeqWeight(uSeqIndex1); for (unsigned uSeqIndex2 = uSeqIndex1 + 1; uSeqIndex2 < uSeqCount; ++uSeqIndex2) { const WEIGHT w2 = msa.GetSeqWeight(uSeqIndex2); const WEIGHT w = w1*w2; SCORE scoreLetters = ScoreSeqPairLetters(msa, uSeqIndex1, msa, uSeqIndex2); SCORE scoreGaps = ScoreSeqPairGaps(msa, uSeqIndex1, msa, uSeqIndex2); SCORE scorePair = scoreLetters + scoreGaps; ++uPairCount; scoreTotal += w*scorePair; g_SPScoreLetters += w*scoreLetters; g_SPScoreGaps += w*scoreGaps; #if TRACE Log("%4d %4d %6.3f %6.3f %10.2f %10.2f %10.2f %10.2f %10.2f >%s >%s\n", uSeqIndex1, uSeqIndex2, w1, w2, scoreLetters, scoreGaps, scorePair, scorePair*w1*w2, scoreTotal, msa.GetSeqName(uSeqIndex1), msa.GetSeqName(uSeqIndex2)); #endif } } #if TEST_SPFAST { SCORE f = ObjScoreSPFast(msa); Log("Fast = %.6g\n", f); Log("Brute = %.6g\n", scoreTotal); if (BTEq(f, scoreTotal)) Log("Agree\n"); else Log("** DISAGREE **\n"); } #endif // return scoreTotal / uPairCount; return scoreTotal; }
SCORE ScoreSeqPairGaps(const MSA &msa1, unsigned uSeqIndex1, const MSA &msa2, unsigned uSeqIndex2) { const unsigned uColCount = msa1.GetColCount(); const unsigned uColCount2 = msa2.GetColCount(); if (uColCount != uColCount2) Quit("ScoreSeqPairGaps, different lengths"); #if TRACE_SEQPAIR { Log("\n"); Log("ScoreSeqPairGaps\n"); MSA msaTmp; msaTmp.SetSize(2, uColCount); msaTmp.CopySeq(0, msa1, uSeqIndex1); msaTmp.CopySeq(1, msa2, uSeqIndex2); msaTmp.LogMe(); } #endif SCORE scoreGaps = 0; bool bGapping1 = false; bool bGapping2 = false; unsigned uColStart = 0; bool bLeftTermGap = false; for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex); bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex); if (!bGap1 || !bGap2) { if (bGap1 || bGap2) bLeftTermGap = true; uColStart = uColIndex; break; } } unsigned uColEnd = uColCount - 1; bool bRightTermGap = false; for (int iColIndex = (int) uColCount - 1; iColIndex >= 0; --iColIndex) { bool bGap1 = msa1.IsGap(uSeqIndex1, iColIndex); bool bGap2 = msa2.IsGap(uSeqIndex2, iColIndex); if (!bGap1 || !bGap2) { if (bGap1 || bGap2) bRightTermGap = true; uColEnd = (unsigned) iColIndex; break; } } #if TRACE_SEQPAIR Log("LeftTermGap=%d RightTermGap=%d\n", bLeftTermGap, bRightTermGap); #endif for (unsigned uColIndex = uColStart; uColIndex <= uColEnd; ++uColIndex) { bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex); bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex); if (bGap1 && bGap2) continue; if (bGap1) { if (!bGapping1) { #if TRACE_SEQPAIR Log("Gap open seq 1 col %d\n", uColIndex); #endif if (uColIndex == uColStart) scoreGaps += TermGapScore(true); else scoreGaps += g_scoreGapOpen; bGapping1 = true; } else scoreGaps += g_scoreGapExtend; continue; } else if (bGap2) { if (!bGapping2) { #if TRACE_SEQPAIR Log("Gap open seq 2 col %d\n", uColIndex); #endif if (uColIndex == uColStart) scoreGaps += TermGapScore(true); else scoreGaps += g_scoreGapOpen; bGapping2 = true; } else scoreGaps += 
g_scoreGapExtend; continue; } bGapping1 = false; bGapping2 = false; } if (bGapping1 || bGapping2) { scoreGaps -= g_scoreGapOpen; scoreGaps += TermGapScore(true); } return scoreGaps; }
/* a version of ScoreSeqPairGaps that computes a per-residue score */ SCORE ScoreSeqPairGaps(const MSA &msa1, unsigned uSeqIndex1, const MSA &msa2, unsigned uSeqIndex2, SCORE MatchScore[] ) { const unsigned uColCount = msa1.GetColCount(); const unsigned uColCount2 = msa2.GetColCount(); if (uColCount != uColCount2) Quit("ScoreSeqPairGaps, different lengths"); #if TRACE_SEQPAIR { Log("\n"); Log("ScoreSeqPairGaps\n"); MSA msaTmp; msaTmp.SetSize(2, uColCount); msaTmp.CopySeq(0, msa1, uSeqIndex1); msaTmp.CopySeq(1, msa2, uSeqIndex2); msaTmp.LogMe(); } #endif SCORE scoreGaps = 0; bool bGapping1 = false; bool bGapping2 = false; unsigned uColStart = 0; bool bLeftTermGap = false; for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex); bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex); if (!bGap1 || !bGap2) { if (bGap1 || bGap2) bLeftTermGap = true; uColStart = uColIndex; break; } } unsigned uColEnd = uColCount - 1; bool bRightTermGap = false; for (int iColIndex = (int) uColCount - 1; iColIndex >= 0; --iColIndex) { bool bGap1 = msa1.IsGap(uSeqIndex1, iColIndex); bool bGap2 = msa2.IsGap(uSeqIndex2, iColIndex); if (!bGap1 || !bGap2) { if (bGap1 || bGap2) bRightTermGap = true; uColEnd = (unsigned) iColIndex; break; } } #if TRACE_SEQPAIR Log("LeftTermGap=%d RightTermGap=%d\n", bLeftTermGap, bRightTermGap); #endif unsigned gap_left_col = 0; SCORE cur_gap_score = 0; for (unsigned uColIndex = uColStart; uColIndex <= uColEnd; ++uColIndex) { bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex); bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex); if (bGap1 && bGap2) continue; if (bGap1) { if (!bGapping1) { #if TRACE_SEQPAIR Log("Gap open seq 1 col %d\n", uColIndex); #endif gap_left_col = uColIndex; if (uColIndex == uColStart) { scoreGaps += TermGapScore(true); cur_gap_score += TermGapScore(true); }else{ scoreGaps += g_scoreGapOpen.get(); cur_gap_score += g_scoreGapOpen.get(); } bGapping1 = true; } else { scoreGaps += 
g_scoreGapExtend.get(); cur_gap_score += g_scoreGapExtend.get(); } continue; } else if (bGap2) { if (!bGapping2) { #if TRACE_SEQPAIR Log("Gap open seq 2 col %d\n", uColIndex); #endif gap_left_col = uColIndex; if (uColIndex == uColStart) { scoreGaps += TermGapScore(true); cur_gap_score += TermGapScore(true); }else{ scoreGaps += g_scoreGapOpen.get(); cur_gap_score += g_scoreGapOpen.get(); } bGapping2 = true; } else { scoreGaps += g_scoreGapExtend.get(); cur_gap_score += g_scoreGapExtend.get(); } continue; } if( MatchScore != NULL && (bGapping1 || bGapping2) ) { // spread the total gap penalty evenly across all columns SCORE per_site_penalty = cur_gap_score / (uColIndex-gap_left_col); for( unsigned uGapIndex = gap_left_col; uGapIndex < uColIndex; ++uGapIndex ) { MatchScore[uGapIndex] = per_site_penalty; } gap_left_col = uInsane; cur_gap_score = 0; } bGapping1 = false; bGapping2 = false; } if (bGapping1 || bGapping2) { scoreGaps -= g_scoreGapOpen.get(); scoreGaps += TermGapScore(true); cur_gap_score -= g_scoreGapOpen.get(); cur_gap_score += TermGapScore(true); if( MatchScore != NULL ) { // spread the total gap penalty evenly across all columns SCORE per_site_penalty = cur_gap_score / (uColCount-gap_left_col); for( unsigned uGapIndex = gap_left_col; uGapIndex < uColCount; ++uGapIndex ) { MatchScore[uGapIndex] = per_site_penalty; } } } return scoreGaps; }
// Return true if any changes made void AnchoredProfileProfile(MSA &msa1, MSA &msa2, MSA &msaOut) { const unsigned uColCountIn = msa1.GetColCount(); const unsigned uSeqCountIn = msa1.GetSeqCount() + msa2.GetSeqCount(); unsigned *AnchorCols = new unsigned[uColCountIn]; unsigned uAnchorColCount; PrepareMSAforScoring(msa1); PrepareMSAforScoring(msa2); FindAnchorColsPP(msa1, msa2, AnchorCols, &uAnchorColCount); const unsigned uRangeCount = uAnchorColCount + 1; Range *Ranges = new Range[uRangeCount]; #if TRACE Log("%u ranges\n", uRangeCount); #endif ColsToRanges(AnchorCols, uAnchorColCount, uColCountIn, Ranges); ListVertSavings(uColCountIn, uAnchorColCount, Ranges, uRangeCount); #if TRACE { Log("Anchor cols: "); for (unsigned i = 0; i < uAnchorColCount; ++i) Log(" %u", AnchorCols[i]); Log("\n"); Log("Ranges:\n"); for (unsigned i = 0; i < uRangeCount; ++i) Log("%4u - %4u\n", Ranges[i].m_uBestColLeft, Ranges[i].m_uBestColRight); } #endif delete[] AnchorCols; msaOut.SetSize(uSeqCountIn, 0); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCountIn; ++uSeqIndex) { const char *ptrName; unsigned uId; if( uSeqIndex < msa1.GetSeqCount() ) { msa1.SetSeqId(uSeqIndex, uSeqIndex); ptrName = msa1.GetSeqName(uSeqIndex); } else { msa2.SetSeqId(uSeqIndex-msa1.GetSeqCount(), uSeqIndex); ptrName = msa2.GetSeqName(uSeqIndex-msa1.GetSeqCount()); } msaOut.SetSeqName(uSeqIndex, ptrName); msaOut.SetSeqId(uSeqIndex, uSeqIndex); } for (unsigned uRangeIndex = 0; uRangeIndex < uRangeCount; ++uRangeIndex) { MSA msaRange1; MSA msaRange2; MSA msaRangeOut; const Range &r = Ranges[uRangeIndex]; const unsigned uFromColIndex = r.m_uBestColLeft; const unsigned uRangeColCount = r.m_uBestColRight - uFromColIndex; if (0 == uRangeColCount) continue; /* else if (1 == uRangeColCount) { MSAFromColRange(msaIn, uFromColIndex, 1, msaRange); MSAAppend(msaOut, msaRange); continue; } */ MSAFromColRange(msa1, uFromColIndex, uRangeColCount, msaRange1); MSAFromColRange(msa2, uFromColIndex, uRangeColCount, msaRange2); 
StripGapColumns(msaRange1); StripGapColumns(msaRange2); #if TRACE Log("\n-------------\n"); Log("Range %u - %u count=%u\n", r.m_uBestColLeft, r.m_uBestColRight, uRangeColCount); Log("Before:\n"); msaRange1.LogMe(); msaRange2.LogMe(); #endif ProfileProfile(msaRange1, msaRange2, msaRangeOut); #if TRACE Log("After:\n"); msaRangeOut.LogMe(); #endif for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCountIn; ++uSeqIndex) msaRangeOut.SetSeqId(uSeqIndex, uSeqIndex); MSAAppend(msaOut, msaRangeOut); #if TRACE Log("msaOut after Cat:\n"); msaOut.LogMe(); #endif } delete[] Ranges; }
// this is a version of the profile x profile score that computes // a per-site score suitable for use with anchoring heuristics SCORE LetterObjScoreXP(const MSA &msa1, const MSA &msa2, SCORE MatchScore[]) { const unsigned uColCount1 = msa1.GetColCount(); const unsigned uColCount2 = msa2.GetColCount(); if (uColCount1 != uColCount2) Quit("ObjScoreXP, alignment lengths differ %u %u", uColCount1, uColCount2); const unsigned uSeqCount1 = msa1.GetSeqCount(); const unsigned uSeqCount2 = msa2.GetSeqCount(); #if TRACE Log(" Score Weight Weight Total\n"); Log("---------- ------ ------ ----------\n"); #endif SCORE* mmScore = NULL; SCORE* ggScore = NULL; if( MatchScore != NULL ) { mmScore = new SCORE[uColCount1]; ggScore = new SCORE[uColCount1]; memset( MatchScore, 0, sizeof(SCORE)*uColCount1 ); } SCORE scoreTotal = 0; unsigned uPairCount = 0; for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount1; ++uSeqIndex1) { const WEIGHT w1 = msa1.GetSeqWeight(uSeqIndex1); for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqCount2; ++uSeqIndex2) { if( mmScore != NULL ) memset( mmScore, 0, sizeof(SCORE)*uColCount1 ); if( ggScore != NULL ) memset( ggScore, 0, sizeof(SCORE)*uColCount1 ); const WEIGHT w2 = msa2.GetSeqWeight(uSeqIndex2); const WEIGHT w = w1*w2; SCORE scoreLetters = ScoreSeqPairLetters(msa1, uSeqIndex1, msa2, uSeqIndex2, mmScore); SCORE scoreGaps = ScoreSeqPairGaps(msa1, uSeqIndex1, msa2, uSeqIndex2, ggScore); SCORE scorePair = scoreLetters + scoreGaps; scoreTotal += w*scorePair; ++uPairCount; if( MatchScore != NULL ) for( unsigned uColIndex = 0; uColIndex < uColCount1; ++uColIndex ) MatchScore[uColIndex] += w*(mmScore[uColIndex]+ggScore[uColIndex]); #if TRACE Log("%10.2f %6.3f %6.3f %10.2f >%s >%s\n", scorePair, w1, w2, scorePair*w1*w2, msa1.GetSeqName(uSeqIndex1), msa2.GetSeqName(uSeqIndex2)); #endif } } if (0 == uPairCount) Quit("0 == uPairCount"); #if TRACE Log("msa1=\n"); msa1.LogMe(); Log("msa2=\n"); msa2.LogMe(); Log("XP=%g\n", scoreTotal); #endif // return scoreTotal / 
uPairCount; if( mmScore != NULL ) delete[] mmScore; if( ggScore != NULL ) delete[] ggScore; return scoreTotal; }