void DistPWScoreDist(const SeqVect &v, DistFunc &DF) { SEQWEIGHT SeqWeightSave = GetSeqWeightMethod(); SetSeqWeightMethod(SEQWEIGHT_Henikoff); const unsigned uSeqCount = v.Length(); DF.SetCount(uSeqCount); const unsigned uPairCount = (uSeqCount*(uSeqCount + 1))/2; unsigned uCount = 0; SetProgressDesc("PW ScoreDist"); for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1) { const Seq &s1 = v.GetSeq(uSeqIndex1); MSA msa1; msa1.FromSeq(s1); for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqIndex1; ++uSeqIndex2) { if (0 == uCount%20) Progress(uCount, uPairCount); ++uCount; const Seq &s2 = v.GetSeq(uSeqIndex2); MSA msa2; msa2.FromSeq(s2); PWPath Path; MSA msaOut; AlignTwoMSAs(msa1, msa2, msaOut, Path, false, false); float d = (float) GetScoreDist(msaOut, 0, 1); DF.SetDist(uSeqIndex1, uSeqIndex2, d); } } ProgressStepsDone(); SetSeqWeightMethod(SeqWeightSave); }
ProgNode *ProgressiveAlignE(const SeqVect &v, const Tree &GuideTree, MSA &a) { assert(GuideTree.IsRooted()); #if TRACE Log("GuideTree:\n"); GuideTree.LogMe(); #endif const unsigned uSeqCount = v.Length(); const unsigned uNodeCount = 2*uSeqCount - 1; const unsigned uIterCount = uSeqCount - 1; WEIGHT *Weights = new WEIGHT[uSeqCount]; CalcClustalWWeights(GuideTree, Weights); ProgNode *ProgNodes = new ProgNode[uNodeCount]; unsigned uJoin = 0; unsigned uTreeNodeIndex = GuideTree.FirstDepthFirstNode(); SetProgressDesc("Align node"); do { if (GuideTree.IsLeaf(uTreeNodeIndex)) { if (uTreeNodeIndex >= uNodeCount) Quit("TreeNodeIndex=%u NodeCount=%u\n", uTreeNodeIndex, uNodeCount); ProgNode &Node = ProgNodes[uTreeNodeIndex]; unsigned uId = GuideTree.GetLeafId(uTreeNodeIndex); if (uId >= uSeqCount) Quit("Seq index out of range"); const Seq &s = *(v[uId]); Node.m_MSA.FromSeq(s); Node.m_MSA.SetSeqId(0, uId); Node.m_uLength = Node.m_MSA.GetColCount(); Node.m_Weight = Weights[uId]; // TODO: Term gaps settable Node.m_Prof = ProfileFromMSA(Node.m_MSA); Node.m_EstringL = 0; Node.m_EstringR = 0; #if TRACE Log("Leaf id=%u\n", uId); Log("MSA=\n"); Node.m_MSA.LogMe(); Log("Profile (from MSA)=\n"); ListProfile(Node.m_Prof, Node.m_uLength, &Node.m_MSA); #endif } else { Progress(uJoin, uSeqCount - 1); ++uJoin; const unsigned uMergeNodeIndex = uTreeNodeIndex; ProgNode &Parent = ProgNodes[uMergeNodeIndex]; const unsigned uLeft = GuideTree.GetLeft(uTreeNodeIndex); const unsigned uRight = GuideTree.GetRight(uTreeNodeIndex); if (g_bVerbose) { Log("Align: ("); LogLeafNames(GuideTree, uLeft); Log(") ("); LogLeafNames(GuideTree, uRight); Log(")\n"); } ProgNode &Node1 = ProgNodes[uLeft]; ProgNode &Node2 = ProgNodes[uRight]; #if TRACE Log("AlignTwoMSAs:\n"); #endif AlignTwoProfs( Node1.m_Prof, Node1.m_uLength, Node1.m_Weight, Node2.m_Prof, Node2.m_uLength, Node2.m_Weight, Parent.m_Path, &Parent.m_Prof, &Parent.m_uLength); #if TRACE_LENGTH_DELTA { unsigned L = Node1.m_uLength; unsigned R = Node2.m_uLength; unsigned P = Parent.m_Path.GetEdgeCount(); unsigned Max = L > R ? L : R; unsigned d = P - Max; Log("LD%u;%u;%u;%u\n", L, R, P, d); } #endif PathToEstrings(Parent.m_Path, &Parent.m_EstringL, &Parent.m_EstringR); Parent.m_Weight = Node1.m_Weight + Node2.m_Weight; #if VALIDATE { #if TRACE Log("AlignTwoMSAs:\n"); #endif PWPath TmpPath; AlignTwoMSAs(Node1.m_MSA, Node2.m_MSA, Parent.m_MSA, TmpPath); ProfPos *P1 = ProfileFromMSA(Node1.m_MSA, true); ProfPos *P2 = ProfileFromMSA(Node2.m_MSA, true); unsigned uLength = Parent.m_MSA.GetColCount(); ProfPos *TmpProf = ProfileFromMSA(Parent.m_MSA, true); #if TRACE Log("Node1 MSA=\n"); Node1.m_MSA.LogMe(); Log("Node1 prof=\n"); ListProfile(Node1.m_Prof, Node1.m_MSA.GetColCount(), &Node1.m_MSA); Log("Node1 prof (from MSA)=\n"); ListProfile(P1, Node1.m_MSA.GetColCount(), &Node1.m_MSA); AssertProfsEq(Node1.m_Prof, Node1.m_uLength, P1, Node1.m_MSA.GetColCount()); Log("Node2 prof=\n"); ListProfile(Node2.m_Prof, Node2.m_MSA.GetColCount(), &Node2.m_MSA); Log("Node2 MSA=\n"); Node2.m_MSA.LogMe(); Log("Node2 prof (from MSA)=\n"); ListProfile(P2, Node2.m_MSA.GetColCount(), &Node2.m_MSA); AssertProfsEq(Node2.m_Prof, Node2.m_uLength, P2, Node2.m_MSA.GetColCount()); TmpPath.AssertEqual(Parent.m_Path); Log("Parent MSA=\n"); Parent.m_MSA.LogMe(); Log("Parent prof=\n"); ListProfile(Parent.m_Prof, Parent.m_uLength, &Parent.m_MSA); Log("Parent prof (from MSA)=\n"); ListProfile(TmpProf, Parent.m_MSA.GetColCount(), &Parent.m_MSA); #endif // TRACE AssertProfsEq(Parent.m_Prof, Parent.m_uLength, TmpProf, Parent.m_MSA.GetColCount()); delete[] P1; delete[] P2; delete[] TmpProf; } #endif // VALIDATE Node1.m_MSA.Clear(); Node2.m_MSA.Clear(); // Don't delete profiles, may need them for tree refinement. //delete[] Node1.m_Prof; //delete[] Node2.m_Prof; //Node1.m_Prof = 0; //Node2.m_Prof = 0; } uTreeNodeIndex = GuideTree.NextDepthFirstNode(uTreeNodeIndex); } while (NULL_NEIGHBOR != uTreeNodeIndex); ProgressStepsDone(); if (g_bBrenner) MakeRootMSABrenner((SeqVect &) v, GuideTree, ProgNodes, a); else MakeRootMSA(v, GuideTree, ProgNodes, a); #if VALIDATE { unsigned uRootNodeIndex = GuideTree.GetRootNodeIndex(); const ProgNode &RootProgNode = ProgNodes[uRootNodeIndex]; AssertMSAEq(a, RootProgNode.m_MSA); } #endif delete[] Weights; return ProgNodes; }
static void ProgressiveAlignSubfams(const Tree &tree, const unsigned Subfams[], unsigned uSubfamCount, const MSA SubfamMSAs[], MSA &msa) { const unsigned uNodeCount = tree.GetNodeCount(); bool *Ready = new bool[uNodeCount]; MSA **MSAs = new MSA *[uNodeCount]; for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { Ready[uNodeIndex] = false; MSAs[uNodeIndex] = 0; } for (unsigned uSubfamIndex = 0; uSubfamIndex < uSubfamCount; ++uSubfamIndex) { unsigned uNodeIndex = Subfams[uSubfamIndex]; Ready[uNodeIndex] = true; MSA *ptrMSA = new MSA; // TODO: Wasteful copy, needs re-design ptrMSA->Copy(SubfamMSAs[uSubfamIndex]); MSAs[uNodeIndex] = ptrMSA; } for (unsigned uNodeIndex = tree.FirstDepthFirstNode(); NULL_NEIGHBOR != uNodeIndex; uNodeIndex = tree.NextDepthFirstNode(uNodeIndex)) { if (tree.IsLeaf(uNodeIndex)) continue; unsigned uRight = tree.GetRight(uNodeIndex); unsigned uLeft = tree.GetLeft(uNodeIndex); if (!Ready[uRight] || !Ready[uLeft]) continue; MSA *ptrLeft = MSAs[uLeft]; MSA *ptrRight = MSAs[uRight]; assert(ptrLeft != 0 && ptrRight != 0); MSA *ptrParent = new MSA; PWPath Path; AlignTwoMSAs(*ptrLeft, *ptrRight, *ptrParent, Path); MSAs[uNodeIndex] = ptrParent; Ready[uNodeIndex] = true; Ready[uLeft] = false; Ready[uRight] = false; delete MSAs[uLeft]; delete MSAs[uRight]; MSAs[uLeft] = 0; MSAs[uRight] = 0; } #if DEBUG { unsigned uReadyCount = 0; for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { if (Ready[uNodeIndex]) { assert(tree.IsRoot(uNodeIndex)); ++uReadyCount; assert(0 != MSAs[uNodeIndex]); } else assert(0 == MSAs[uNodeIndex]); } assert(1 == uReadyCount); } #endif const unsigned uRoot = tree.GetRootNodeIndex(); MSA *ptrRootAlignment = MSAs[uRoot]; msa.Copy(*ptrRootAlignment); delete ptrRootAlignment; delete[] Ready; #if TRACE Log("After refine subfamilies, root alignment=\n"); msa.LogMe(); #endif }
bool TryRealign(MSA &msaIn, const Tree &tree, const unsigned Leaves1[], unsigned uCount1, const unsigned Leaves2[], unsigned uCount2, SCORE *ptrscoreBefore, SCORE *ptrscoreAfter, bool bLockLeft, bool bLockRight) { #if TRACE Log("TryRealign, msaIn=\n"); #endif MuscleContext *ctx = getMuscleContext(); const unsigned uSeqCount = msaIn.GetSeqCount(); unsigned *Ids1 = new unsigned[uSeqCount]; unsigned *Ids2 = new unsigned[uSeqCount]; LeafIndexesToIds(tree, Leaves1, uCount1, Ids1); LeafIndexesToIds(tree, Leaves2, uCount2, Ids2); MSA msa1; MSA msa2; MSASubsetByIds(msaIn, Ids1, uCount1, msa1); MSASubsetByIds(msaIn, Ids2, uCount2, msa2); #if DEBUG ValidateMuscleIds(msa1); ValidateMuscleIds(msa2); #endif // Computing the objective score may be expensive for // large numbers of sequences. As a speed optimization, // we check whether the alignment changes. If it does // not change, there is no need to compute the objective // score. We test for the alignment changing by comparing // the Viterbi paths before and after re-aligning. PWPath pathBefore; pathBefore.FromMSAPair(msa1, msa2); DeleteGappedCols(msa1); DeleteGappedCols(msa2); if (0 == msa1.GetColCount() || 0 == msa2.GetColCount()) { delete[] Ids1; delete[] Ids2; return false; } MSA msaRealigned; PWPath pathAfter; AlignTwoMSAs(msa1, msa2, msaRealigned, pathAfter, bLockLeft, bLockRight); bool bAnyChanges = !pathAfter.Equal(pathBefore); unsigned uDiffCount1; unsigned uDiffCount2; unsigned* Edges1 = ctx->refinehoriz.Edges1; unsigned* Edges2 = ctx->refinehoriz.Edges2; DiffPaths(pathBefore, pathAfter, Edges1, &uDiffCount1, Edges2, &uDiffCount2); #if TRACE Log("TryRealign, msa1=\n"); Log("\nmsa2=\n"); Log("\nRealigned (changes %s)=\n", bAnyChanges ? "TRUE" : "FALSE"); #endif if (!bAnyChanges) { *ptrscoreBefore = 0; *ptrscoreAfter = 0; delete[] Ids1; delete[] Ids2; return false; } SetMSAWeightsMuscle(msaIn); SetMSAWeightsMuscle(msaRealigned); #if DIFFOBJSCORE const SCORE scoreDiff = DiffObjScore(msaIn, pathBefore, Edges1, uDiffCount1, msaRealigned, pathAfter, Edges2, uDiffCount2); bool bAccept = (scoreDiff > 0); *ptrscoreBefore = 0; *ptrscoreAfter = scoreDiff; //const SCORE scoreBefore = ObjScoreIds(msaIn, Ids1, uCount1, Ids2, uCount2); //const SCORE scoreAfter = ObjScoreIds(msaRealigned, Ids1, uCount1, Ids2, uCount2); //Log("Diff = %.3g %.3g\n", scoreDiff, scoreAfter - scoreBefore); #else const SCORE scoreBefore = ObjScoreIds(msaIn, Ids1, uCount1, Ids2, uCount2); const SCORE scoreAfter = ObjScoreIds(msaRealigned, Ids1, uCount1, Ids2, uCount2); bool bAccept = (scoreAfter > scoreBefore); #if TRACE Log("Score %g -> %g Accept %s\n", scoreBefore, scoreAfter, bAccept ? "TRUE" : "FALSE"); #endif *ptrscoreBefore = scoreBefore; *ptrscoreAfter = scoreAfter; #endif if (bAccept) msaIn.Copy(msaRealigned); delete[] Ids1; delete[] Ids2; return bAccept; }
void ProgressiveAlign(const SeqVect &v, const Tree &GuideTree, MSA &a) { assert(GuideTree.IsRooted()); #if TRACE Log("GuideTree:\n"); GuideTree.LogMe(); #endif const unsigned uSeqCount = v.Length(); const unsigned uNodeCount = 2*uSeqCount - 1; ProgNode *ProgNodes = new ProgNode[uNodeCount]; unsigned uJoin = 0; unsigned uTreeNodeIndex = GuideTree.FirstDepthFirstNode(); SetProgressDesc("Align node"); do { if (GuideTree.IsLeaf(uTreeNodeIndex)) { if (uTreeNodeIndex >= uNodeCount) Quit("TreeNodeIndex=%u NodeCount=%u\n", uTreeNodeIndex, uNodeCount); ProgNode &Node = ProgNodes[uTreeNodeIndex]; unsigned uId = GuideTree.GetLeafId(uTreeNodeIndex); if (uId >= uSeqCount) Quit("Seq index out of range"); const Seq &s = *(v[uId]); Node.m_MSA.FromSeq(s); Node.m_MSA.SetSeqId(0, uId); Node.m_uLength = Node.m_MSA.GetColCount(); } else { Progress(uJoin, uSeqCount - 1); ++uJoin; const unsigned uMergeNodeIndex = uTreeNodeIndex; ProgNode &Parent = ProgNodes[uMergeNodeIndex]; const unsigned uLeft = GuideTree.GetLeft(uTreeNodeIndex); const unsigned uRight = GuideTree.GetRight(uTreeNodeIndex); ProgNode &Node1 = ProgNodes[uLeft]; ProgNode &Node2 = ProgNodes[uRight]; PWPath Path; AlignTwoMSAs(Node1.m_MSA, Node2.m_MSA, Parent.m_MSA, Path); Parent.m_uLength = Parent.m_MSA.GetColCount(); Node1.m_MSA.Clear(); Node2.m_MSA.Clear(); } uTreeNodeIndex = GuideTree.NextDepthFirstNode(uTreeNodeIndex); } while (NULL_NEIGHBOR != uTreeNodeIndex); ProgressStepsDone(); unsigned uRootNodeIndex = GuideTree.GetRootNodeIndex(); const ProgNode &RootProgNode = ProgNodes[uRootNodeIndex]; a.Copy(RootProgNode.m_MSA); delete[] ProgNodes; ProgNodes = 0; }
void ProgAlignSubFams() { MSA msaOut; SetOutputFileName(g_pstrOutFileName.get()); SetInputFileName(g_pstrInFileName.get()); SetMaxIters(g_uMaxIters.get()); SetSeqWeightMethod(g_SeqWeight1.get()); TextFile fileIn(g_pstrInFileName.get()); SeqVect v; v.FromFASTAFile(fileIn); const unsigned uSeqCount = v.Length(); if (0 == uSeqCount) Quit("No sequences in input file"); ALPHA Alpha = ALPHA_Undefined; switch (g_SeqType.get()) { case SEQTYPE_Auto: Alpha = v.GuessAlpha(); break; case SEQTYPE_Protein: Alpha = ALPHA_Amino; break; case SEQTYPE_DNA: Alpha = ALPHA_DNA; break; case SEQTYPE_RNA: Alpha = ALPHA_RNA; break; default: Quit("Invalid seq type"); } SetAlpha(Alpha); v.FixAlpha(); PTR_SCOREMATRIX UserMatrix = 0; if (0 != g_pstrMatrixFileName.get()) { const char *FileName = g_pstrMatrixFileName.get(); const char *Path = getenv("MUSCLE_MXPATH"); if (Path != 0) { size_t n = strlen(Path) + 1 + strlen(FileName) + 1; char *NewFileName = new char[n]; sprintf(NewFileName, "%s/%s", Path, FileName); FileName = NewFileName; } TextFile File(FileName); UserMatrix = ReadMx(File); g_Alpha = ALPHA_Amino; g_PPScore = PPSCORE_SP; } SetPPScore(); if (0 != UserMatrix) g_ptrScoreMatrix = UserMatrix; if (ALPHA_DNA == Alpha || ALPHA_RNA == Alpha) { SetPPScore(PPSCORE_SPN); g_Distance1.get() = DISTANCE_Kmer4_6; } unsigned uMaxL = 0; unsigned uTotL = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { unsigned L = v.GetSeq(uSeqIndex).Length(); uTotL += L; if (L > uMaxL) uMaxL = L; } SetIter(1); g_bDiags.get() = g_bDiags1.get(); SetSeqStats(uSeqCount, uMaxL, uTotL/uSeqCount); SetMuscleSeqVect(v); MSA::SetIdCount(uSeqCount); // Initialize sequence ids. // From this point on, ids must somehow propogate from here. for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) v.SetSeqId(uSeqIndex, uSeqIndex); if (uSeqCount > 1) MHackStart(v); if (0 == uSeqCount) { msaOut.Clear(); return; } if (1 == uSeqCount && ALPHA_Amino == Alpha) { const Seq &s = v.GetSeq(0); msaOut.FromSeq(s); return; } Tree GuideTree; TreeFromSeqVect(v, GuideTree, g_Cluster1.get(), g_Distance1.get(), g_Root1.get()); SetMuscleTree(GuideTree); MSA msa; if (g_bLow.get()) { ProgNode *ProgNodes = 0; ProgNodes = ProgressiveAlignE(v, GuideTree, msa); delete[] ProgNodes; } else ProgressiveAlign(v, GuideTree, msa); SetCurrentAlignment(msa); TreeFromMSA(msa, GuideTree, g_Cluster2.get(), g_Distance2.get(), g_Root2.get()); SetMuscleTree(GuideTree); unsigned *SubFams = new unsigned[uSeqCount]; unsigned uSubFamCount; SubFam(GuideTree, g_uMaxSubFamCount.get(), SubFams, &uSubFamCount); SetProgressDesc("Align node"); const unsigned uNodeCount = 2*uSeqCount - 1; ProgNode *ProgNodes = new ProgNode[uNodeCount]; bool *NodeIsSubFam = new bool[uNodeCount]; bool *NodeInSubFam = new bool[uNodeCount]; for (unsigned i = 0; i < uNodeCount; ++i) { NodeIsSubFam[i] = false; NodeInSubFam[i] = false; } for (unsigned i = 0; i < uSubFamCount; ++i) { unsigned uNodeIndex = SubFams[i]; assert(uNodeIndex < uNodeCount); NodeIsSubFam[uNodeIndex] = true; SetInFam(GuideTree, uNodeIndex, NodeInSubFam); } unsigned uJoin = 0; unsigned uTreeNodeIndex = GuideTree.FirstDepthFirstNode(); do { if (NodeIsSubFam[uTreeNodeIndex]) { #if TRACE Log("Node %d: align subfam\n", uTreeNodeIndex); #endif ProgNode &Node = ProgNodes[uTreeNodeIndex]; AlignSubFam(v, GuideTree, uTreeNodeIndex, Node.m_MSA); Node.m_uLength = Node.m_MSA.GetColCount(); } else if (!NodeInSubFam[uTreeNodeIndex]) { #if TRACE Log("Node %d: align two subfams\n", uTreeNodeIndex); #endif Progress(uJoin, uSubFamCount - 1); ++uJoin; const unsigned uMergeNodeIndex = uTreeNodeIndex; ProgNode &Parent = ProgNodes[uMergeNodeIndex]; const unsigned uLeft = GuideTree.GetLeft(uTreeNodeIndex); const unsigned uRight = GuideTree.GetRight(uTreeNodeIndex); ProgNode &Node1 = ProgNodes[uLeft]; ProgNode &Node2 = ProgNodes[uRight]; PWPath Path; AlignTwoMSAs(Node1.m_MSA, Node2.m_MSA, Parent.m_MSA, Path); Parent.m_uLength = Parent.m_MSA.GetColCount(); Node1.m_MSA.Clear(); Node2.m_MSA.Clear(); } else { #if TRACE Log("Node %d: in subfam\n", uTreeNodeIndex); #endif ; } uTreeNodeIndex = GuideTree.NextDepthFirstNode(uTreeNodeIndex); } while (NULL_NEIGHBOR != uTreeNodeIndex); ProgressStepsDone(); unsigned uRootNodeIndex = GuideTree.GetRootNodeIndex(); ProgNode &RootProgNode = ProgNodes[uRootNodeIndex]; TextFile fOut(g_pstrOutFileName.get(), true); MHackEnd(RootProgNode.m_MSA); RootProgNode.m_MSA.ToFile(fOut); delete[] NodeInSubFam; delete[] NodeIsSubFam; delete[] ProgNodes; delete[] SubFams; ProgNodes = 0; NodeInSubFam = 0; NodeIsSubFam = 0; SubFams = 0; }