void ClusterByHeight(const Tree &tree, double dMaxHeight, unsigned Subtrees[], unsigned *ptruSubtreeCount) { if (!tree.IsRooted()) Quit("ClusterByHeight: requires rooted tree"); #if TRACE Log("ClusterByHeight, max height=%g\n", dMaxHeight); #endif unsigned uSubtreeCount = 0; const unsigned uNodeCount = tree.GetNodeCount(); for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { if (tree.IsRoot(uNodeIndex)) continue; unsigned uParent = tree.GetParent(uNodeIndex); double dHeight = tree.GetNodeHeight(uNodeIndex); double dParentHeight = tree.GetNodeHeight(uParent); #if TRACE Log("Node %3u Height %5.2f ParentHeight %5.2f\n", uNodeIndex, dHeight, dParentHeight); #endif if (dParentHeight > dMaxHeight && dHeight <= dMaxHeight) { Subtrees[uSubtreeCount] = uNodeIndex; #if TRACE Log("Subtree[%u]=%u\n", uSubtreeCount, uNodeIndex); #endif ++uSubtreeCount; } } *ptruSubtreeCount = uSubtreeCount; }
// Return false when done bool PhyEnumEdges(const Tree &tree, PhyEnumEdgeState &ES) { unsigned uNode1 = uInsane; if (!ES.m_bInit) { if (tree.GetNodeCount() <= 1) { ES.m_uNodeIndex1 = NULL_NEIGHBOR; ES.m_uNodeIndex2 = NULL_NEIGHBOR; return false; } uNode1 = tree.FirstDepthFirstNode(); ES.m_bInit = true; } else { uNode1 = tree.NextDepthFirstNode(ES.m_uNodeIndex1); if (NULL_NEIGHBOR == uNode1) return false; if (tree.IsRooted() && tree.IsRoot(uNode1)) { uNode1 = tree.NextDepthFirstNode(uNode1); if (NULL_NEIGHBOR == uNode1) return false; } } unsigned uNode2 = tree.GetParent(uNode1); ES.m_uNodeIndex1 = uNode1; ES.m_uNodeIndex2 = uNode2; return true; }
// Divide a tree containing N leaves into k families by // cutting the tree at a horizontal line at some height. // Each internal node defines a height for the cut, // considering all internal nodes enumerates all distinct // cuts. Visit internal nodes in decreasing order of height. // Visiting the node corresponds to moving the horizontal // line down to cut the tree at the height of that node. // We consider the cut to be "infinitestimally below" // the node, so the effect is to remove the current node // from the list of subfamilies and add its two children. // We must visit a parent before its children (so care may // be needed to handle zero edge lengths properly). // We assume that N is small, and write dumb O(N^2) code. // More efficient strategies are possible for large N // by maintaining a list of nodes sorted by height. void ClusterBySubfamCount(const Tree &tree, unsigned uSubfamCount, unsigned Subfams[], unsigned *ptruSubfamCount) { const unsigned uNodeCount = tree.GetNodeCount(); const unsigned uLeafCount = (uNodeCount + 1)/2; // Special case: empty tree if (0 == uNodeCount) { *ptruSubfamCount = 0; return; } // Special case: more subfamilies than leaves if (uSubfamCount >= uLeafCount) { for (unsigned n = 0; n < uLeafCount; ++n) Subfams[n] = n; *ptruSubfamCount = uLeafCount; return; } // Initialize list of subfamilies to be root Subfams[0] = tree.GetRootNodeIndex(); // Iterate for (unsigned i = 1; i < uSubfamCount; ++i) ClusterBySubfamCount_Iteration(tree, Subfams, i); *ptruSubfamCount = uSubfamCount; }
void RefineTreeE(MSA &msa, const SeqVect &v, Tree &tree, ProgNode *ProgNodes) { MuscleContext *ctx = getMuscleContext(); const unsigned uSeqCount = msa.GetSeqCount(); if (tree.GetLeafCount() != uSeqCount) Quit("Refine tree, tree has different number of nodes"); if (uSeqCount < 3) return; #if DEBUG ValidateMuscleIds(msa); ValidateMuscleIds(tree); #endif const unsigned uNodeCount = tree.GetNodeCount(); unsigned *uNewNodeIndexToOldNodeIndex= new unsigned[uNodeCount]; Tree Tree2; TreeFromMSA(msa, Tree2, ctx->params.g_Cluster2, ctx->params.g_Distance2, ctx->params.g_Root2, ctx->params.g_pstrDistMxFileName2); #if DEBUG ValidateMuscleIds(Tree2); #endif DiffTreesE(Tree2, tree, uNewNodeIndexToOldNodeIndex); unsigned uRoot = Tree2.GetRootNodeIndex(); if (NODE_CHANGED == uNewNodeIndexToOldNodeIndex[uRoot]) { MSA msa2; RealignDiffsE(msa, v, Tree2, tree, uNewNodeIndexToOldNodeIndex, msa2, ProgNodes); if (!ctx->isCanceled()) { tree.Copy(Tree2); msa.Copy(msa2); #if DEBUG ValidateMuscleIds(msa2); #endif } } delete[] uNewNodeIndexToOldNodeIndex; if (ctx->isCanceled()) { throw MuscleException("Canceled"); } SetCurrentAlignment(msa); ProgressStepsDone(); }
static void LogLeafNames(const Tree &tree, unsigned uNodeIndex) { const unsigned uNodeCount = tree.GetNodeCount(); unsigned *Leaves = new unsigned[uNodeCount]; unsigned uLeafCount; GetLeaves(tree, uNodeIndex, Leaves, &uLeafCount); for (unsigned i = 0; i < uLeafCount; ++i) { if (i > 0) Log(","); Log("%s", tree.GetLeafName(Leaves[i])); } delete[] Leaves; }
void GetInternalNodesInHeightOrder(const Tree &tree, unsigned NodeIndexes[]) { const unsigned uNodeCount = tree.GetNodeCount(); if (uNodeCount < 3) Quit("GetInternalNodesInHeightOrder: %u nodes, none are internal", uNodeCount); const unsigned uInternalNodeCount = (uNodeCount - 1)/2; double *Heights = new double[uInternalNodeCount]; unsigned uIndex = 0; for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { if (tree.IsLeaf(uNodeIndex)) continue; NodeIndexes[uIndex] = uNodeIndex; Heights[uIndex] = tree.GetNodeHeight(uNodeIndex); ++uIndex; } if (uIndex != uInternalNodeCount) Quit("Internal error: GetInternalNodesInHeightOrder"); // Simple but slow bubble sort (probably don't care about speed here) bool bDone = false; while (!bDone) { bDone = true; for (unsigned i = 0; i < uInternalNodeCount - 1; ++i) { if (Heights[i] > Heights[i+1]) { double dTmp = Heights[i]; Heights[i] = Heights[i+1]; Heights[i+1] = dTmp; unsigned uTmp = NodeIndexes[i]; NodeIndexes[i] = NodeIndexes[i+1]; NodeIndexes[i+1] = uTmp; bDone = false; } } } #if TRACE Log("Internal node index Height\n"); Log("------------------- --------\n"); // 1234567890123456789 123456789 for (unsigned n = 0; n < uInternalNodeCount; ++n) Log("%19u %9.3f\n", NodeIndexes[n], Heights[n]); #endif delete[] Heights; }
// Identify subfamilies in a tree. // Returns array of internal node indexes, one for each subfamily. // First try is to select groups by height (which should approximate // minimum percent identity), if this gives too many subfamilies then // we cut at a point that gives the maximum allowed number of subfams. static void GetSubfams(const Tree &tree, double dMaxHeight, unsigned uMaxSubfamCount, unsigned **ptrptrSubfams, unsigned *ptruSubfamCount) { const unsigned uNodeCount = tree.GetNodeCount(); unsigned *Subfams = new unsigned[uNodeCount]; unsigned uSubfamCount; ClusterByHeight(tree, dMaxHeight, Subfams, &uSubfamCount); if (uSubfamCount > uMaxSubfamCount) ClusterBySubfamCount(tree, uMaxSubfamCount, Subfams, &uSubfamCount); *ptrptrSubfams = Subfams; *ptruSubfamCount = uSubfamCount; }
// Raise every edge length that is below dMinEdgeLength up to that minimum.
// Each node's neighbor list is scanned; edges without a length
// (HasEdgeLength() false) are left untouched.
void ApplyMinEdgeLength(Tree &tree, double dMinEdgeLength)
    {
    const unsigned uTotalNodes = tree.GetNodeCount();
    for (unsigned uFrom = 0; uFrom < uTotalNodes; ++uFrom)
        {
        const unsigned uDegree = tree.GetNeighborCount(uFrom);
        for (unsigned uSlot = 0; uSlot < uDegree; ++uSlot)
            {
            const unsigned uTo = tree.GetNeighbor(uFrom, uSlot);
            const bool bTooShort = tree.HasEdgeLength(uFrom, uTo) &&
              tree.GetEdgeLength(uFrom, uTo) < dMinEdgeLength;
            if (bTooShort)
                tree.SetEdgeLength(uFrom, uTo, dMinEdgeLength);
            }
        }
    }
static void LogSubfams(const Tree &tree, const unsigned Subfams[], unsigned uSubfamCount) { const unsigned uNodeCount = tree.GetNodeCount(); Log("%u subfamilies found\n", uSubfamCount); Log("Subfam Sequence\n"); Log("------ --------\n"); unsigned *Leaves = new unsigned[uNodeCount]; for (unsigned uSubfamIndex = 0; uSubfamIndex < uSubfamCount; ++uSubfamIndex) { unsigned uSubfamNodeIndex = Subfams[uSubfamIndex]; unsigned uLeafCount; GetLeaves(tree, uSubfamNodeIndex, Leaves, &uLeafCount); for (unsigned uLeafIndex = 0; uLeafIndex < uLeafCount; ++uLeafIndex) Log("%6u %s\n", uSubfamIndex + 1, tree.GetLeafName(Leaves[uLeafIndex])); Log("\n"); } delete[] Leaves; }
// Return the next leaf node after uPrevNodeIndex, or NULL_NEIGHBOR when
// there is none.
// When g_bStable is set, leaves are visited in increasing node-index order;
// otherwise they are visited in the tree's depth-first order.
static unsigned GetNextNodeIndex(const Tree &tree, unsigned uPrevNodeIndex)
    {
    if (g_bStable)
        {
        // Linear scan over the node indexes that follow the previous one.
        const unsigned uTotalNodes = tree.GetNodeCount();
        for (unsigned uCandidate = uPrevNodeIndex + 1; uCandidate < uTotalNodes;
          ++uCandidate)
            {
            if (tree.IsLeaf(uCandidate))
                return uCandidate;
            }
        return NULL_NEIGHBOR;
        }

    // Depth-first order: advance until a leaf or the end of the traversal.
    unsigned uCandidate = uPrevNodeIndex;
    do
        uCandidate = tree.NextDepthFirstNode(uCandidate);
    while (NULL_NEIGHBOR != uCandidate && !tree.IsLeaf(uCandidate));
    return uCandidate;
    }
void TestBiPart() { SetListFileName("c:\\tmp\\lobster.log", false); Tree tree; TextFile fileIn("c:\\tmp\\test.phy"); tree.FromFile(fileIn); tree.LogMe(); const unsigned uNodeCount = tree.GetNodeCount(); unsigned *Leaves1 = new unsigned[uNodeCount]; unsigned *Leaves2 = new unsigned[uNodeCount]; PhyEnumEdgeState ES; bool bDone = false; for (;;) { unsigned uCount1 = uInsane; unsigned uCount2 = uInsane; bool bOk = PhyEnumBiParts(tree, ES, Leaves1, &uCount1, Leaves2, &uCount2); Log("PEBP=%d ES.Init=%d ES.ni1=%d ES.ni2=%d\n", bOk, ES.m_bInit, ES.m_uNodeIndex1, ES.m_uNodeIndex2); if (!bOk) break; Log("\n"); Log("Part1: "); for (unsigned n = 0; n < uCount1; ++n) Log(" %d(%s)", Leaves1[n], tree.GetLeafName(Leaves1[n])); Log("\n"); Log("Part2: "); for (unsigned n = 0; n < uCount2; ++n) Log(" %d(%s)", Leaves2[n], tree.GetLeafName(Leaves2[n])); Log("\n"); } }
void CalcClustalWWeights(const Tree &tree, WEIGHT Weights[]) { #if TRACE Log("CalcClustalWWeights\n"); tree.LogMe(); #endif const unsigned uLeafCount = tree.GetLeafCount(); if (0 == uLeafCount) return; else if (1 == uLeafCount) { Weights[0] = (WEIGHT) 1.0; return; } else if (2 == uLeafCount) { Weights[0] = (WEIGHT) 0.5; Weights[1] = (WEIGHT) 0.5; return; } if (!tree.IsRooted()) Quit("CalcClustalWWeights requires rooted tree"); const unsigned uNodeCount = tree.GetNodeCount(); unsigned *LeavesUnderNode = new unsigned[uNodeCount]; memset(LeavesUnderNode, 0, uNodeCount*sizeof(unsigned)); const unsigned uRootNodeIndex = tree.GetRootNodeIndex(); unsigned uLeavesUnderRoot = CountLeaves(tree, uRootNodeIndex, LeavesUnderNode); if (uLeavesUnderRoot != uLeafCount) Quit("WeightsFromTreee: Internal error, root count %u %u", uLeavesUnderRoot, uLeafCount); #if TRACE Log("Node Leaves Length Strength\n"); Log("---- ------ -------- --------\n"); // 1234 123456 12345678 12345678 #endif double *Strengths = new double[uNodeCount]; for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { if (tree.IsRoot(uNodeIndex)) { Strengths[uNodeIndex] = 0.0; continue; } const unsigned uParent = tree.GetParent(uNodeIndex); const double dLength = tree.GetEdgeLength(uNodeIndex, uParent); const unsigned uLeaves = LeavesUnderNode[uNodeIndex]; const double dStrength = dLength / (double) uLeaves; Strengths[uNodeIndex] = dStrength; #if TRACE Log("%4u %6u %8g %8g\n", uNodeIndex, uLeaves, dLength, dStrength); #endif } #if TRACE Log("\n"); Log(" Seq Path..Weight\n"); Log("-------------------- ------------\n"); #endif for (unsigned n = 0; n < uLeafCount; ++n) { const unsigned uLeafNodeIndex = tree.LeafIndexToNodeIndex(n); #if TRACE Log("%20.20s %4u ", tree.GetLeafName(uLeafNodeIndex), uLeafNodeIndex); #endif if (!tree.IsLeaf(uLeafNodeIndex)) Quit("CalcClustalWWeights: leaf"); double dWeight = 0; unsigned uNode = uLeafNodeIndex; while (!tree.IsRoot(uNode)) { dWeight += Strengths[uNode]; uNode = 
tree.GetParent(uNode); #if TRACE Log("->%u(%g)", uNode, Strengths[uNode]); #endif } if (dWeight < 0.0001) { #if TRACE Log("zero->one"); #endif dWeight = 1.0; } Weights[n] = (WEIGHT) dWeight; #if TRACE Log(" = %g\n", dWeight); #endif } delete[] Strengths; delete[] LeavesUnderNode; Normalize(Weights, uLeafCount); }
// Compare two rooted trees over the same leaf ids and build a tree of the
// differences.
// A node of Tree1 is "married" when a node of Tree2 covers the same set of
// leaves (found bottom-up: two married children whose spouses share a
// parent in Tree2 make their parent married); otherwise it is a "bachelor".
// A married node whose parent is a bachelor is flagged as a "diff"; if the
// root itself is married, the root is flagged.
// The flags are passed to BuildDiffs(), which populates Diffs and
// IdToDiffsLeafNodeIndex[] (indexed by leaf id; this function initializes
// each entry to the "not set" value first and Quit()s if any is still unset
// afterwards).
// Calls Quit() if either tree is unrooted, if node counts differ, if a leaf
// id is >= the node count, or if an internal consistency check fails.
void DiffTrees(const Tree &Tree1, const Tree &Tree2, Tree &Diffs,
  unsigned IdToDiffsLeafNodeIndex[])
    {
#if TRACE
    Log("Tree1:\n");
    Tree1.LogMe();
    Log("\n");
    Log("Tree2:\n");
    Tree2.LogMe();
#endif

    if (!Tree1.IsRooted() || !Tree2.IsRooted())
        Quit("DiffTrees: requires rooted trees");

    const unsigned uNodeCount = Tree1.GetNodeCount();
    const unsigned uNodeCount2 = Tree2.GetNodeCount();

    const unsigned uLeafCount = Tree1.GetLeafCount();
    const unsigned uLeafCount2 = Tree2.GetLeafCount();
    // Leaf-count mismatch is only caught in builds where assert is active.
    assert(uLeafCount == uLeafCount2);

    if (uNodeCount != uNodeCount2)
        Quit("DiffTrees: different node counts");

    // Allocate tables so we can convert tree node index to
    // and from the unique id with a O(1) lookup.
    // NodeIndexToId1: Tree1 node index -> id.
    // IdToNodeIndex2: id -> Tree2 node index.
    unsigned *NodeIndexToId1 = new unsigned[uNodeCount];
    unsigned *IdToNodeIndex2 = new unsigned[uNodeCount];

    // Per-node flags for Tree1.
    bool *bIsBachelor1 = new bool[uNodeCount];
    bool *bIsDiff1 = new bool[uNodeCount];

    for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex)
        {
        NodeIndexToId1[uNodeIndex] = uNodeCount;
        bIsBachelor1[uNodeIndex] = false;
        bIsDiff1[uNodeIndex] = false;

        // Use uNodeCount as value meaning "not set".
        IdToNodeIndex2[uNodeIndex] = uNodeCount;
        }

    // Initialize node index <-> id lookup tables
    // from the ids already stored on the leaves of both trees.
    for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex)
        {
        if (Tree1.IsLeaf(uNodeIndex))
            {
            const unsigned uId = Tree1.GetLeafId(uNodeIndex);
            if (uId >= uNodeCount)
                Quit("Diff trees requires existing leaf ids in range 0 .. (N-1)");
            NodeIndexToId1[uNodeIndex] = uId;
            }

        if (Tree2.IsLeaf(uNodeIndex))
            {
            const unsigned uId = Tree2.GetLeafId(uNodeIndex);
            if (uId >= uNodeCount)
                Quit("Diff trees requires existing leaf ids in range 0 .. (N-1)");
            IdToNodeIndex2[uId] = uNodeIndex;
            }
        }

    // Validity check. This verifies that every id in
    // 0 .. (leaf count - 1) was assigned to some leaf of Tree2,
    // i.e. that no slot of IdToNodeIndex2 in that range is
    // still "not set" (note that the id<N check above does
    // not rule out two leaves having duplicate ids).
    for (unsigned uId = 0; uId < uLeafCount; ++uId)
        {
        unsigned uNodeIndex2 = IdToNodeIndex2[uId];
        if (uNodeCount == uNodeIndex2)
            Quit("DiffTrees, check 2");
        }

    // Ids assigned to internal nodes are N, N+1 ...
    // An internal node id uniquely identifies a set
    // of two or more leaves.
    unsigned uInternalNodeId = uLeafCount;

    // Depth-first traversal of tree.
    // The order guarantees that a node is visited before
    // its parent is visited.
    for (unsigned uNodeIndex1 = Tree1.FirstDepthFirstNode();
      NULL_NEIGHBOR != uNodeIndex1;
      uNodeIndex1 = Tree1.NextDepthFirstNode(uNodeIndex1))
        {
#if TRACE
        Log("Main loop: Node1=%u IsLeaf=%d IsBachelor=%d\n",
          uNodeIndex1,
          Tree1.IsLeaf(uNodeIndex1),
          bIsBachelor1[uNodeIndex1]);
#endif

        // Leaves are trivial; nothing to do.
        if (Tree1.IsLeaf(uNodeIndex1) || bIsBachelor1[uNodeIndex1])
            continue;

        // If either child is a bachelor, flag
        // this node as a bachelor and continue.
        unsigned uLeft1 = Tree1.GetLeft(uNodeIndex1);
        if (bIsBachelor1[uLeft1])
            {
            bIsBachelor1[uNodeIndex1] = true;
            continue;
            }

        unsigned uRight1 = Tree1.GetRight(uNodeIndex1);
        if (bIsBachelor1[uRight1])
            {
            bIsBachelor1[uNodeIndex1] = true;
            continue;
            }

        // Both children are married.
        // Married nodes are guaranteed to have an id.
        unsigned uIdLeft = NodeIndexToId1[uLeft1];
        unsigned uIdRight = NodeIndexToId1[uRight1];

        if (uIdLeft == uNodeCount || uIdRight == uNodeCount)
            Quit("DiffTrees, check 5");

        // uLeft2 is the spouse of uLeft1, and similarly for uRight2.
        unsigned uLeft2 = IdToNodeIndex2[uIdLeft];
        unsigned uRight2 = IdToNodeIndex2[uIdRight];

        if (uLeft2 == uNodeCount || uRight2 == uNodeCount)
            Quit("DiffTrees, check 6");

        // If the spouses of uLeft1 and uRight1 have the same
        // parent, then this parent is the spouse of uNodeIndex1.
        // Otherwise, uNodeIndex1 is a diff.
        unsigned uParentLeft2 = Tree2.GetParent(uLeft2);
        unsigned uParentRight2 = Tree2.GetParent(uRight2);

#if TRACE
        Log("L1=%u R1=%u L2=%u R2=%u PL2=%u PR2=%u\n",
          uLeft1,
          uRight1,
          uLeft2,
          uRight2,
          uParentLeft2,
          uParentRight2);
#endif

        if (uParentLeft2 == uParentRight2)
            {
            // Married: hand out the next internal id and record the spouse.
            NodeIndexToId1[uNodeIndex1] = uInternalNodeId;
            IdToNodeIndex2[uInternalNodeId] = uParentLeft2;
            ++uInternalNodeId;
            }
        else
            bIsBachelor1[uNodeIndex1] = true;
        }

    // Flag the diffs: married nodes directly below a bachelor.
    // uDiffCount is only incremented; it is not read afterwards.
    unsigned uDiffCount = 0;
    for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex)
        {
        if (bIsBachelor1[uNodeIndex])
            continue;
        if (Tree1.IsRoot(uNodeIndex))
            {
            // Special case: if no bachelors, consider the
            // root a diff.
            // (The test below is always true here, since bachelors were
            // skipped by the continue above.)
            if (!bIsBachelor1[uNodeIndex])
                bIsDiff1[uNodeIndex] = true;
            continue;
            }
        const unsigned uParent = Tree1.GetParent(uNodeIndex);
        if (bIsBachelor1[uParent])
            {
            bIsDiff1[uNodeIndex] = true;
            ++uDiffCount;
            }
        }

#if TRACE
    Log("Tree1:\n");
    Log("Node Id Bach Diff Name\n");
    Log("---- ---- ---- ---- ----\n");
    for (unsigned n = 0; n < uNodeCount; ++n)
        {
        Log("%4u %4u %d %d",
          n,
          NodeIndexToId1[n],
          bIsBachelor1[n],
          bIsDiff1[n]);
        if (Tree1.IsLeaf(n))
            Log(" %s", Tree1.GetLeafName(n));
        Log("\n");
        }
    Log("\n");
    Log("Tree2:\n");
    Log("Node Id Name\n");
    Log("---- ---- ----\n");
    for (unsigned n = 0; n < uNodeCount; ++n)
        {
        Log("%4u ", n);
        if (Tree2.IsLeaf(n))
            Log(" %s", Tree2.GetLeafName(n));
        Log("\n");
        }
#endif

    // Build the output tree from the diff flags.
    Diffs.CreateRooted();
    const unsigned uDiffsRootIndex = Diffs.GetRootNodeIndex();
    const unsigned uRootIndex1 = Tree1.GetRootNodeIndex();

    // Mark every leaf-id slot "not set" so the check after BuildDiffs
    // can detect ids that were never assigned.
    for (unsigned n = 0; n < uLeafCount; ++n)
        IdToDiffsLeafNodeIndex[n] = uNodeCount;

    BuildDiffs(Tree1, uRootIndex1, bIsDiff1, Diffs, uDiffsRootIndex,
      IdToDiffsLeafNodeIndex);

#if TRACE
    Log("\n");
    Log("Diffs:\n");
    Diffs.LogMe();
    Log("\n");
    Log("IdToDiffsLeafNodeIndex:");
    for (unsigned n = 0; n < uLeafCount; ++n)
        {
        if (n%16 == 0)
            Log("\n");
        else
            Log(" ");
        Log("%u=%u", n, IdToDiffsLeafNodeIndex[n]);
        }
    Log("\n");
#endif

    // Every leaf id must have been given a node in Diffs.
    for (unsigned n = 0; n < uLeafCount; ++n)
        if (IdToDiffsLeafNodeIndex[n] == uNodeCount)
            Quit("TreeDiffs check 7");

    delete[] NodeIndexToId1;
    delete[] IdToNodeIndex2;
    delete[] bIsBachelor1;
    delete[] bIsDiff1;
    }
void DoMuscle() { SetOutputFileName(g_pstrOutFileName.get()); SetInputFileName(g_pstrInFileName.get()); SetMaxIters(g_uMaxIters.get()); SetSeqWeightMethod(g_SeqWeight1.get()); TextFile fileIn(g_pstrInFileName.get()); SeqVect v; v.FromFASTAFile(fileIn); const unsigned uSeqCount = v.Length(); if (0 == uSeqCount) Quit("No sequences in input file"); ALPHA Alpha = ALPHA_Undefined; switch (g_SeqType.get()) { case SEQTYPE_Auto: Alpha = v.GuessAlpha(); break; case SEQTYPE_Protein: Alpha = ALPHA_Amino; break; case SEQTYPE_DNA: Alpha = ALPHA_DNA; break; case SEQTYPE_RNA: Alpha = ALPHA_RNA; break; default: Quit("Invalid seq type"); } SetAlpha(Alpha); v.FixAlpha(); // // AED 21/12/06: Moved matrix loading code inside the PP param function so it gets called for all alignment types // SetPPScore(); unsigned uMaxL = 0; unsigned uTotL = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { unsigned L = v.GetSeq(uSeqIndex).Length(); uTotL += L; if (L > uMaxL) uMaxL = L; } SetIter(1); g_bDiags.get() = g_bDiags1.get(); SetSeqStats(uSeqCount, uMaxL, uTotL/uSeqCount); SetMuscleSeqVect(v); MSA::SetIdCount(uSeqCount); // Initialize sequence ids. // From this point on, ids must somehow propogate from here. for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) v.SetSeqId(uSeqIndex, uSeqIndex); if (0 == uSeqCount) Quit("Input file '%s' has no sequences", g_pstrInFileName.get()); if (1 == uSeqCount) { TextFile fileOut(g_pstrOutFileName.get(), true); v.ToFile(fileOut); return; } if (uSeqCount > 1) MHackStart(v); // First iteration Tree GuideTree; if (0 != g_pstrUseTreeFileName.get()) { // Discourage users... 
if (!g_bUseTreeNoWarn.get()) fprintf(stderr, g_strUseTreeWarning); // Read tree from file TextFile TreeFile(g_pstrUseTreeFileName.get()); GuideTree.FromFile(TreeFile); // Make sure tree is rooted if (!GuideTree.IsRooted()) Quit("User tree must be rooted"); if (GuideTree.GetLeafCount() != uSeqCount) Quit("User tree does not match input sequences"); const unsigned uNodeCount = GuideTree.GetNodeCount(); for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { if (!GuideTree.IsLeaf(uNodeIndex)) continue; const char *LeafName = GuideTree.GetLeafName(uNodeIndex); unsigned uSeqIndex; bool SeqFound = v.FindName(LeafName, &uSeqIndex); if (!SeqFound) Quit("Label %s in tree does not match sequences", LeafName); unsigned uId = v.GetSeqIdFromName(LeafName); GuideTree.SetLeafId(uNodeIndex, uId); } } else TreeFromSeqVect(v, GuideTree, g_Cluster1.get(), g_Distance1.get(), g_Root1.get(), g_pstrDistMxFileName1.get()); const char *Tree1 = ValueOpt("Tree1"); if (0 != Tree1) { TextFile f(Tree1, true); GuideTree.ToFile(f); if (g_bClusterOnly.get()) return; } SetMuscleTree(GuideTree); ValidateMuscleIds(GuideTree); MSA msa; ProgNode *ProgNodes = 0; if (g_bLow.get()) ProgNodes = ProgressiveAlignE(v, GuideTree, msa); else ProgressiveAlign(v, GuideTree, msa); SetCurrentAlignment(msa); if (0 != g_pstrComputeWeightsFileName.get()) { extern void OutWeights(const char *FileName, const MSA &msa); SetMSAWeightsMuscle(msa); OutWeights(g_pstrComputeWeightsFileName.get(), msa); return; } ValidateMuscleIds(msa); if (1 == g_uMaxIters.get() || 2 == uSeqCount) { //TextFile fileOut(g_pstrOutFileName.get(), true); //MHackEnd(msa); //msa.ToFile(fileOut); MuscleOutput(msa); return; } if (0 == g_pstrUseTreeFileName.get()) { g_bDiags.get() = g_bDiags2.get(); SetIter(2); if (g_bLow.get()) { if (0 != g_uMaxTreeRefineIters.get()) RefineTreeE(msa, v, GuideTree, ProgNodes); } else RefineTree(msa, GuideTree); const char *Tree2 = ValueOpt("Tree2"); if (0 != Tree2) { TextFile f(Tree2, true); 
GuideTree.ToFile(f); } } SetSeqWeightMethod(g_SeqWeight2.get()); SetMuscleTree(GuideTree); if (g_bAnchors.get()) RefineVert(msa, GuideTree, g_uMaxIters.get() - 2); else RefineHoriz(msa, GuideTree, g_uMaxIters.get() - 2, false, false); #if 0 // Refining by subfamilies is disabled as it didn't give better // results. I tried doing this before and after RefineHoriz. // Should get back to this as it seems like this should work. RefineSubfams(msa, GuideTree, g_uMaxIters.get() - 2); #endif ValidateMuscleIds(msa); ValidateMuscleIds(GuideTree); //TextFile fileOut(g_pstrOutFileName.get(), true); //MHackEnd(msa); //msa.ToFile(fileOut); MuscleOutput(msa); }
void DiffTreesE(const Tree &NewTree, const Tree &OldTree, unsigned NewNodeIndexToOldNodeIndex[]) { #if TRACE Log("DiffTreesE NewTree:\n"); NewTree.LogMe(); Log("\n"); Log("OldTree:\n"); OldTree.LogMe(); #endif if (!NewTree.IsRooted() || !OldTree.IsRooted()) Quit("DiffTrees: requires rooted trees"); const unsigned uNodeCount = NewTree.GetNodeCount(); const unsigned uOldNodeCount = OldTree.GetNodeCount(); const unsigned uLeafCount = NewTree.GetLeafCount(); const unsigned uOldLeafCount = OldTree.GetLeafCount(); if (uNodeCount != uOldNodeCount || uLeafCount != uOldLeafCount) Quit("DiffTreesE: different node counts"); { unsigned *IdToOldNodeIndex = new unsigned[uNodeCount]; for (unsigned uOldNodeIndex = 0; uOldNodeIndex < uNodeCount; ++uOldNodeIndex) { if (OldTree.IsLeaf(uOldNodeIndex)) { unsigned Id = OldTree.GetLeafId(uOldNodeIndex); IdToOldNodeIndex[Id] = uOldNodeIndex; } } // Initialize NewNodeIndexToOldNodeIndex[] // All internal nodes are marked as changed, but may be updated later. for (unsigned uNewNodeIndex = 0; uNewNodeIndex < uNodeCount; ++uNewNodeIndex) { if (NewTree.IsLeaf(uNewNodeIndex)) { unsigned uId = NewTree.GetLeafId(uNewNodeIndex); assert(uId < uLeafCount); unsigned uOldNodeIndex = IdToOldNodeIndex[uId]; assert(uOldNodeIndex < uNodeCount); NewNodeIndexToOldNodeIndex[uNewNodeIndex] = uOldNodeIndex; } else NewNodeIndexToOldNodeIndex[uNewNodeIndex] = NODE_CHANGED; } delete[] IdToOldNodeIndex; } // Depth-first traversal of tree. // The order guarantees that a node is visited before // its parent is visited. for (unsigned uNewNodeIndex = NewTree.FirstDepthFirstNode(); NULL_NEIGHBOR != uNewNodeIndex; uNewNodeIndex = NewTree.NextDepthFirstNode(uNewNodeIndex)) { if (NewTree.IsLeaf(uNewNodeIndex)) continue; // If either child is changed, flag this node as changed and continue. 
unsigned uNewLeft = NewTree.GetLeft(uNewNodeIndex); unsigned uOldLeft = NewNodeIndexToOldNodeIndex[uNewLeft]; if (NODE_CHANGED == uOldLeft) { NewNodeIndexToOldNodeIndex[uNewLeft] = NODE_CHANGED; continue; } unsigned uNewRight = NewTree.GetRight(uNewNodeIndex); unsigned uOldRight = NewNodeIndexToOldNodeIndex[uNewRight]; if (NODE_CHANGED == NewNodeIndexToOldNodeIndex[uNewRight]) { NewNodeIndexToOldNodeIndex[uNewRight] = NODE_CHANGED; continue; } unsigned uOldParentLeft = OldTree.GetParent(uOldLeft); unsigned uOldParentRight = OldTree.GetParent(uOldRight); if (uOldParentLeft == uOldParentRight) NewNodeIndexToOldNodeIndex[uNewNodeIndex] = uOldParentLeft; else NewNodeIndexToOldNodeIndex[uNewNodeIndex] = NODE_CHANGED; } #if TRACE { Log("NewToOld "); for (unsigned uNewNodeIndex = 0; uNewNodeIndex < uNodeCount; ++uNewNodeIndex) { Log(" [%3u]=", uNewNodeIndex); if (NODE_CHANGED == NewNodeIndexToOldNodeIndex[uNewNodeIndex]) Log(" X"); else Log("%3u", NewNodeIndexToOldNodeIndex[uNewNodeIndex]); if ((uNewNodeIndex+1)%8 == 0) Log("\n "); } Log("\n"); } #endif #if DEBUG { for (unsigned uNewNodeIndex = 0; uNewNodeIndex < uNodeCount; ++uNewNodeIndex) { unsigned uOld = NewNodeIndexToOldNodeIndex[uNewNodeIndex]; if (NewTree.IsLeaf(uNewNodeIndex)) { if (uOld >= uNodeCount) { Log("NewNode=%u uOld=%u > uNodeCount=%u\n", uNewNodeIndex, uOld, uNodeCount); Quit("Diff check failed"); } unsigned uIdNew = NewTree.GetLeafId(uNewNodeIndex); unsigned uIdOld = OldTree.GetLeafId(uOld); if (uIdNew != uIdOld) { Log("NewNode=%u uOld=%u IdNew=%u IdOld=%u\n", uNewNodeIndex, uOld, uIdNew, uIdOld); Quit("Diff check failed"); } continue; } if (NODE_CHANGED == uOld) continue; unsigned uNewLeft = NewTree.GetLeft(uNewNodeIndex); unsigned uNewRight = NewTree.GetRight(uNewNodeIndex); unsigned uOldLeft = OldTree.GetLeft(uOld); unsigned uOldRight = OldTree.GetRight(uOld); unsigned uNewLeftPartner = NewNodeIndexToOldNodeIndex[uNewLeft]; unsigned uNewRightPartner = NewNodeIndexToOldNodeIndex[uNewRight]; bool 
bSameNotRotated = (uNewLeftPartner == uOldLeft && uNewRightPartner == uOldRight); bool bSameRotated = (uNewLeftPartner == uOldRight && uNewRightPartner == uOldLeft); if (!bSameNotRotated && !bSameRotated) { Log("NewNode=%u NewL=%u NewR=%u\n", uNewNodeIndex, uNewLeft, uNewRight); Log("OldNode=%u OldL=%u OldR=%u\n", uOld, uOldLeft, uOldRight); Log("NewLPartner=%u NewRPartner=%u\n", uNewLeftPartner, uNewRightPartner); Quit("Diff check failed"); } } } #endif }
// Top-level driver for one alignment run, configured from the global
// options (g_...). CVLocation is handed to the alignment object through
// MSA::SetCompositeVector().
// Steps, in order:
//   1. Read the input FASTA file and pick the alphabet.
//   2. If a matrix file name was given, read it (looked up under the
//      directory named by the MUSCLE_MXPATH environment variable when that
//      is set) and install it as the score matrix after SetPPScore().
//   3. Assign sequence ids 0..N-1; a single sequence is written straight
//      to the output file.
//   4. Obtain a guide tree: read from the user tree file if one was given
//      (must be rooted, leaf labels must match sequence names), otherwise
//      build it from the sequences. Optionally write it (option "Tree1")
//      and stop if cluster-only mode is on.
//   5. Progressive alignment; optionally write weights and stop.
//   6. Unless max iterations is 1 or there are only 2 sequences: refine
//      the tree (only when no user tree was given), then refine the
//      alignment (RefineVert or RefineHoriz), and write the result with
//      MuscleOutput().
// Calls Quit() on empty input, invalid sequence type, or a user tree that
// is unrooted or does not match the input sequences.
void DoMuscle(CompositeVect*CVLocation)
    {
    SetOutputFileName(g_pstrOutFileName);
    SetInputFileName(g_pstrInFileName);

    SetMaxIters(g_uMaxIters);
    SetSeqWeightMethod(g_SeqWeight1);

    TextFile fileIn(g_pstrInFileName);
    SeqVect v;
    v.FromFASTAFile(fileIn);
    const unsigned uSeqCount = v.Length();

    if (0 == uSeqCount)
        Quit("No sequences in input file");

    ALPHA Alpha = ALPHA_Undefined;
    switch (g_SeqType)
        {
    case SEQTYPE_Auto:
        Alpha = v.GuessAlpha();
        break;

    case SEQTYPE_Protein:
        Alpha = ALPHA_Amino;
        break;

    case SEQTYPE_DNA:
        Alpha = ALPHA_DNA;
        break;

    case SEQTYPE_RNA:
        Alpha = ALPHA_RNA;
        break;

    default:
        Quit("Invalid seq type");
        }
    SetAlpha(Alpha);
    v.FixAlpha();

    PTR_SCOREMATRIX UserMatrix = 0;
    if (0 != g_pstrMatrixFileName)
        {
        const char *FileName = g_pstrMatrixFileName;
        // Owned buffer for "<MUSCLE_MXPATH>/<file>"; stays null when the
        // environment variable is not set.
        char *NewFileName = 0;
        const char *Path = getenv("MUSCLE_MXPATH");
        if (Path != 0)
            {
            // path + '/' + name + terminating NUL
            size_t n = strlen(Path) + 1 + strlen(FileName) + 1;
            NewFileName = new char[n];
            sprintf(NewFileName, "%s/%s", Path, FileName);
            FileName = NewFileName;
            }
            {
            // Inner scope so the file object is gone before the name
            // buffer is released.
            TextFile File(FileName);
            UserMatrix = ReadMx(File);
            }
        // This buffer was previously leaked.
        // NOTE(review): assumes ReadMx does not keep the file-name pointer
        // beyond the call - confirm.
        delete[] NewFileName;
        g_Alpha = ALPHA_Amino;
        g_PPScore = PPSCORE_SP;
        }

    SetPPScore();

    // A user matrix overrides whatever SetPPScore() selected.
    if (0 != UserMatrix)
        g_ptrScoreMatrix = UserMatrix;

    // Longest and total sequence length, for the progress statistics.
    unsigned uMaxL = 0;
    unsigned uTotL = 0;
    for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
        {
        unsigned L = v.GetSeq(uSeqIndex).Length();
        uTotL += L;
        if (L > uMaxL)
            uMaxL = L;
        }

    SetIter(1);
    g_bDiags = g_bDiags1;
    SetSeqStats(uSeqCount, uMaxL, uTotL/uSeqCount);

    SetMuscleSeqVect(v);

    MSA::SetIdCount(uSeqCount);

    // Initialize sequence ids.
    // From this point on, ids must somehow propogate from here.
    for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
        v.SetSeqId(uSeqIndex, uSeqIndex);

    if (0 == uSeqCount)
        Quit("Input file '%s' has no sequences", g_pstrInFileName);
    if (1 == uSeqCount)
        {
        // Nothing to align: copy the single sequence to the output.
        TextFile fileOut(g_pstrOutFileName, true);
        v.ToFile(fileOut);
        return;
        }

    if (uSeqCount > 1)
        MHackStart(v);

    // First iteration
    Tree GuideTree;
    if (0 != g_pstrUseTreeFileName)
        {
        // Discourage users...
        if (!g_bUseTreeNoWarn)
            fprintf(stderr, "%s", g_strUseTreeWarning);

        // Read tree from file
        TextFile TreeFile(g_pstrUseTreeFileName);
        GuideTree.FromFile(TreeFile);

        // Make sure tree is rooted
        if (!GuideTree.IsRooted())
            Quit("User tree must be rooted");

        if (GuideTree.GetLeafCount() != uSeqCount)
            Quit("User tree does not match input sequences");

        // Attach the sequence id to every leaf, matching by name.
        const unsigned uNodeCount = GuideTree.GetNodeCount();
        for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex)
            {
            if (!GuideTree.IsLeaf(uNodeIndex))
                continue;
            const char *LeafName = GuideTree.GetLeafName(uNodeIndex);
            unsigned uSeqIndex;
            bool SeqFound = v.FindName(LeafName, &uSeqIndex);
            if (!SeqFound)
                Quit("Label %s in tree does not match sequences", LeafName);
            unsigned uId = v.GetSeqIdFromName(LeafName);
            GuideTree.SetLeafId(uNodeIndex, uId);
            }
        }
    else
        TreeFromSeqVect(v, GuideTree, g_Cluster1, g_Distance1, g_Root1,
          g_pstrDistMxFileName1);

    const char *Tree1 = ValueOpt("Tree1");
    if (0 != Tree1)
        {
        TextFile f(Tree1, true);
        GuideTree.ToFile(f);
        if (g_bClusterOnly)
            return;
        }

    SetMuscleTree(GuideTree);
    ValidateMuscleIds(GuideTree);

    MSA msa;
    msa.SetCompositeVector(CVLocation);
    ProgNode *ProgNodes = 0;
    if (g_bLow)
        ProgNodes = ProgressiveAlignE(v, GuideTree, msa);
    else
        ProgressiveAlign(v, GuideTree, msa);
    SetCurrentAlignment(msa);

    if (0 != g_pstrComputeWeightsFileName)
        {
        extern void OutWeights(const char *FileName, const MSA &msa);
        SetMSAWeightsMuscle(msa);
        OutWeights(g_pstrComputeWeightsFileName, msa);
        return;
        }

    ValidateMuscleIds(msa);

    // No refinement requested, or nothing to refine with two sequences.
    if (1 == g_uMaxIters || 2 == uSeqCount)
        {
        //TextFile fileOut(g_pstrOutFileName, true);
        //MHackEnd(msa);
        //msa.ToFile(fileOut);
        MuscleOutput(msa);
        return;
        }

    // Tree refinement is skipped when the user supplied the tree.
    if (0 == g_pstrUseTreeFileName)
        {
        g_bDiags = g_bDiags2;
        SetIter(2);

        if (g_bLow)
            {
            if (0 != g_uMaxTreeRefineIters)
                RefineTreeE(msa, v, GuideTree, ProgNodes);
            }
        else
            RefineTree(msa, GuideTree);

        const char *Tree2 = ValueOpt("Tree2");
        if (0 != Tree2)
            {
            TextFile f(Tree2, true);
            GuideTree.ToFile(f);
            }
        }

    SetSeqWeightMethod(g_SeqWeight2);
    SetMuscleTree(GuideTree);

    // Two iterations are passed as already used (hence the "- 2").
    if (g_bAnchors)
        RefineVert(msa, GuideTree, g_uMaxIters - 2);
    else
        RefineHoriz(msa, GuideTree, g_uMaxIters - 2, false, false);

#if 0
    // Refining by subfamilies is disabled as it didn't give better
    // results. I tried doing this before and after RefineHoriz.
    // Should get back to this as it seems like this should work.
    RefineSubfams(msa, GuideTree, g_uMaxIters - 2);
#endif

    ValidateMuscleIds(msa);
    ValidateMuscleIds(GuideTree);

    //TextFile fileOut(g_pstrOutFileName, true);
    //MHackEnd(msa);
    //msa.ToFile(fileOut);
    MuscleOutput(msa);
    }
static void ProgressiveAlignSubfams(const Tree &tree, const unsigned Subfams[], unsigned uSubfamCount, const MSA SubfamMSAs[], MSA &msa) { const unsigned uNodeCount = tree.GetNodeCount(); bool *Ready = new bool[uNodeCount]; MSA **MSAs = new MSA *[uNodeCount]; for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { Ready[uNodeIndex] = false; MSAs[uNodeIndex] = 0; } for (unsigned uSubfamIndex = 0; uSubfamIndex < uSubfamCount; ++uSubfamIndex) { unsigned uNodeIndex = Subfams[uSubfamIndex]; Ready[uNodeIndex] = true; MSA *ptrMSA = new MSA; // TODO: Wasteful copy, needs re-design ptrMSA->Copy(SubfamMSAs[uSubfamIndex]); MSAs[uNodeIndex] = ptrMSA; } for (unsigned uNodeIndex = tree.FirstDepthFirstNode(); NULL_NEIGHBOR != uNodeIndex; uNodeIndex = tree.NextDepthFirstNode(uNodeIndex)) { if (tree.IsLeaf(uNodeIndex)) continue; unsigned uRight = tree.GetRight(uNodeIndex); unsigned uLeft = tree.GetLeft(uNodeIndex); if (!Ready[uRight] || !Ready[uLeft]) continue; MSA *ptrLeft = MSAs[uLeft]; MSA *ptrRight = MSAs[uRight]; assert(ptrLeft != 0 && ptrRight != 0); MSA *ptrParent = new MSA; PWPath Path; AlignTwoMSAs(*ptrLeft, *ptrRight, *ptrParent, Path); MSAs[uNodeIndex] = ptrParent; Ready[uNodeIndex] = true; Ready[uLeft] = false; Ready[uRight] = false; delete MSAs[uLeft]; delete MSAs[uRight]; MSAs[uLeft] = 0; MSAs[uRight] = 0; } #if DEBUG { unsigned uReadyCount = 0; for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { if (Ready[uNodeIndex]) { assert(tree.IsRoot(uNodeIndex)); ++uReadyCount; assert(0 != MSAs[uNodeIndex]); } else assert(0 == MSAs[uNodeIndex]); } assert(1 == uReadyCount); } #endif const unsigned uRoot = tree.GetRootNodeIndex(); MSA *ptrRootAlignment = MSAs[uRoot]; msa.Copy(*ptrRootAlignment); delete ptrRootAlignment; delete[] Ready; #if TRACE Log("After refine subfamilies, root alignment=\n"); msa.LogMe(); #endif }
bool RefineSubfams(MSA &msa, const Tree &tree, unsigned uIters) { const unsigned uSeqCount = msa.GetSeqCount(); if (uSeqCount < 3) return false; const double dMaxHeight = 0.6; const unsigned uMaxSubfamCount = 16; const unsigned uNodeCount = tree.GetNodeCount(); unsigned *Subfams; unsigned uSubfamCount; GetSubfams(tree, dMaxHeight, uMaxSubfamCount, &Subfams, &uSubfamCount); assert(uSubfamCount <= uSeqCount); if (g_bVerbose.get()) LogSubfams(tree, Subfams, uSubfamCount); MSA *SubfamMSAs = new MSA[uSubfamCount]; unsigned *Leaves = new unsigned[uSeqCount]; unsigned *Ids = new unsigned[uSeqCount]; bool bAnyChanges = false; for (unsigned uSubfamIndex = 0; uSubfamIndex < uSubfamCount; ++uSubfamIndex) { unsigned uSubfam = Subfams[uSubfamIndex]; unsigned uLeafCount; GetLeaves(tree, uSubfam, Leaves, &uLeafCount); assert(uLeafCount <= uSeqCount); LeafIndexesToIds(tree, Leaves, uLeafCount, Ids); MSA &msaSubfam = SubfamMSAs[uSubfamIndex]; MSASubsetByIds(msa, Ids, uLeafCount, msaSubfam); DeleteGappedCols(msaSubfam); #if TRACE Log("Subfam %u MSA=\n", uSubfamIndex); msaSubfam.LogMe(); #endif if (msaSubfam.GetSeqCount() <= 2) continue; // TODO ///////////////////////////////////////// // Try using existing tree, may actually hurt to // re-estimate, may also be a waste of CPU & mem. ///////////////////////////////////////////////// Tree SubfamTree; TreeFromMSA(msaSubfam, SubfamTree, g_Cluster2.get(), g_Distance2.get(), g_Root2.get()); bool bAnyChangesThisSubfam; if (g_bAnchors.get()) bAnyChangesThisSubfam = RefineVert(msaSubfam, SubfamTree, uIters); else bAnyChangesThisSubfam = RefineHoriz(msaSubfam, SubfamTree, uIters, false, false); #if TRACE Log("Subfam %u Changed %d\n", uSubfamIndex, bAnyChangesThisSubfam); #endif if (bAnyChangesThisSubfam) bAnyChanges = true; } if (bAnyChanges) ProgressiveAlignSubfams(tree, Subfams, uSubfamCount, SubfamMSAs, msa); delete[] Leaves; delete[] Subfams; delete[] SubfamMSAs; return bAnyChanges; }
void MakeRootMSA(const SeqVect &v, const Tree &GuideTree, ProgNode Nodes[], MSA &a) { #if TRACE Log("MakeRootMSA Tree="); GuideTree.LogMe(); #endif const unsigned uSeqCount = v.GetSeqCount(); unsigned uColCount = uInsane; unsigned uSeqIndex = 0; const unsigned uTreeNodeCount = GuideTree.GetNodeCount(); const unsigned uRootNodeIndex = GuideTree.GetRootNodeIndex(); const PWPath &RootPath = Nodes[uRootNodeIndex].m_Path; const unsigned uRootColCount = RootPath.GetEdgeCount(); const unsigned uEstringSize = uRootColCount + 1; short *Estring1 = new short[uEstringSize]; short *Estring2 = new short[uEstringSize]; SetProgressDesc("Root alignment"); unsigned uTreeNodeIndex = GetFirstNodeIndex(GuideTree); do { Progress(uSeqIndex, uSeqCount); unsigned uId = GuideTree.GetLeafId(uTreeNodeIndex); const Seq &s = *(v[uId]); Seq sRootE; short *es = MakeRootSeqE(s, GuideTree, uTreeNodeIndex, Nodes, sRootE, Estring1, Estring2); Nodes[uTreeNodeIndex].m_EstringL = EstringNewCopy(es); #if VALIDATE Seq sRoot; MakeRootSeq(s, GuideTree, uTreeNodeIndex, Nodes, sRoot); if (!sRoot.Eq(sRootE)) { Log("sRoot="); sRoot.LogMe(); Log("sRootE="); sRootE.LogMe(); Quit("Root seqs differ"); } #if TRACE Log("MakeRootSeq=\n"); sRoot.LogMe(); #endif #endif if (uInsane == uColCount) { uColCount = sRootE.Length(); a.SetSize(uSeqCount, uColCount); } else { assert(uColCount == sRootE.Length()); } a.SetSeqName(uSeqIndex, s.GetName()); a.SetSeqId(uSeqIndex, uId); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) a.SetChar(uSeqIndex, uColIndex, sRootE[uColIndex]); ++uSeqIndex; uTreeNodeIndex = GetNextNodeIndex(GuideTree, uTreeNodeIndex); } while (NULL_NEIGHBOR != uTreeNodeIndex); delete[] Estring1; delete[] Estring2; ProgressStepsDone(); assert(uSeqIndex == uSeqCount); }
void FindRoot(const Tree &tree, unsigned *ptruNode1, unsigned *ptruNode2, double *ptrdLength1, double *ptrdLength2, ROOT RootMethod) { #if TRACE tree.LogMe(); #endif if (tree.IsRooted()) Quit("FindRoot: tree already rooted"); const unsigned uNodeCount = tree.GetNodeCount(); const unsigned uLeafCount = tree.GetLeafCount(); if (uNodeCount < 2) Quit("Root: don't support trees with < 2 edges"); EdgeInfo **EIs = new EdgeInfo *[uNodeCount]; for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) EIs[uNodeIndex] = new EdgeInfo[3]; EdgeList Edges; for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) if (tree.IsLeaf(uNodeIndex)) { unsigned uParent = tree.GetNeighbor1(uNodeIndex); Edges.Add(uParent, uNodeIndex); } #if TRACE Log("Edges: "); Edges.LogMe(); #endif // Main loop: iterate until all distances known double dAllMaxDist = -1e20; unsigned uMaxFrom = NULL_NEIGHBOR; unsigned uMaxTo = NULL_NEIGHBOR; for (;;) { EdgeList NextEdges; #if TRACE Log("\nTop of main loop\n"); Log("Edges: "); Edges.LogMe(); Log("MDs:\n"); ListEIs(EIs, uNodeCount); #endif // For all edges const unsigned uEdgeCount = Edges.GetCount(); if (0 == uEdgeCount) break; for (unsigned n = 0; n < uEdgeCount; ++n) { unsigned uNodeFrom; unsigned uNodeTo; Edges.GetEdge(n, &uNodeFrom, &uNodeTo); CalcInfo(tree, uNodeFrom, uNodeTo, EIs); #if TRACE Log("Edge %u -> %u\n", uNodeFrom, uNodeTo); #endif const unsigned uNeighborCount = tree.GetNeighborCount(uNodeFrom); for (unsigned i = 0; i < uNeighborCount; ++i) { const unsigned uNeighborIndex = tree.GetNeighbor(uNodeFrom, i); if (!Known(tree, EIs, uNeighborIndex, uNodeFrom) && AllKnownOut(tree, EIs, uNeighborIndex, uNodeFrom)) NextEdges.Add(uNeighborIndex, uNodeFrom); } } Edges.Copy(NextEdges); } #if TRACE ListEIs(EIs, uNodeCount); #endif switch (RootMethod) { case ROOT_MidLongestSpan: RootByMidLongestSpan(tree, EIs, ptruNode1, ptruNode2, ptrdLength1, ptrdLength2); break; case ROOT_MinAvgLeafDist: RootByMinAvgLeafDist(tree, EIs, ptruNode1, 
ptruNode2, ptrdLength1, ptrdLength2); break; default: Quit("Invalid RootMethod=%d", RootMethod); } for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) delete[] EIs[uNodeIndex]; delete[] EIs; }
static void RootByMidLongestSpan(const Tree &tree, EdgeInfo **EIs, unsigned *ptruNode1, unsigned *ptruNode2, double *ptrdLength1, double *ptrdLength2) { const unsigned uNodeCount = tree.GetNodeCount(); unsigned uLeaf1 = NULL_NEIGHBOR; unsigned uMostDistantLeaf = NULL_NEIGHBOR; double dMaxDist = -VERY_LARGE_DOUBLE; for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { if (!tree.IsLeaf(uNodeIndex)) continue; const unsigned uNode2 = tree.GetNeighbor1(uNodeIndex); if (NULL_NEIGHBOR == uNode2) Quit("RootByMidLongestSpan: internal error 0"); const double dEdgeLength = tree.GetEdgeLength(uNodeIndex, uNode2); const EdgeInfo &EI = EIs[uNodeIndex][0]; if (!EI.m_bSet) Quit("RootByMidLongestSpan: internal error 1"); if (EI.m_uNode1 != uNodeIndex || EI.m_uNode2 != uNode2) Quit("RootByMidLongestSpan: internal error 2"); const double dSpanLength = dEdgeLength + EI.m_dMaxDistToLeaf; if (dSpanLength > dMaxDist) { dMaxDist = dSpanLength; uLeaf1 = uNodeIndex; uMostDistantLeaf = EI.m_uMostDistantLeaf; } } if (NULL_NEIGHBOR == uLeaf1) Quit("RootByMidLongestSpan: internal error 3"); const double dTreeHeight = dMaxDist/2.0; unsigned uNode1 = uLeaf1; unsigned uNode2 = tree.GetNeighbor1(uLeaf1); double dAccumSpanLength = 0; #if TRACE Log("RootByMidLongestSpan: span=%u", uLeaf1); #endif for (;;) { const double dEdgeLength = tree.GetEdgeLength(uNode1, uNode2); #if TRACE Log("->%u(%g;%g)", uNode2, dEdgeLength, dAccumSpanLength); #endif if (dAccumSpanLength + dEdgeLength >= dTreeHeight) { *ptruNode1 = uNode1; *ptruNode2 = uNode2; *ptrdLength1 = dTreeHeight - dAccumSpanLength; *ptrdLength2 = dEdgeLength - *ptrdLength1; #if TRACE { const EdgeInfo &EI = EIs[uLeaf1][0]; Log("...\n"); Log("Midpoint: Leaf1=%u Leaf2=%u Node1=%u Node2=%u Length1=%g Length2=%g\n", uLeaf1, EI.m_uMostDistantLeaf, *ptruNode1, *ptruNode2, *ptrdLength1, *ptrdLength2); } #endif return; } if (tree.IsLeaf(uNode2)) Quit("RootByMidLongestSpan: internal error 4"); dAccumSpanLength += dEdgeLength; const unsigned 
uSub = tree.GetNeighborSubscript(uNode1, uNode2); const EdgeInfo &EI = EIs[uNode1][uSub]; if (!EI.m_bSet) Quit("RootByMidLongestSpan: internal error 5"); uNode1 = uNode2; uNode2 = EI.m_uMaxStep; } }
void Tree::PruneTree(const Tree &tree, unsigned Subfams[], unsigned uSubfamCount) { if (!tree.IsRooted()) Quit("Tree::PruneTree: requires rooted tree"); Clear(); m_uNodeCount = 2*uSubfamCount - 1; InitCache(m_uNodeCount); const unsigned uUnprunedNodeCount = tree.GetNodeCount(); unsigned *uUnprunedToPrunedIndex = new unsigned[uUnprunedNodeCount]; unsigned *uPrunedToUnprunedIndex = new unsigned[m_uNodeCount]; for (unsigned n = 0; n < uUnprunedNodeCount; ++n) uUnprunedToPrunedIndex[n] = NULL_NEIGHBOR; for (unsigned n = 0; n < m_uNodeCount; ++n) uPrunedToUnprunedIndex[n] = NULL_NEIGHBOR; // Create mapping between unpruned and pruned node indexes unsigned uInternalNodeIndex = uSubfamCount; for (unsigned uSubfamIndex = 0; uSubfamIndex < uSubfamCount; ++uSubfamIndex) { unsigned uUnprunedNodeIndex = Subfams[uSubfamIndex]; uUnprunedToPrunedIndex[uUnprunedNodeIndex] = uSubfamIndex; uPrunedToUnprunedIndex[uSubfamIndex] = uUnprunedNodeIndex; for (;;) { uUnprunedNodeIndex = tree.GetParent(uUnprunedNodeIndex); if (tree.IsRoot(uUnprunedNodeIndex)) break; // Already visited this node? if (NULL_NEIGHBOR != uUnprunedToPrunedIndex[uUnprunedNodeIndex]) break; uUnprunedToPrunedIndex[uUnprunedNodeIndex] = uInternalNodeIndex; uPrunedToUnprunedIndex[uInternalNodeIndex] = uUnprunedNodeIndex; ++uInternalNodeIndex; } } const unsigned uUnprunedRootIndex = tree.GetRootNodeIndex(); uUnprunedToPrunedIndex[uUnprunedRootIndex] = uInternalNodeIndex; uPrunedToUnprunedIndex[uInternalNodeIndex] = uUnprunedRootIndex; #if TRACE { Log("Pruned to unpruned:\n"); for (unsigned i = 0; i < m_uNodeCount; ++i) Log(" [%u]=%u", i, uPrunedToUnprunedIndex[i]); Log("\n"); Log("Unpruned to pruned:\n"); for (unsigned i = 0; i < uUnprunedNodeCount; ++i) { unsigned n = uUnprunedToPrunedIndex[i]; if (n != NULL_NEIGHBOR) Log(" [%u]=%u", i, n); } Log("\n"); } #endif if (uInternalNodeIndex != m_uNodeCount - 1) Quit("Tree::PruneTree, Internal error"); // Nodes 0, 1 ... 
are the leaves for (unsigned uSubfamIndex = 0; uSubfamIndex < uSubfamCount; ++uSubfamIndex) { char szName[32]; sprintf(szName, "Subfam_%u", uSubfamIndex + 1); m_ptrName[uSubfamIndex] = strsave(szName); } for (unsigned uPrunedNodeIndex = uSubfamCount; uPrunedNodeIndex < m_uNodeCount; ++uPrunedNodeIndex) { unsigned uUnprunedNodeIndex = uPrunedToUnprunedIndex[uPrunedNodeIndex]; const unsigned uUnprunedLeft = tree.GetLeft(uUnprunedNodeIndex); const unsigned uUnprunedRight = tree.GetRight(uUnprunedNodeIndex); const unsigned uPrunedLeft = uUnprunedToPrunedIndex[uUnprunedLeft]; const unsigned uPrunedRight = uUnprunedToPrunedIndex[uUnprunedRight]; const double dLeftLength = tree.GetEdgeLength(uUnprunedNodeIndex, uUnprunedLeft); const double dRightLength = tree.GetEdgeLength(uUnprunedNodeIndex, uUnprunedRight); m_uNeighbor2[uPrunedNodeIndex] = uPrunedLeft; m_uNeighbor3[uPrunedNodeIndex] = uPrunedRight; m_dEdgeLength1[uPrunedLeft] = dLeftLength; m_dEdgeLength1[uPrunedRight] = dRightLength; m_uNeighbor1[uPrunedLeft] = uPrunedNodeIndex; m_uNeighbor1[uPrunedRight] = uPrunedNodeIndex; m_bHasEdgeLength1[uPrunedLeft] = true; m_bHasEdgeLength1[uPrunedRight] = true; m_dEdgeLength2[uPrunedNodeIndex] = dLeftLength; m_dEdgeLength3[uPrunedNodeIndex] = dRightLength; m_bHasEdgeLength2[uPrunedNodeIndex] = true; m_bHasEdgeLength3[uPrunedNodeIndex] = true; } m_uRootNodeIndex = uUnprunedToPrunedIndex[uUnprunedRootIndex]; m_bRooted = true; Validate(); delete[] uUnprunedToPrunedIndex; }
static void RootByMinAvgLeafDist(const Tree &tree, EdgeInfo **EIs, unsigned *ptruNode1, unsigned *ptruNode2, double *ptrdLength1, double *ptrdLength2) { const unsigned uNodeCount = tree.GetNodeCount(); const unsigned uLeafCount = tree.GetLeafCount(); unsigned uNode1 = NULL_NEIGHBOR; unsigned uNode2 = NULL_NEIGHBOR; double dMinHeight = VERY_LARGE_DOUBLE; double dBestLength1 = VERY_LARGE_DOUBLE; double dBestLength2 = VERY_LARGE_DOUBLE; for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { const unsigned uNeighborCount = tree.GetNeighborCount(uNodeIndex); for (unsigned uSub = 0; uSub < uNeighborCount; ++uSub) { const unsigned uNeighborIndex = tree.GetNeighbor(uNodeIndex, uSub); // Avoid visiting same edge a second time in reversed order. if (uNeighborIndex < uNodeIndex) continue; const unsigned uSubRev = tree.GetNeighborSubscript(uNeighborIndex, uNodeIndex); if (NULL_NEIGHBOR == uSubRev) Quit("RootByMinAvgLeafDist, internal error 1"); // Get info for edges Node1->Node2 and Node2->Node1 (reversed) const EdgeInfo &EI = EIs[uNodeIndex][uSub]; const EdgeInfo &EIRev = EIs[uNeighborIndex][uSubRev]; if (EI.m_uNode1 != uNodeIndex || EI.m_uNode2 != uNeighborIndex || EIRev.m_uNode1 != uNeighborIndex || EIRev.m_uNode2 != uNodeIndex) Quit("RootByMinAvgLeafDist, internal error 2"); if (!EI.m_bSet) Quit("RootByMinAvgLeafDist, internal error 3"); if (uLeafCount != EI.m_uLeafCount + EIRev.m_uLeafCount) Quit("RootByMinAvgLeafDist, internal error 4"); const double dEdgeLength = tree.GetEdgeLength(uNodeIndex, uNeighborIndex); if (dEdgeLength != tree.GetEdgeLength(uNeighborIndex, uNodeIndex)) Quit("RootByMinAvgLeafDist, internal error 5"); // Consider point p on edge 12 in tree (1=Node, 2=Neighbor). // // ----- ---- // | | // 1----p--2 // | | // ----- ---- // // Define: // ADLp = average distance to leaves to left of point p. // ADRp = average distance to leaves to right of point p. // L = edge length = distance 12 // x = distance 1p // So distance p2 = L - x. 
// Average distance from p to leaves on left of p is: // ADLp = ADL1 + x // Average distance from p to leaves on right of p is: // ADRp = ADR2 + (L - x) // To be a root, we require these two distances to be equal, // ADLp = ADRp // ADL1 + x = ADR2 + (L - x) // Solving for x, // x = (ADR2 - ADL1 + L)/2 // If 0 <= x <= L, we can place the root on edge 12. const double ADL1 = EI.m_dTotalDistToLeaves / EI.m_uLeafCount; const double ADR2 = EIRev.m_dTotalDistToLeaves / EIRev.m_uLeafCount; const double x = (ADR2 - ADL1 + dEdgeLength)/2.0; if (x >= 0 && x <= dEdgeLength) { const double dLength1 = x; const double dLength2 = dEdgeLength - x; const double dHeight1 = EI.m_dMaxDistToLeaf + dLength1; const double dHeight2 = EIRev.m_dMaxDistToLeaf + dLength2; const double dHeight = dHeight1 >= dHeight2 ? dHeight1 : dHeight2; #if TRACE Log("Candidate root Node1=%u Node2=%u Height=%g\n", uNodeIndex, uNeighborIndex, dHeight); #endif if (dHeight < dMinHeight) { uNode1 = uNodeIndex; uNode2 = uNeighborIndex; dBestLength1 = dLength1; dBestLength2 = dLength2; dMinHeight = dHeight; } } } } if (NULL_NEIGHBOR == uNode1 || NULL_NEIGHBOR == uNode2) Quit("RootByMinAvgLeafDist, internal error 6"); #if TRACE Log("Best root Node1=%u Node2=%u Length1=%g Length2=%g Height=%g\n", uNode1, uNode2, dBestLength1, dBestLength2, dMinHeight); #endif *ptruNode1 = uNode1; *ptruNode2 = uNode2; *ptrdLength1 = dBestLength1; *ptrdLength2 = dBestLength2; }