static void SeqVectFromMSACols(const MSA &msa, unsigned uColFrom, unsigned uColTo, SeqVect &v) { v.Clear(); const unsigned uSeqCount = msa.GetSeqCount(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { Seq s; SeqFromMSACols(msa, uSeqIndex, uColFrom, uColTo, s); v.AppendSeq(s); } }
unsigned EstringOp(const short es[], const Seq &sIn, MSA &a) { unsigned uSymbols; unsigned uIndels; EstringCounts(es, &uSymbols, &uIndels); assert(sIn.Length() == uSymbols); unsigned uColCount = uSymbols + uIndels; a.Clear(); a.SetSize(1, uColCount); a.SetSeqName(0, sIn.GetName()); a.SetSeqId(0, sIn.GetId()); unsigned p = 0; unsigned uColIndex = 0; for (;;) { int n = *es++; if (0 == n) break; if (n > 0) for (int i = 0; i < n; ++i) { char c = sIn[p++]; a.SetChar(0, uColIndex++, c); } else for (int i = 0; i < -n; ++i) a.SetChar(0, uColIndex++, '-'); } assert(uColIndex == uColCount); return uColCount; }
void Stabilize(const MSA &msa, MSA &msaStable) { const unsigned uSeqCount = msa.GetSeqCount(); const unsigned uColCount = msa.GetColCount(); msaStable.SetSize(uSeqCount, uColCount); for (unsigned uId = 0; uId < uSeqCount; ++uId) { const unsigned uSeqIndex = msa.GetSeqIndex(uId); msaStable.SetSeqName(uId, msa.GetSeqName(uSeqIndex)); msaStable.SetSeqId(uSeqIndex, uId); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { const char c = msa.GetChar(uSeqIndex, uColIndex); msaStable.SetChar(uId, uColIndex, c); } } }
static void AppendUnalignedTerminals(const MSA &msaA, unsigned &uColIndexA, unsigned uColCountA, const MSA &msaB, unsigned &uColIndexB, unsigned uColCountB, unsigned uSeqCountA, unsigned uSeqCountB, MSA &msaCombined, unsigned &uColIndexCombined) { #if TRACE Log("AppendUnalignedTerminals ColIxA=%u ColIxB=%u ColIxCmb=%u\n", uColIndexA, uColIndexB, uColIndexCombined); #endif const unsigned uLengthA = msaA.GetColCount(); const unsigned uLengthB = msaB.GetColCount(); unsigned uNewColCount = uColCountA; if (uColCountB > uNewColCount) uNewColCount = uColCountB; for (unsigned n = 0; n < uColCountA; ++n) { for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA) { char c = msaA.GetChar(uSeqIndexA, uColIndexA + n); c = UnalignChar(c); msaCombined.SetChar(uSeqIndexA, uColIndexCombined + n, c); } } for (unsigned n = uColCountA; n < uNewColCount; ++n) { for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA) msaCombined.SetChar(uSeqIndexA, uColIndexCombined + n, '.'); } for (unsigned n = 0; n < uColCountB; ++n) { for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB) { char c = msaB.GetChar(uSeqIndexB, uColIndexB + n); c = UnalignChar(c); msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined + n, c); } } for (unsigned n = uColCountB; n < uNewColCount; ++n) { for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB) msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined + n, '.'); } uColIndexCombined += uNewColCount; uColIndexA += uColCountA; uColIndexB += uColCountB; }
// Finishes the alignment with MHackEnd and hands it to DoOutput.
// When g_bStable is set, the rows are first reordered via Stabilize and the
// reordered copy is what gets written; msa itself is cleared in that case.
void MuscleOutput(MSA &msa)
{
    MHackEnd(msa);

    if (!g_bStable)
    {
        DoOutput(msa);
        return;
    }

    MSA msaStable;
    Stabilize(msa, msaStable);
    // The original is no longer needed once the copy exists; drop it
    // before output to save memory.
    msa.Clear();
    DoOutput(msaStable);
}
// Converts a finished MUSCLE alignment into ma.
// If mhack is set, MHackEnd is applied to msa first. When the context's
// g_bStable parameter is set, the rows are reordered with Stabilize and the
// reordered copy is converted; msa is cleared in that case.
void prepareAlignResults(MSA& msa, const DNAAlphabet* al, MultipleSequenceAlignment& ma, bool mhack) {
    if (mhack) {
        MHackEnd(msa);
    }

    MuscleContext* ctx = getMuscleContext();
    if (!ctx->params.g_bStable) {
        convertMSA2MAlignment(msa, al, ma);
        return;
    }

    MSA msaStable;
    Stabilize(msa, msaStable);
    msa.Clear();
    convertMSA2MAlignment(msaStable, al, ma);
}
void WriteScoreFile(const MSA &msa) { MuscleContext *ctx = getMuscleContext(); FILE *f = fopen(ctx->params.g_pstrScoreFileName, "w"); if (0 == f) Quit("Cannot open score file '%s' errno=%d", ctx->params.g_pstrScoreFileName, errno); const unsigned uColCount = msa.GetColCount(); const unsigned uSeqCount = msa.GetSeqCount(); for (unsigned uCol = 0; uCol < uColCount; ++uCol) { double Score = GetColScore(msa, uCol); fprintf(f, "%10.3f ", Score); for (unsigned uSeq = 0; uSeq < uSeqCount; ++uSeq) { char c = msa.GetChar(uSeq, uCol); fprintf(f, "%c", c); } fprintf(f, "\n"); } fclose(f); }
// TODO: This could be much faster, no need to look // at all columns. static void FindIntersectingGaps(const MSA &msa, unsigned SeqIndex) { MuscleContext *ctx = getMuscleContext(); GAPINFO** &g_Gaps = ctx->scoregaps.g_Gaps; bool* &g_ColDiff = ctx->scoregaps.g_ColDiff; const unsigned ColCount = msa.GetColCount(); bool InGap = false; bool Intersects = false; unsigned Start = uInsane; for (unsigned Col = 0; Col <= ColCount; ++Col) { bool Gap = ((Col != ColCount) && msa.IsGap(SeqIndex, Col)); if (Gap) { if (!InGap) { InGap = true; Start = Col; } if (g_ColDiff[Col]) Intersects = true; } else if (InGap) { InGap = false; if (Intersects) { GAPINFO *GI = NewGapInfo(); GI->Start = Start; GI->End = Col - 1; GI->Next = g_Gaps[SeqIndex]; g_Gaps[SeqIndex] = GI; } Intersects = false; } } }
// Returns the sum of ScoreColLetters over the columns listed in Edges.
// Edges holds uEdgeCount column indexes, each expected to be a valid
// column of msa (checked by assert only).
static SCORE ScoreLetters(const MSA &msa, const unsigned Edges[], unsigned uEdgeCount)
{
    const unsigned uColCount = msa.GetColCount();

    SCORE scoreSum = 0;
    unsigned uEdge = 0;
    while (uEdge < uEdgeCount)
    {
        const unsigned uCol = Edges[uEdge];
        assert(uCol < uColCount);
        scoreSum += ScoreColLetters(msa, uCol);
        ++uEdge;
    }
    return scoreSum;
}
// Return true if the given column has no gaps and all // its residues are in the same biochemical group. bool MSAColIsConservative(const MSA &msa, unsigned uColIndex) { extern unsigned ResidueGroup[]; const unsigned uSeqCount = msa.GetColCount(); if (0 == uSeqCount) Quit("MSAColIsConservative: empty alignment"); if (msa.IsGap(0, uColIndex)) return false; unsigned uLetter = msa.GetLetterEx(0, uColIndex); const unsigned uGroup = ResidueGroup[uLetter]; for (unsigned uSeqIndex = 1; uSeqIndex < uSeqCount; ++uSeqIndex) { if (msa.IsGap(uSeqIndex, uColIndex)) return false; uLetter = msa.GetLetter(uSeqIndex, uColIndex); if (ResidueGroup[uLetter] != uGroup) return false; } return true; }
// Appends every row of ma to muscleMSA.
//
// Each row is copied into a freshly allocated, NUL-terminated buffer of
// ma->getLength() characters: the row's core bytes followed by '-' padding.
// The row name is copied in its local 8-bit encoding. For every row i the
// context's tmp_uIds[i] is set from input_uIds[i]. If fixAlpha is set,
// muscleMSA.FixAlpha() is called once all rows are appended.
//
// NOTE(review): the seq and name buffers are not freed here; presumably
// MSA::AppendSeq takes ownership of them -- confirm.
void convertMAlignment2MSA(MSA& muscleMSA, const MultipleSequenceAlignment& ma, bool fixAlpha) {
    MuscleContext *ctx = getMuscleContext();
    ctx->fillUidsVectors(ma->getNumRows());

    for (int i = 0, n = ma->getNumRows(); i < n; i++) {
        const MultipleSequenceAlignmentRow row = ma->getMsaRow(i);

        int coreLen = row->getCoreLength();
        int maLen = ma->getLength();
        char* seq = new char[maLen + 1];
        memcpy(seq, row->getCore().constData(), coreLen);
        // Pad from the end of the core through the terminator slot; the
        // terminator itself is written on the next line.
        memset(seq + coreLen, '-', maLen - coreLen + 1);
        seq[maLen] = 0;

        // Size the name buffer from the encoded bytes, not from the number
        // of QString characters: the two differ for names with characters
        // that encode to more than one byte, and using the character count
        // would cut such names short.
        const auto nameBytes = row->getName().toLocal8Bit();
        const int nameLen = nameBytes.size();
        char* name = new char[nameLen + 1];
        memcpy(name, nameBytes.constData(), nameLen);
        name[nameLen] = '\0';

        muscleMSA.AppendSeq(seq, maLen, name);
        ctx->tmp_uIds[i] = ctx->input_uIds[i];
    }

    if (fixAlpha) {
        muscleMSA.FixAlpha();
    }
}
void DoSP() { MuscleContext *ctx = getMuscleContext(); TextFile f(ctx->params.g_pstrSPFileName); MSA a; a.FromFile(f); ALPHA Alpha = ALPHA_Undefined; switch (ctx->params.g_SeqType) { case SEQTYPE_Auto: Alpha = a.GuessAlpha(); break; case SEQTYPE_Protein: Alpha = ALPHA_Amino; break; case SEQTYPE_DNA: Alpha = ALPHA_DNA; break; case SEQTYPE_RNA: Alpha = ALPHA_RNA; break; default: Quit("Invalid SeqType"); } SetAlpha(Alpha); a.FixAlpha(); SetPPScore(); const unsigned uSeqCount = a.GetSeqCount(); if (0 == uSeqCount) Quit("No sequences in input file %s", ctx->params.g_pstrSPFileName); MSA::SetIdCount(uSeqCount); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) a.SetSeqId(uSeqIndex, uSeqIndex); SetSeqWeightMethod(ctx->params.g_SeqWeight1); Tree tree; TreeFromMSA(a, tree, ctx->params.g_Cluster2, ctx->params.g_Distance2, ctx->params.g_Root2); SetMuscleTree(tree); SetMSAWeightsMuscle((MSA &) a); SCORE SP = ObjScoreSP(a); Log("File=%s;SP=%.4g\n", ctx->params.g_pstrSPFileName, SP); fprintf(stderr, "File=%s;SP=%.4g\n", ctx->params.g_pstrSPFileName, SP); }
void MSAFromSeqSubset(const MSA &msaIn, const unsigned uSeqIndexes[], unsigned uSeqCount, MSA &msaOut) { const unsigned uColCount = msaIn.GetColCount(); msaOut.SetSize(uSeqCount, uColCount); for (unsigned uSeqIndexOut = 0; uSeqIndexOut < uSeqCount; ++uSeqIndexOut) { unsigned uSeqIndexIn = uSeqIndexes[uSeqIndexOut]; const char *ptrName = msaIn.GetSeqName(uSeqIndexIn); unsigned uId = msaIn.GetSeqId(uSeqIndexIn); msaOut.SetSeqName(uSeqIndexOut, ptrName); msaOut.SetSeqId(uSeqIndexOut, uId); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { const char c = msaIn.GetChar(uSeqIndexIn, uColIndex); msaOut.SetChar(uSeqIndexOut, uColIndex, c); } } }
// Removes every column for which msa.IsGapColumn() is true.
// Surviving columns are compacted to the left in place, then the
// left-over columns at the right end are deleted.
void StripGapColumns( MSA& msa )
{
    unsigned uKeptCols = 0;     // next destination column for a kept column
    for (unsigned uSrcCol = 0; uSrcCol < msa.GetColCount(); ++uSrcCol)
    {
        if (msa.IsGapColumn(uSrcCol))
            continue;

        for (unsigned uRow = 0; uRow < msa.GetSeqCount(); ++uRow)
        {
            const char c = msa.GetChar(uRow, uSrcCol);
            msa.SetChar(uRow, uKeptCols, c);
        }
        ++uKeptCols;
    }

    const unsigned uSurplusCols = msa.GetColCount() - uKeptCols;
    msa.DeleteColumns(uKeptCols, uSurplusCols);
}
// Append msa2 at the end of msa1 void MSAAppend(MSA &msa1, const MSA &msa2) { const unsigned uSeqCount = msa1.GetSeqCount(); const unsigned uColCount1 = msa1.GetColCount(); const unsigned uColCount2 = msa2.GetColCount(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { unsigned uId = msa1.GetSeqId(uSeqIndex); unsigned uSeqIndex2 = msa2.GetSeqIndex(uId); for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex) { const char c = msa2.GetChar(uSeqIndex2, uColIndex); msa1.SetChar(uSeqIndex, uColCount1 + uColIndex, c); } } }
static void AppendUnalignedTerminals(const MSA &msaA, unsigned &uColIndexA, unsigned uColCountA, const MSA &msaB, unsigned &uColIndexB, unsigned uColCountB, unsigned uSeqCountA, unsigned uSeqCountB, MSA &msaCombined, unsigned &uColIndexCombined) { MuscleContext *ctx = getMuscleContext(); char *g_UnalignChar = ctx->alpha.g_UnalignChar; #if TRACE Log("AppendUnalignedTerminals ColIxA=%u ColIxB=%u ColIxCmb=%u\n", uColIndexA, uColIndexB, uColIndexCombined); #endif unsigned uNewColCount = uColCountA; if (uColCountB > uNewColCount) uNewColCount = uColCountB; for (unsigned n = 0; n < uColCountA; ++n) { for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA) { char c = msaA.GetChar(uSeqIndexA, uColIndexA + n); c = UnalignChar(c); msaCombined.SetChar(uSeqIndexA, uColIndexCombined + n, c); } } for (unsigned n = uColCountA; n < uNewColCount; ++n) { for (unsigned uSeqIndexA = 0; uSeqIndexA < uSeqCountA; ++uSeqIndexA) msaCombined.SetChar(uSeqIndexA, uColIndexCombined + n, '.'); } for (unsigned n = 0; n < uColCountB; ++n) { for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB) { char c = msaB.GetChar(uSeqIndexB, uColIndexB + n); c = UnalignChar(c); msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined + n, c); } } for (unsigned n = uColCountB; n < uNewColCount; ++n) { for (unsigned uSeqIndexB = 0; uSeqIndexB < uSeqCountB; ++uSeqIndexB) msaCombined.SetChar(uSeqCountA + uSeqIndexB, uColIndexCombined + n, '.'); } uColIndexCombined += uNewColCount; uColIndexA += uColCountA; uColIndexB += uColCountB; }
// Assigns three-way sequence weights to msa.
//
// If either split node stored in the context is NULL_NEIGHBOR, falls back
// to msa.SetHenikoffWeightsPB() and returns. Otherwise weights are computed
// for every leaf of the context's MUSCLE tree with CalcThreeWayWeights,
// looked up by sequence id for each row of msa, and finally normalized
// to sum 1.0. Calls Quit if a row's id is not below the tree's leaf count.
void SetThreeWayWeightsMuscle(MSA &msa)
{
    MuscleContext *ctx =getMuscleContext();
    const Tree* &g_ptrMuscleTree = ctx->msa2.g_ptrMuscleTree;
    unsigned &g_uTreeSplitNode1 = ctx->msa2.g_uTreeSplitNode1;
    unsigned &g_uTreeSplitNode2 = ctx->msa2.g_uTreeSplitNode2;

    const bool bNoSplit =
        (NULL_NEIGHBOR == g_uTreeSplitNode1) || (NULL_NEIGHBOR == g_uTreeSplitNode2);
    if (bNoSplit)
    {
        msa.SetHenikoffWeightsPB();
        return;
    }

    const unsigned uMuscleSeqCount = g_ptrMuscleTree->GetLeafCount();
    WEIGHT *Weights = new WEIGHT[uMuscleSeqCount];

    CalcThreeWayWeights(*g_ptrMuscleTree, g_uTreeSplitNode1, g_uTreeSplitNode2,
        Weights);

    const unsigned uMSASeqCount = msa.GetSeqCount();
    unsigned uRow = 0;
    while (uRow < uMSASeqCount)
    {
        const unsigned uId = msa.GetSeqId(uRow);
        if (uId >= uMuscleSeqCount)
            Quit("SetThreeWayWeightsMuscle: id out of range");
        msa.SetSeqWeight(uRow, Weights[uId]);
        ++uRow;
    }
#if LOCAL_VERBOSE
    {
    Log("SetThreeWayWeightsMuscle\n");
    for (unsigned n = 0; n < uMSASeqCount; ++n)
        {
        const unsigned uId = msa.GetSeqId(n);
        Log("%20.20s %6.3f\n", msa.GetSeqName(n), Weights[uId]);
        }
    }
#endif
    msa.NormalizeWeights((WEIGHT) 1.0);

    delete[] Weights;
}
// Copies the MUSCLE alignment msa into res, which must be empty on entry
// (checked by assert only). The alphabet of res is set to al.
// For every row, the characters are gathered into a byte array and added
// to res under the row's name. The context's output_uIds list is rebuilt
// alongside: entry i is tmp_uIds looked up by the id of row i.
void convertMSA2MAlignment(MSA& msa, const DNAAlphabet* al, MultipleSequenceAlignment& res) {
    assert(res->isEmpty());

    MuscleContext *ctx = getMuscleContext();
    res->setAlphabet(al);
    ctx->output_uIds.clear();

    const int rowCount = msa.GetSeqCount();
    for (int row = 0; row < rowCount; row++) {
        QString name = msa.GetSeqName(row);

        QByteArray seq;
        seq.reserve(msa.GetColCount());
        const int colCount = msa.GetColCount();
        for (int col = 0; col < colCount; col++) {
            seq.append(msa.GetChar(row, col));
        }

        ctx->output_uIds.append(ctx->tmp_uIds[msa.GetSeqId(row)]);
        res->addRow(name, seq);
    }
}
// Fills msaOut with uSeqCount consecutive rows of msaIn, starting at row
// uFromSeqIndex. Names and characters are copied; sequence ids are not.
// msaOut is resized to uSeqCount rows by the column count of msaIn.
void MSAFromSeqRange(const MSA &msaIn, unsigned uFromSeqIndex, unsigned uSeqCount,
    MSA &msaOut)
{
    const unsigned uColCount = msaIn.GetColCount();
    msaOut.SetSize(uSeqCount, uColCount);

    unsigned uRowOut = 0;
    while (uRowOut < uSeqCount)
    {
        const unsigned uRowIn = uFromSeqIndex + uRowOut;

        msaOut.SetSeqName(uRowOut, msaIn.GetSeqName(uRowIn));

        unsigned uCol = 0;
        while (uCol < uColCount)
        {
            msaOut.SetChar(uRowOut, uCol, msaIn.GetChar(uRowIn, uCol));
            ++uCol;
        }

        ++uRowOut;
    }
}
// The usual sum-of-pairs objective score: sum the score // of the alignment of each pair of sequences. SCORE ObjScoreDA(const MSA &msa, SCORE *ptrLetters, SCORE *ptrGaps) { const unsigned uSeqCount = msa.GetSeqCount(); SCORE scoreTotal = 0; unsigned uPairCount = 0; #if TRACE msa.LogMe(); Log(" Score Weight Weight Total\n"); Log("---------- ------ ------ ----------\n"); #endif SCORE TotalLetters = 0; SCORE TotalGaps = 0; for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1) { const WEIGHT w1 = msa.GetSeqWeight(uSeqIndex1); for (unsigned uSeqIndex2 = uSeqIndex1 + 1; uSeqIndex2 < uSeqCount; ++uSeqIndex2) { const WEIGHT w2 = msa.GetSeqWeight(uSeqIndex2); const WEIGHT w = w1*w2; SCORE Letters; SCORE Gaps; SCORE scorePair = ScoreSeqPair(msa, uSeqIndex1, msa, uSeqIndex2, &Letters, &Gaps); scoreTotal += w1*w2*scorePair; TotalLetters += w1*w2*Letters; TotalGaps += w1*w2*Gaps; ++uPairCount; #if TRACE Log("%10.2f %6.3f %6.3f %10.2f %d=%s %d=%s\n", scorePair, w1, w2, scorePair*w1*w2, uSeqIndex1, msa.GetSeqName(uSeqIndex1), uSeqIndex2, msa.GetSeqName(uSeqIndex2)); #endif } } *ptrLetters = TotalLetters; *ptrGaps = TotalGaps; return scoreTotal; }
SCORE ScoreSeqPairLetters(const MSA &msa1, unsigned uSeqIndex1, const MSA &msa2, unsigned uSeqIndex2) { const unsigned uColCount = msa1.GetColCount(); const unsigned uColCount2 = msa2.GetColCount(); if (uColCount != uColCount2) Quit("ScoreSeqPairLetters, different lengths"); #if TRACE_SEQPAIR { Log("\n"); Log("ScoreSeqPairLetters\n"); MSA msaTmp; msaTmp.SetSize(2, uColCount); msaTmp.CopySeq(0, msa1, uSeqIndex1); msaTmp.CopySeq(1, msa2, uSeqIndex2); msaTmp.LogMe(); } #endif SCORE scoreLetters = 0; SCORE scoreGaps = 0; bool bGapping1 = false; bool bGapping2 = false; unsigned uColStart = 0; bool bLeftTermGap = false; for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex); bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex); if (!bGap1 || !bGap2) { if (bGap1 || bGap2) bLeftTermGap = true; uColStart = uColIndex; break; } } unsigned uColEnd = uColCount - 1; bool bRightTermGap = false; for (int iColIndex = (int) uColCount - 1; iColIndex >= 0; --iColIndex) { bool bGap1 = msa1.IsGap(uSeqIndex1, iColIndex); bool bGap2 = msa2.IsGap(uSeqIndex2, iColIndex); if (!bGap1 || !bGap2) { if (bGap1 || bGap2) bRightTermGap = true; uColEnd = (unsigned) iColIndex; break; } } #if TRACE_SEQPAIR Log("LeftTermGap=%d RightTermGap=%d\n", bLeftTermGap, bRightTermGap); #endif for (unsigned uColIndex = uColStart; uColIndex <= uColEnd; ++uColIndex) { unsigned uLetter1 = msa1.GetLetterEx(uSeqIndex1, uColIndex); if (uLetter1 >= g_AlphaSize) continue; unsigned uLetter2 = msa2.GetLetterEx(uSeqIndex2, uColIndex); if (uLetter2 >= g_AlphaSize) continue; SCORE scoreMatch = (*g_ptrScoreMatrix)[uLetter1][uLetter2]; scoreLetters += scoreMatch; } return scoreLetters; }
// Top-level driver for a full alignment run.
//
// Steps, in order:
//   1. Read the input sequences (FASTA) named by g_pstrInFileName and set
//      the alphabet from g_SeqType (guessed for SEQTYPE_Auto).
//   2. Optionally load a user score matrix from g_pstrMatrixFileName,
//      looked up under $MUSCLE_MXPATH when that variable is set.
//   3. Assign sequence ids in input order.
//   4. Obtain a guide tree: read from g_pstrUseTreeFileName when given
//      (must be rooted and match the input labels), otherwise built from
//      the sequences.
//   5. Progressive alignment, then (unless g_uMaxIters is 1 or there are
//      only two sequences) tree refinement and iterative refinement.
//   6. Output through MuscleOutput.
//
// Early exits: a single input sequence is written straight to the output
// file; the "Tree1" option together with g_bClusterOnly stops after the
// tree is written; g_pstrComputeWeightsFileName stops after the weights
// are written.
//
// CVLocation is handed to the result alignment via SetCompositeVector.
void DoMuscle(CompositeVect*CVLocation)
{
    SetOutputFileName(g_pstrOutFileName);
    SetInputFileName(g_pstrInFileName);

    SetMaxIters(g_uMaxIters);
    SetSeqWeightMethod(g_SeqWeight1);

    TextFile fileIn(g_pstrInFileName);
    SeqVect v;
    v.FromFASTAFile(fileIn);
    const unsigned uSeqCount = v.Length();

    if (0 == uSeqCount)
        Quit("No sequences in input file");

    ALPHA Alpha = ALPHA_Undefined;
    switch (g_SeqType)
    {
    case SEQTYPE_Auto:
        Alpha = v.GuessAlpha();
        break;

    case SEQTYPE_Protein:
        Alpha = ALPHA_Amino;
        break;

    case SEQTYPE_DNA:
        Alpha = ALPHA_DNA;
        break;

    case SEQTYPE_RNA:
        Alpha = ALPHA_RNA;
        break;

    default:
        Quit("Invalid seq type");
    }
    SetAlpha(Alpha);
    v.FixAlpha();

    PTR_SCOREMATRIX UserMatrix = 0;
    if (0 != g_pstrMatrixFileName)
    {
        const char *FileName = g_pstrMatrixFileName;
        const char *Path = getenv("MUSCLE_MXPATH");
        if (Path != 0)
        {
            // Room for Path + '/' + FileName + terminating NUL.
            size_t n = strlen(Path) + 1 + strlen(FileName) + 1;
            char *NewFileName = new char[n];
            sprintf(NewFileName, "%s/%s", Path, FileName);
            // NOTE(review): NewFileName is never freed. Whether TextFile
            // keeps the pointer is not visible here, so it is left alone.
            FileName = NewFileName;
        }
        TextFile File(FileName);
        UserMatrix = ReadMx(File);
        g_Alpha = ALPHA_Amino;
        g_PPScore = PPSCORE_SP;
    }

    SetPPScore();

    // Install the user matrix only after SetPPScore() has run.
    if (0 != UserMatrix)
        g_ptrScoreMatrix = UserMatrix;

    unsigned uMaxL = 0;
    unsigned uTotL = 0;
    for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
    {
        unsigned L = v.GetSeq(uSeqIndex).Length();
        uTotL += L;
        if (L > uMaxL)
            uMaxL = L;
    }

    SetIter(1);
    g_bDiags = g_bDiags1;
    SetSeqStats(uSeqCount, uMaxL, uTotL/uSeqCount);

    SetMuscleSeqVect(v);

    MSA::SetIdCount(uSeqCount);

    // Initialize sequence ids.
    // From this point on, ids must somehow propagate from here.
    for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
        v.SetSeqId(uSeqIndex, uSeqIndex);

    // uSeqCount is known to be non-zero here (checked right after reading
    // the input, and already used as a divisor above), so only the
    // single-sequence case needs special handling.
    if (1 == uSeqCount)
    {
        TextFile fileOut(g_pstrOutFileName, true);
        v.ToFile(fileOut);
        return;
    }

    // Two or more sequences from here on.
    MHackStart(v);

    // First iteration
    Tree GuideTree;
    if (0 != g_pstrUseTreeFileName)
    {
        // Discourage users...
        if (!g_bUseTreeNoWarn)
            fprintf(stderr, "%s", g_strUseTreeWarning);

        // Read tree from file
        TextFile TreeFile(g_pstrUseTreeFileName);
        GuideTree.FromFile(TreeFile);

        // Make sure tree is rooted
        if (!GuideTree.IsRooted())
            Quit("User tree must be rooted");

        if (GuideTree.GetLeafCount() != uSeqCount)
            Quit("User tree does not match input sequences");

        // Attach the id of the matching input sequence to every leaf.
        const unsigned uNodeCount = GuideTree.GetNodeCount();
        for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex)
        {
            if (!GuideTree.IsLeaf(uNodeIndex))
                continue;
            const char *LeafName = GuideTree.GetLeafName(uNodeIndex);
            unsigned uSeqIndex;
            bool SeqFound = v.FindName(LeafName, &uSeqIndex);
            if (!SeqFound)
                Quit("Label %s in tree does not match sequences", LeafName);
            unsigned uId = v.GetSeqIdFromName(LeafName);
            GuideTree.SetLeafId(uNodeIndex, uId);
        }
    }
    else
        TreeFromSeqVect(v, GuideTree, g_Cluster1, g_Distance1, g_Root1,
            g_pstrDistMxFileName1);

    const char *Tree1 = ValueOpt("Tree1");
    if (0 != Tree1)
    {
        TextFile f(Tree1, true);
        GuideTree.ToFile(f);
        if (g_bClusterOnly)
            return;
    }

    SetMuscleTree(GuideTree);
    ValidateMuscleIds(GuideTree);

    MSA msa;
    msa.SetCompositeVector(CVLocation);
    ProgNode *ProgNodes = 0;
    if (g_bLow)
        ProgNodes = ProgressiveAlignE(v, GuideTree, msa);
    else
        ProgressiveAlign(v, GuideTree, msa);
    SetCurrentAlignment(msa);

    if (0 != g_pstrComputeWeightsFileName)
    {
        extern void OutWeights(const char *FileName, const MSA &msa);
        SetMSAWeightsMuscle(msa);
        OutWeights(g_pstrComputeWeightsFileName, msa);
        return;
    }

    ValidateMuscleIds(msa);

    // Nothing to refine with a single iteration or just two sequences.
    if (1 == g_uMaxIters || 2 == uSeqCount)
    {
        MuscleOutput(msa);
        return;
    }

    // A user-supplied tree is used as is; otherwise refine the tree.
    if (0 == g_pstrUseTreeFileName)
    {
        g_bDiags = g_bDiags2;
        SetIter(2);

        if (g_bLow)
        {
            if (0 != g_uMaxTreeRefineIters)
                RefineTreeE(msa, v, GuideTree, ProgNodes);
        }
        else
            RefineTree(msa, GuideTree);

        const char *Tree2 = ValueOpt("Tree2");
        if (0 != Tree2)
        {
            TextFile f(Tree2, true);
            GuideTree.ToFile(f);
        }
    }

    SetSeqWeightMethod(g_SeqWeight2);
    SetMuscleTree(GuideTree);

    // The first two iterations are accounted for above.
    if (g_bAnchors)
        RefineVert(msa, GuideTree, g_uMaxIters - 2);
    else
        RefineHoriz(msa, GuideTree, g_uMaxIters - 2, false, false);

#if 0
    // Refining by subfamilies is disabled as it didn't give better
    // results. I tried doing this before and after RefineHoriz.
    // Should get back to this as it seems like this should work.
    RefineSubfams(msa, GuideTree, g_uMaxIters - 2);
#endif

    ValidateMuscleIds(msa);
    ValidateMuscleIds(GuideTree);

    MuscleOutput(msa);
}
// "Catenate" two MSAs (by bad analogy with UNIX cat command). // msa1 and msa2 must have same sequence names, but possibly // in a different order. // msaCat is the combined alignment produce by appending // sequences in msa2 to sequences in msa1. void MSACat(const MSA &msa1, const MSA &msa2, MSA &msaCat) { const unsigned uSeqCount = msa1.GetSeqCount(); const unsigned uColCount1 = msa1.GetColCount(); const unsigned uColCount2 = msa2.GetColCount(); const unsigned uColCountCat = uColCount1 + uColCount2; msaCat.SetSize(uSeqCount, uColCountCat); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { for (unsigned uColIndex = 0; uColIndex < uColCount1; ++uColIndex) { const char c = msa1.GetChar(uSeqIndex, uColIndex); msaCat.SetChar(uSeqIndex, uColIndex, c); } const char *ptrSeqName = msa1.GetSeqName(uSeqIndex); unsigned uSeqIndex2; msaCat.SetSeqName(uSeqIndex, ptrSeqName); bool bFound = msa2.GetSeqIndex(ptrSeqName, &uSeqIndex2); if (bFound) { for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex) { const char c = msa2.GetChar(uSeqIndex2, uColIndex); msaCat.SetChar(uSeqIndex, uColCount1 + uColIndex, c); } } else { for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex) msaCat.SetChar(uSeqIndex, uColCount1 + uColIndex, '-'); } } }
// Refines msa one subfamily at a time.
//
// The tree is cut into subfamilies with GetSubfams (height limit 0.6, at
// most 16 subfamilies). For each subfamily the member rows are extracted
// from msa by id, columns are removed with DeleteGappedCols, a new tree is
// estimated from the sub-alignment, and the sub-alignment is refined with
// RefineVert or RefineHoriz (depending on the g_bAnchors parameter) for
// uIters iterations. Subfamilies with two or fewer sequences are skipped.
//
// If any subfamily changed, the refined sub-alignments are re-aligned
// along the tree by ProgressiveAlignSubfams and the result replaces msa.
//
// Returns true if any subfamily refinement reported a change; returns
// false immediately if msa has fewer than three sequences.
bool RefineSubfams(MSA &msa, const Tree &tree, unsigned uIters)
{
    MuscleContext *ctx = getMuscleContext();
    CLUSTER &g_Cluster2 = ctx->params.g_Cluster2;
    DISTANCE &g_Distance2 = ctx->params.g_Distance2;
    ROOT &g_Root2 = ctx->params.g_Root2;
    bool &g_bAnchors = ctx->params.g_bAnchors;

    const unsigned uSeqCount = msa.GetSeqCount();
    if (uSeqCount < 3)
        return false;

    const double dMaxHeight = 0.6;
    const unsigned uMaxSubfamCount = 16;

    // Subfams is allocated by GetSubfams and released at the end of this
    // function.
    unsigned *Subfams;
    unsigned uSubfamCount;
    GetSubfams(tree, dMaxHeight, uMaxSubfamCount, &Subfams, &uSubfamCount);
    assert(uSubfamCount <= uSeqCount);

    if (ctx->params.g_bVerbose)
        LogSubfams(tree, Subfams, uSubfamCount);

    MSA *SubfamMSAs = new MSA[uSubfamCount];
    // Scratch buffers, sized for the largest possible subfamily.
    unsigned *Leaves = new unsigned[uSeqCount];
    unsigned *Ids = new unsigned[uSeqCount];

    bool bAnyChanges = false;
    for (unsigned uSubfamIndex = 0; uSubfamIndex < uSubfamCount; ++uSubfamIndex)
    {
        unsigned uSubfam = Subfams[uSubfamIndex];
        unsigned uLeafCount;
        GetLeaves(tree, uSubfam, Leaves, &uLeafCount);
        assert(uLeafCount <= uSeqCount);

        LeafIndexesToIds(tree, Leaves, uLeafCount, Ids);

        MSA &msaSubfam = SubfamMSAs[uSubfamIndex];
        MSASubsetByIds(msa, Ids, uLeafCount, msaSubfam);
        DeleteGappedCols(msaSubfam);

#if TRACE
        Log("Subfam %u MSA=\n", uSubfamIndex);
        msaSubfam.LogMe();
#endif

        if (msaSubfam.GetSeqCount() <= 2)
            continue;

        // TODO /////////////////////////////////////////
        // Try using existing tree, may actually hurt to
        // re-estimate, may also be a waste of CPU & mem.
        /////////////////////////////////////////////////
        Tree SubfamTree;
        TreeFromMSA(msaSubfam, SubfamTree, g_Cluster2, g_Distance2, g_Root2);

        bool bAnyChangesThisSubfam;
        if (g_bAnchors)
            bAnyChangesThisSubfam = RefineVert(msaSubfam, SubfamTree, uIters);
        else
            bAnyChangesThisSubfam = RefineHoriz(msaSubfam, SubfamTree, uIters, false, false);

#if TRACE
        Log("Subfam %u Changed %d\n", uSubfamIndex, bAnyChangesThisSubfam);
#endif
        if (bAnyChangesThisSubfam)
            bAnyChanges = true;
    }

    if (bAnyChanges)
        ProgressiveAlignSubfams(tree, Subfams, uSubfamCount, SubfamMSAs, msa);

    // Release every buffer owned by this function, including Ids.
    delete[] Leaves;
    delete[] Ids;
    delete[] Subfams;
    delete[] SubfamMSAs;

    return bAnyChanges;
}
static void ProgressiveAlignSubfams(const Tree &tree, const unsigned Subfams[], unsigned uSubfamCount, const MSA SubfamMSAs[], MSA &msa) { const unsigned uNodeCount = tree.GetNodeCount(); bool *Ready = new bool[uNodeCount]; MSA **MSAs = new MSA *[uNodeCount]; for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { Ready[uNodeIndex] = false; MSAs[uNodeIndex] = 0; } for (unsigned uSubfamIndex = 0; uSubfamIndex < uSubfamCount; ++uSubfamIndex) { unsigned uNodeIndex = Subfams[uSubfamIndex]; Ready[uNodeIndex] = true; MSA *ptrMSA = new MSA; // TODO: Wasteful copy, needs re-design ptrMSA->Copy(SubfamMSAs[uSubfamIndex]); MSAs[uNodeIndex] = ptrMSA; } for (unsigned uNodeIndex = tree.FirstDepthFirstNode(); NULL_NEIGHBOR != uNodeIndex; uNodeIndex = tree.NextDepthFirstNode(uNodeIndex)) { if (tree.IsLeaf(uNodeIndex)) continue; unsigned uRight = tree.GetRight(uNodeIndex); unsigned uLeft = tree.GetLeft(uNodeIndex); if (!Ready[uRight] || !Ready[uLeft]) continue; MSA *ptrLeft = MSAs[uLeft]; MSA *ptrRight = MSAs[uRight]; assert(ptrLeft != 0 && ptrRight != 0); MSA *ptrParent = new MSA; PWPath Path; AlignTwoMSAs(*ptrLeft, *ptrRight, *ptrParent, Path); MSAs[uNodeIndex] = ptrParent; Ready[uNodeIndex] = true; Ready[uLeft] = false; Ready[uRight] = false; delete MSAs[uLeft]; delete MSAs[uRight]; MSAs[uLeft] = 0; MSAs[uRight] = 0; } #if DEBUG { unsigned uReadyCount = 0; for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { if (Ready[uNodeIndex]) { assert(tree.IsRoot(uNodeIndex)); ++uReadyCount; assert(0 != MSAs[uNodeIndex]); } else assert(0 == MSAs[uNodeIndex]); } assert(1 == uReadyCount); } #endif const unsigned uRoot = tree.GetRootNodeIndex(); MSA *ptrRootAlignment = MSAs[uRoot]; msa.Copy(*ptrRootAlignment); delete ptrRootAlignment; delete[] Ready; #if TRACE Log("After refine subfamilies, root alignment=\n"); msa.LogMe(); #endif }
void MakeRootMSA(const SeqVect &v, const Tree &GuideTree, ProgNode Nodes[], MSA &a) { #if TRACE Log("MakeRootMSA Tree="); GuideTree.LogMe(); #endif const unsigned uSeqCount = v.GetSeqCount(); unsigned uColCount = uInsane; unsigned uSeqIndex = 0; const unsigned uTreeNodeCount = GuideTree.GetNodeCount(); const unsigned uRootNodeIndex = GuideTree.GetRootNodeIndex(); const PWPath &RootPath = Nodes[uRootNodeIndex].m_Path; const unsigned uRootColCount = RootPath.GetEdgeCount(); const unsigned uEstringSize = uRootColCount + 1; short *Estring1 = new short[uEstringSize]; short *Estring2 = new short[uEstringSize]; SetProgressDesc("Root alignment"); unsigned uTreeNodeIndex = GetFirstNodeIndex(GuideTree); do { Progress(uSeqIndex, uSeqCount); unsigned uId = GuideTree.GetLeafId(uTreeNodeIndex); const Seq &s = *(v[uId]); Seq sRootE; short *es = MakeRootSeqE(s, GuideTree, uTreeNodeIndex, Nodes, sRootE, Estring1, Estring2); Nodes[uTreeNodeIndex].m_EstringL = EstringNewCopy(es); #if VALIDATE Seq sRoot; MakeRootSeq(s, GuideTree, uTreeNodeIndex, Nodes, sRoot); if (!sRoot.Eq(sRootE)) { Log("sRoot="); sRoot.LogMe(); Log("sRootE="); sRootE.LogMe(); Quit("Root seqs differ"); } #if TRACE Log("MakeRootSeq=\n"); sRoot.LogMe(); #endif #endif if (uInsane == uColCount) { uColCount = sRootE.Length(); a.SetSize(uSeqCount, uColCount); } else { assert(uColCount == sRootE.Length()); } a.SetSeqName(uSeqIndex, s.GetName()); a.SetSeqId(uSeqIndex, uId); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) a.SetChar(uSeqIndex, uColIndex, sRootE[uColIndex]); ++uSeqIndex; uTreeNodeIndex = GetNextNodeIndex(GuideTree, uTreeNodeIndex); } while (NULL_NEIGHBOR != uTreeNodeIndex); delete[] Estring1; delete[] Estring2; ProgressStepsDone(); assert(uSeqIndex == uSeqCount); }
void Refine() { SetOutputFileName(g_pstrOutFileName.get()); SetInputFileName(g_pstrInFileName.get()); SetStartTime(); SetMaxIters(g_uMaxIters.get()); SetSeqWeightMethod(g_SeqWeight1.get()); TextFile fileIn(g_pstrInFileName.get()); MSA msa; msa.FromFile(fileIn); const unsigned uSeqCount = msa.GetSeqCount(); if (0 == uSeqCount) Quit("No sequences in input file"); ALPHA Alpha = ALPHA_Undefined; switch (g_SeqType.get()) { case SEQTYPE_Auto: Alpha = msa.GuessAlpha(); break; case SEQTYPE_Protein: Alpha = ALPHA_Amino; break; case SEQTYPE_DNA: Alpha = ALPHA_DNA; break; case SEQTYPE_RNA: Alpha = ALPHA_RNA; break; default: Quit("Invalid SeqType"); } SetAlpha(Alpha); msa.FixAlpha(); SetPPScore(); if (ALPHA_DNA == Alpha || ALPHA_RNA == Alpha) SetPPScore(PPSCORE_SPN); MSA::SetIdCount(uSeqCount); // Initialize sequence ids. // From this point on, ids must somehow propogate from here. for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) msa.SetSeqId(uSeqIndex, uSeqIndex); SetMuscleInputMSA(msa); Tree GuideTree; TreeFromMSA(msa, GuideTree, g_Cluster2.get(), g_Distance2.get(), g_Root2.get()); SetMuscleTree(GuideTree); if (g_bAnchors.get()) RefineVert(msa, GuideTree, g_uMaxIters.get()); else RefineHoriz(msa, GuideTree, g_uMaxIters.get(), false, false); ValidateMuscleIds(msa); ValidateMuscleIds(GuideTree); // TextFile fileOut(g_pstrOutFileName.get(), true); // msa.ToFile(fileOut); MuscleOutput(msa); }
// The usual sum-of-pairs objective score: sum the score // of the alignment of each pair of sequences. SCORE ObjScoreSP(const MSA &msa, SCORE MatchScore[]) { #if TRACE Log("==================ObjScoreSP==============\n"); Log("msa=\n"); msa.LogMe(); #endif g_SPScoreLetters = 0; g_SPScoreGaps = 0; if (0 != MatchScore) { const unsigned uColCount = msa.GetColCount(); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) MatchScore[uColIndex] = 0; } const unsigned uSeqCount = msa.GetSeqCount(); SCORE scoreTotal = 0; unsigned uPairCount = 0; #if TRACE Log("Seq1 Seq2 wt1 wt2 Letters Gaps Unwt.Score Wt.Score Total\n"); Log("---- ---- ------ ------ ---------- ---------- ---------- ---------- ----------\n"); #endif for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1) { const WEIGHT w1 = msa.GetSeqWeight(uSeqIndex1); for (unsigned uSeqIndex2 = uSeqIndex1 + 1; uSeqIndex2 < uSeqCount; ++uSeqIndex2) { const WEIGHT w2 = msa.GetSeqWeight(uSeqIndex2); const WEIGHT w = w1*w2; SCORE scoreLetters = ScoreSeqPairLetters(msa, uSeqIndex1, msa, uSeqIndex2); SCORE scoreGaps = ScoreSeqPairGaps(msa, uSeqIndex1, msa, uSeqIndex2); SCORE scorePair = scoreLetters + scoreGaps; ++uPairCount; scoreTotal += w*scorePair; g_SPScoreLetters += w*scoreLetters; g_SPScoreGaps += w*scoreGaps; #if TRACE Log("%4d %4d %6.3f %6.3f %10.2f %10.2f %10.2f %10.2f %10.2f >%s >%s\n", uSeqIndex1, uSeqIndex2, w1, w2, scoreLetters, scoreGaps, scorePair, scorePair*w1*w2, scoreTotal, msa.GetSeqName(uSeqIndex1), msa.GetSeqName(uSeqIndex2)); #endif } } #if TEST_SPFAST { SCORE f = ObjScoreSPFast(msa); Log("Fast = %.6g\n", f); Log("Brute = %.6g\n", scoreTotal); if (BTEq(f, scoreTotal)) Log("Agree\n"); else Log("** DISAGREE **\n"); } #endif // return scoreTotal / uPairCount; return scoreTotal; }
SCORE ScoreSeqPairGaps(const MSA &msa1, unsigned uSeqIndex1, const MSA &msa2, unsigned uSeqIndex2) { const unsigned uColCount = msa1.GetColCount(); const unsigned uColCount2 = msa2.GetColCount(); if (uColCount != uColCount2) Quit("ScoreSeqPairGaps, different lengths"); #if TRACE_SEQPAIR { Log("\n"); Log("ScoreSeqPairGaps\n"); MSA msaTmp; msaTmp.SetSize(2, uColCount); msaTmp.CopySeq(0, msa1, uSeqIndex1); msaTmp.CopySeq(1, msa2, uSeqIndex2); msaTmp.LogMe(); } #endif SCORE scoreGaps = 0; bool bGapping1 = false; bool bGapping2 = false; unsigned uColStart = 0; bool bLeftTermGap = false; for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex); bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex); if (!bGap1 || !bGap2) { if (bGap1 || bGap2) bLeftTermGap = true; uColStart = uColIndex; break; } } unsigned uColEnd = uColCount - 1; bool bRightTermGap = false; for (int iColIndex = (int) uColCount - 1; iColIndex >= 0; --iColIndex) { bool bGap1 = msa1.IsGap(uSeqIndex1, iColIndex); bool bGap2 = msa2.IsGap(uSeqIndex2, iColIndex); if (!bGap1 || !bGap2) { if (bGap1 || bGap2) bRightTermGap = true; uColEnd = (unsigned) iColIndex; break; } } #if TRACE_SEQPAIR Log("LeftTermGap=%d RightTermGap=%d\n", bLeftTermGap, bRightTermGap); #endif for (unsigned uColIndex = uColStart; uColIndex <= uColEnd; ++uColIndex) { bool bGap1 = msa1.IsGap(uSeqIndex1, uColIndex); bool bGap2 = msa2.IsGap(uSeqIndex2, uColIndex); if (bGap1 && bGap2) continue; if (bGap1) { if (!bGapping1) { #if TRACE_SEQPAIR Log("Gap open seq 1 col %d\n", uColIndex); #endif if (uColIndex == uColStart) scoreGaps += TermGapScore(true); else scoreGaps += g_scoreGapOpen; bGapping1 = true; } else scoreGaps += g_scoreGapExtend; continue; } else if (bGap2) { if (!bGapping2) { #if TRACE_SEQPAIR Log("Gap open seq 2 col %d\n", uColIndex); #endif if (uColIndex == uColStart) scoreGaps += TermGapScore(true); else scoreGaps += g_scoreGapOpen; bGapping2 = true; } else scoreGaps += 
g_scoreGapExtend; continue; } bGapping1 = false; bGapping2 = false; } if (bGapping1 || bGapping2) { scoreGaps -= g_scoreGapOpen; scoreGaps += TermGapScore(true); } return scoreGaps; }
SCORE DiffObjScore( const MSA &msa1, const PWPath &Path1, const unsigned Edges1[], unsigned uEdgeCount1, const MSA &msa2, const PWPath &Path2, const unsigned Edges2[], unsigned uEdgeCount2) { #if TRACE { Log("============DiffObjScore===========\n"); Log("msa1:\n"); msa1.LogMe(); Log("\n"); Log("Cols1: "); for (unsigned i = 0; i < uEdgeCount1; ++i) Log(" %u", Edges1[i]); Log("\n\n"); Log("msa2:\n"); msa2.LogMe(); Log("Cols2: "); for (unsigned i = 0; i < uEdgeCount2; ++i) Log(" %u", Edges2[i]); Log("\n\n"); } #endif #if COMPARE_3_52 extern SCORE g_SPScoreLetters; extern SCORE g_SPScoreGaps; SCORE SP1 = ObjScoreSP(msa1); SCORE SPLetters1 = g_SPScoreLetters; SCORE SPGaps1 = g_SPScoreGaps; SCORE SP2 = ObjScoreSP(msa2); SCORE SPLetters2 = g_SPScoreLetters; SCORE SPGaps2 = g_SPScoreGaps; SCORE SPDiffLetters = SPLetters2 - SPLetters1; SCORE SPDiffGaps = SPGaps2 - SPGaps1; SCORE SPDiff = SPDiffLetters + SPDiffGaps; #endif SCORE Letters1 = ScoreLetters(msa1, Edges1, uEdgeCount1); SCORE Letters2 = ScoreLetters(msa2, Edges2, uEdgeCount2); SCORE Gaps1 = ScoreGaps(msa1, Edges1, uEdgeCount1); SCORE Gaps2 = ScoreGaps(msa2, Edges2, uEdgeCount2); SCORE DiffLetters = Letters2 - Letters1; SCORE DiffGaps = Gaps2 - Gaps1; SCORE Diff = DiffLetters + DiffGaps; #if COMPARE_3_52 Log("ObjScoreSP Letters1=%.4g Letters2=%.4g DiffLetters=%.4g\n", SPLetters1, SPLetters2, SPDiffLetters); Log("DiffObjScore Letters1=%.4g Letters2=%.4g DiffLetters=%.4g\n", Letters1, Letters2, DiffLetters); Log("ObjScoreSP Gaps1=%.4g Gaps2=%.4g DiffGaps=%.4g\n", SPGaps1, SPGaps2, SPDiffGaps); Log("DiffObjScore Gaps1=%.4g Gaps2=%.4g DiffGaps=%.4g\n", Gaps1, Gaps2, DiffGaps); Log("SP diff=%.4g DiffObjScore Diff=%.4g\n", SPDiff, Diff); #endif return Diff; }