void AssertMSAEq(const MSA &msa1, const MSA &msa2) { const unsigned uSeqCount1 = msa1.GetSeqCount(); const unsigned uSeqCount2 = msa2.GetSeqCount(); if (uSeqCount1 != uSeqCount2) Quit("Seq count differs"); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount1; ++uSeqIndex) { Seq seq1; msa1.GetSeq(uSeqIndex, seq1); unsigned uId = msa1.GetSeqId(uSeqIndex); unsigned uSeqIndex2 = msa2.GetSeqIndex(uId); Seq seq2; msa2.GetSeq(uSeqIndex2, seq2); if (!seq1.Eq(seq2)) { Log("Input:\n"); seq1.LogMe(); Log("Output:\n"); seq2.LogMe(); Quit("Seq %s differ ", msa1.GetSeqName(uSeqIndex)); } } }
// The XP score is the sum of the score of each pair of // sequences between two profiles which are aligned to // each other. Notice that for two given profiles aligned // in different ways, the difference in XP score must be // the same as the difference in SP score because the // score of a pair of sequences in one profile doesn't // depend on the alignment. SCORE ObjScoreXP(const MSA &msa1, const MSA &msa2) { const unsigned uColCount1 = msa1.GetColCount(); const unsigned uColCount2 = msa2.GetColCount(); if (uColCount1 != uColCount2) Quit("ObjScoreXP, alignment lengths differ %u %u", uColCount1, uColCount2); const unsigned uSeqCount1 = msa1.GetSeqCount(); const unsigned uSeqCount2 = msa2.GetSeqCount(); #if TRACE Log(" Score Weight Weight Total\n"); Log("---------- ------ ------ ----------\n"); #endif SCORE scoreTotal = 0; unsigned uPairCount = 0; for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount1; ++uSeqIndex1) { const WEIGHT w1 = msa1.GetSeqWeight(uSeqIndex1); for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqCount2; ++uSeqIndex2) { const WEIGHT w2 = msa2.GetSeqWeight(uSeqIndex2); const WEIGHT w = w1*w2; SCORE scoreLetters = ScoreSeqPairLetters(msa1, uSeqIndex1, msa2, uSeqIndex2); SCORE scoreGaps = ScoreSeqPairGaps(msa1, uSeqIndex1, msa2, uSeqIndex2); SCORE scorePair = scoreLetters + scoreGaps; scoreTotal += w1*w2*scorePair; ++uPairCount; #if TRACE Log("%10.2f %6.3f %6.3f %10.2f >%s >%s\n", scorePair, w1, w2, scorePair*w1*w2, msa1.GetSeqName(uSeqIndex1), msa2.GetSeqName(uSeqIndex2)); #endif } } if (0 == uPairCount) Quit("0 == uPairCount"); #if TRACE Log("msa1=\n"); msa1.LogMe(); Log("msa2=\n"); msa2.LogMe(); Log("XP=%g\n", scoreTotal); #endif // return scoreTotal / uPairCount; return scoreTotal; }
// Append msa2 at the end of msa1 void AppendMSA(MSA &msa1, const MSA &msa2) { const unsigned uSeqCount = msa1.GetSeqCount(); const unsigned uColCount1 = msa1.GetColCount(); const unsigned uColCount2 = msa2.GetColCount(); const unsigned uColCountCat = uColCount1 + uColCount2; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { unsigned uId = msa1.GetSeqId(uSeqIndex); unsigned uSeqIndex2; bool bFound = msa2.GetSeqIndex(uId, &uSeqIndex2); if (bFound) { for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex) { const char c = msa2.GetChar(uSeqIndex2, uColIndex); msa1.SetChar(uSeqIndex, uColCount1 + uColIndex, c); } } else { for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex) msa1.SetChar(uSeqIndex, uColCount1 + uColIndex, '-'); } } }
void MSAFromColRange(const MSA &msaIn, unsigned uFromColIndex, unsigned uColCount, MSA &msaOut) { const unsigned uSeqCount = msaIn.GetSeqCount(); const unsigned uInColCount = msaIn.GetColCount(); if (uFromColIndex + uColCount - 1 > uInColCount) Quit("MSAFromColRange, out of bounds"); msaOut.SetSize(uSeqCount, uColCount); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { const char *ptrName = msaIn.GetSeqName(uSeqIndex); unsigned uId = msaIn.GetSeqId(uSeqIndex); msaOut.SetSeqName(uSeqIndex, ptrName); msaOut.SetSeqId(uSeqIndex, uId); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { const char c = msaIn.GetChar(uSeqIndex, uFromColIndex + uColIndex); msaOut.SetChar(uSeqIndex, uColIndex, c); } } }
void DoMakeTree() { if (g_pstrInFileName.get() == 0 || g_pstrOutFileName.get() == 0) Quit("-maketree requires -in <msa> and -out <treefile>"); SetStartTime(); SetSeqWeightMethod(g_SeqWeight1.get()); TextFile MSAFile(g_pstrInFileName.get()); MSA msa; msa.FromFile(MSAFile); unsigned uSeqCount = msa.GetSeqCount(); MSA::SetIdCount(uSeqCount); // Initialize sequence ids. // From this point on, ids must somehow propogate from here. for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) msa.SetSeqId(uSeqIndex, uSeqIndex); SetMuscleInputMSA(msa); Progress("%u sequences", uSeqCount); Tree tree; TreeFromMSA(msa, tree, g_Cluster2.get(), g_Distance2.get(), g_Root2.get()); TextFile TreeFile(g_pstrOutFileName.get(), true); tree.ToFile(TreeFile); Progress("Tree created"); }
// For every sequence whose id is flagged in the global array M, overwrite
// its first non-gap character with 'M'. Afterwards the M array is freed and
// the pointer reset. Does nothing (and frees nothing) unless the current
// alphabet is amino and M is set.
void MHackEnd(MSA &msa)
{
    if (ALPHA_Amino != g_Alpha || 0 == M)
        return;

    const unsigned uRows = msa.GetSeqCount();
    const unsigned uCols = msa.GetColCount();

    for (unsigned uRow = 0; uRow < uRows; ++uRow) {
        const unsigned uId = msa.GetSeqId(uRow);
        if (!M[uId])
            continue;

        // Skip leading gaps to find the first residue, if there is one.
        unsigned uCol = 0;
        while (uCol < uCols && msa.IsGap(uRow, uCol))
            ++uCol;
        if (uCol < uCols)
            msa.SetChar(uRow, uCol, 'M');
    }

    delete[] M;
    M = 0;
}
void DoSP() { MuscleContext *ctx = getMuscleContext(); TextFile f(ctx->params.g_pstrSPFileName); MSA a; a.FromFile(f); ALPHA Alpha = ALPHA_Undefined; switch (ctx->params.g_SeqType) { case SEQTYPE_Auto: Alpha = a.GuessAlpha(); break; case SEQTYPE_Protein: Alpha = ALPHA_Amino; break; case SEQTYPE_DNA: Alpha = ALPHA_DNA; break; case SEQTYPE_RNA: Alpha = ALPHA_RNA; break; default: Quit("Invalid SeqType"); } SetAlpha(Alpha); a.FixAlpha(); SetPPScore(); const unsigned uSeqCount = a.GetSeqCount(); if (0 == uSeqCount) Quit("No sequences in input file %s", ctx->params.g_pstrSPFileName); MSA::SetIdCount(uSeqCount); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) a.SetSeqId(uSeqIndex, uSeqIndex); SetSeqWeightMethod(ctx->params.g_SeqWeight1); Tree tree; TreeFromMSA(a, tree, ctx->params.g_Cluster2, ctx->params.g_Distance2, ctx->params.g_Root2); SetMuscleTree(tree); SetMSAWeightsMuscle((MSA &) a); SCORE SP = ObjScoreSP(a); Log("File=%s;SP=%.4g\n", ctx->params.g_pstrSPFileName, SP); fprintf(stderr, "File=%s;SP=%.4g\n", ctx->params.g_pstrSPFileName, SP); }
// Get an alignment ready for scoring: number the sequences by row, build a
// tree from the alignment, install it as the current tree and set the
// sequence weights.
void PrepareMSAforScoring( MSA& msa )
{
    const unsigned uCount = msa.GetSeqCount();
    unsigned uRow = 0;
    while (uRow < uCount) {
        msa.SetSeqId(uRow, uRow);
        ++uRow;
    }

    Tree tree;
    TreeFromMSA(msa, tree, g_Cluster2.get(), g_Distance2.get(), g_Root1.get());
    SetMuscleTree(tree);
    SetMSAWeightsMuscle(msa);
}
// Objective score defined as the sum of profile-sequence // scores for each sequence in the alignment. The profile // is computed from the entire alignment, so this includes // the score of each sequence against itself. This is to // avoid recomputing the profile each time, so we reduce // complexity but introduce a questionable approximation. // The goal is to see if we can exploit the apparent // improvement in performance of log-expectation score // over the usual sum-of-pairs by optimizing this // objective score in the iterative refinement stage. SCORE ObjScorePS(const MSA &msa, SCORE MatchScore[]) { if (g_PPScore != PPSCORE_LE) Quit("FastScoreMSA_LASimple: LA"); const unsigned uSeqCount = msa.GetSeqCount(); const unsigned uColCount = msa.GetColCount(); const ProfPos *Prof = ProfileFromMSA(msa); if (0 != MatchScore) for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) MatchScore[uColIndex] = 0; SCORE scoreTotal = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { const WEIGHT weightSeq = msa.GetSeqWeight(uSeqIndex); SCORE scoreSeq = 0; for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { const ProfPos &PP = Prof[uColIndex]; if (msa.IsGap(uSeqIndex, uColIndex)) { bool bOpen = (0 == uColIndex || !msa.IsGap(uSeqIndex, uColIndex - 1)); bool bClose = (uColCount - 1 == uColIndex || !msa.IsGap(uSeqIndex, uColIndex + 1)); if (bOpen) scoreSeq += PP.m_scoreGapOpen; if (bClose) scoreSeq += PP.m_scoreGapClose; //if (!bOpen && !bClose) // scoreSeq += PP.m_scoreGapExtend; } else if (msa.IsWildcard(uSeqIndex, uColIndex)) continue; else { unsigned uLetter = msa.GetLetter(uSeqIndex, uColIndex); const SCORE scoreMatch = PP.m_AAScores[uLetter]; if (0 != MatchScore) MatchScore[uColIndex] += weightSeq*scoreMatch; scoreSeq += scoreMatch; } } scoreTotal += weightSeq*scoreSeq; } delete[] Prof; return scoreTotal; }
static void SeqVectFromMSACols(const MSA &msa, unsigned uColFrom, unsigned uColTo, SeqVect &v) { v.Clear(); const unsigned uSeqCount = msa.GetSeqCount(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { Seq s; SeqFromMSACols(msa, uSeqIndex, uColFrom, uColTo, s); v.AppendSeq(s); } }
void RefineTreeE(MSA &msa, const SeqVect &v, Tree &tree, ProgNode *ProgNodes) { MuscleContext *ctx = getMuscleContext(); const unsigned uSeqCount = msa.GetSeqCount(); if (tree.GetLeafCount() != uSeqCount) Quit("Refine tree, tree has different number of nodes"); if (uSeqCount < 3) return; #if DEBUG ValidateMuscleIds(msa); ValidateMuscleIds(tree); #endif const unsigned uNodeCount = tree.GetNodeCount(); unsigned *uNewNodeIndexToOldNodeIndex= new unsigned[uNodeCount]; Tree Tree2; TreeFromMSA(msa, Tree2, ctx->params.g_Cluster2, ctx->params.g_Distance2, ctx->params.g_Root2, ctx->params.g_pstrDistMxFileName2); #if DEBUG ValidateMuscleIds(Tree2); #endif DiffTreesE(Tree2, tree, uNewNodeIndexToOldNodeIndex); unsigned uRoot = Tree2.GetRootNodeIndex(); if (NODE_CHANGED == uNewNodeIndexToOldNodeIndex[uRoot]) { MSA msa2; RealignDiffsE(msa, v, Tree2, tree, uNewNodeIndexToOldNodeIndex, msa2, ProgNodes); if (!ctx->isCanceled()) { tree.Copy(Tree2); msa.Copy(msa2); #if DEBUG ValidateMuscleIds(msa2); #endif } } delete[] uNewNodeIndexToOldNodeIndex; if (ctx->isCanceled()) { throw MuscleException("Canceled"); } SetCurrentAlignment(msa); ProgressStepsDone(); }
// Assign each sequence the weight stored for its id in g_MuscleWeights, then
// normalize the weights with a target of 1.0. Quits if the weight table is
// missing or an id is not below g_uMuscleIdCount.
void SetClustalWWeightsMuscle(MSA &msa)
{
    if (0 == g_MuscleWeights)
        Quit("g_MuscleWeights = 0");

    const unsigned uRows = msa.GetSeqCount();
    unsigned uRow = 0;
    while (uRow < uRows) {
        const unsigned uId = msa.GetSeqId(uRow);
        if (uId >= g_uMuscleIdCount)
            Quit("SetClustalWWeightsMuscle: id out of range");
        msa.SetSeqWeight(uRow, g_MuscleWeights[uId]);
        ++uRow;
    }

    msa.NormalizeWeights((WEIGHT) 1.0);
}
// Remove every all-gap column in place: the remaining columns are shifted
// left to close the holes, then the leftover tail columns are deleted.
void StripGapColumns( MSA& msa )
{
    unsigned uKept = 0;     // number of columns kept so far == next write slot
    for (unsigned uCol = 0; uCol < msa.GetColCount(); uCol++) {
        if (msa.IsGapColumn(uCol))
            continue;

        for (unsigned uRow = 0; uRow < msa.GetSeqCount(); uRow++) {
            const char c = msa.GetChar(uRow, uCol);
            msa.SetChar(uRow, uKept, c);
        }
        uKept++;
    }

    const unsigned uTail = msa.GetColCount() - uKept;
    msa.DeleteColumns(uKept, uTail);
}
// Append msa2 at the end of msa1 void MSAAppend(MSA &msa1, const MSA &msa2) { const unsigned uSeqCount = msa1.GetSeqCount(); const unsigned uColCount1 = msa1.GetColCount(); const unsigned uColCount2 = msa2.GetColCount(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { unsigned uId = msa1.GetSeqId(uSeqIndex); unsigned uSeqIndex2 = msa2.GetSeqIndex(uId); for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex) { const char c = msa2.GetChar(uSeqIndex2, uColIndex); msa1.SetChar(uSeqIndex, uColCount1 + uColIndex, c); } } }
void Stabilize(const MSA &msa, MSA &msaStable) { const unsigned uSeqCount = msa.GetSeqCount(); const unsigned uColCount = msa.GetColCount(); msaStable.SetSize(uSeqCount, uColCount); for (unsigned uId = 0; uId < uSeqCount; ++uId) { const unsigned uSeqIndex = msa.GetSeqIndex(uId); msaStable.SetSeqName(uId, msa.GetSeqName(uSeqIndex)); msaStable.SetSeqId(uSeqIndex, uId); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { const char c = msa.GetChar(uSeqIndex, uColIndex); msaStable.SetChar(uId, uColIndex, c); } } }
// Copy a MUSCLE alignment into an (initially empty) MultipleSequenceAlignment.
// Each row is added under its MUSCLE name, and ctx->output_uIds is rebuilt so
// that entry i holds tmp_uIds[id of row i].
void convertMSA2MAlignment(MSA& msa, const DNAAlphabet* al, MultipleSequenceAlignment& res)
{
    assert(res->isEmpty());

    MuscleContext *ctx = getMuscleContext();
    res->setAlphabet(al);
    ctx->output_uIds.clear();

    const int nRows = msa.GetSeqCount();
    for (int row = 0; row < nRows; row++) {
        QString name = msa.GetSeqName(row);

        // Collect the row's characters, gaps included.
        QByteArray seq;
        seq.reserve(msa.GetColCount());
        const int nCols = msa.GetColCount();
        for (int col = 0; col < nCols; col++)
            seq.append(msa.GetChar(row, col));

        ctx->output_uIds.append(ctx->tmp_uIds[msa.GetSeqId(row)]);
        res->addRow(name, seq);
    }
}
// The usual sum-of-pairs objective score: sum the score // of the alignment of each pair of sequences. SCORE ObjScoreDA(const MSA &msa, SCORE *ptrLetters, SCORE *ptrGaps) { const unsigned uSeqCount = msa.GetSeqCount(); SCORE scoreTotal = 0; unsigned uPairCount = 0; #if TRACE msa.LogMe(); Log(" Score Weight Weight Total\n"); Log("---------- ------ ------ ----------\n"); #endif SCORE TotalLetters = 0; SCORE TotalGaps = 0; for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1) { const WEIGHT w1 = msa.GetSeqWeight(uSeqIndex1); for (unsigned uSeqIndex2 = uSeqIndex1 + 1; uSeqIndex2 < uSeqCount; ++uSeqIndex2) { const WEIGHT w2 = msa.GetSeqWeight(uSeqIndex2); const WEIGHT w = w1*w2; SCORE Letters; SCORE Gaps; SCORE scorePair = ScoreSeqPair(msa, uSeqIndex1, msa, uSeqIndex2, &Letters, &Gaps); scoreTotal += w1*w2*scorePair; TotalLetters += w1*w2*Letters; TotalGaps += w1*w2*Gaps; ++uPairCount; #if TRACE Log("%10.2f %6.3f %6.3f %10.2f %d=%s %d=%s\n", scorePair, w1, w2, scorePair*w1*w2, uSeqIndex1, msa.GetSeqName(uSeqIndex1), uSeqIndex2, msa.GetSeqName(uSeqIndex2)); #endif } } *ptrLetters = TotalLetters; *ptrGaps = TotalGaps; return scoreTotal; }
static double GetColScore(const MSA &msa, unsigned uCol) { MuscleContext *d = getMuscleContext(); unsigned &g_AlphaSize = d->alpha.g_AlphaSize; ALPHA &g_Alpha = d->alpha.g_Alpha; const unsigned uSeqCount = msa.GetSeqCount(); unsigned uPairCount = 0; double dSum = 0.0; for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) { if (msa.IsGap(uSeq1, uCol)) continue; unsigned uLetter1 = msa.GetLetterEx(uSeq1, uCol); if (uLetter1 >= g_AlphaSize) continue; for (unsigned uSeq2 = uSeq1 + 1; uSeq2 < uSeqCount; ++uSeq2) { if (msa.IsGap(uSeq2, uCol)) continue; unsigned uLetter2 = msa.GetLetterEx(uSeq2, uCol); if (uLetter2 >= g_AlphaSize) continue; double Score; switch (g_Alpha) { case ALPHA_Amino: Score = VTML_SP[uLetter1][uLetter2]; break; case ALPHA_DNA: case ALPHA_RNA: Score = NUC_SP[uLetter1][uLetter2]; break; default: Quit("GetColScore: invalid alpha=%d", g_Alpha); } dSum += Score; ++uPairCount; } } if (0 == uPairCount) return 0; return dSum / uPairCount; }
void SeqVectFromMSA(const MSA &msa, SeqVect &v) { v.Clear(); const unsigned uSeqCount = msa.GetSeqCount(); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { Seq s; msa.GetSeq(uSeqIndex, s); s.StripGaps(); //if (0 == s.Length()) // continue; const char *ptrName = msa.GetSeqName(uSeqIndex); s.SetName(ptrName); v.AppendSeq(s); } }
// Set sequence weights from the three-way weighting of the current tree at
// the current split edge, then normalize them with a target of 1.0. When
// either split node is NULL_NEIGHBOR the Henikoff (PB) weights are used
// instead. Quits if a sequence id is not below the tree's leaf count.
void SetThreeWayWeightsMuscle(MSA &msa)
{
    MuscleContext *ctx = getMuscleContext();
    const Tree* &ptrTree = ctx->msa2.g_ptrMuscleTree;
    unsigned &uSplit1 = ctx->msa2.g_uTreeSplitNode1;
    unsigned &uSplit2 = ctx->msa2.g_uTreeSplitNode2;

    const bool bNoSplit =
      (NULL_NEIGHBOR == uSplit1 || NULL_NEIGHBOR == uSplit2);
    if (bNoSplit) {
        msa.SetHenikoffWeightsPB();
        return;
    }

    // One weight per leaf of the tree, indexed by sequence id.
    const unsigned uLeafCount = ptrTree->GetLeafCount();
    WEIGHT *Weights = new WEIGHT[uLeafCount];
    CalcThreeWayWeights(*ptrTree, uSplit1, uSplit2, Weights);

    const unsigned uRows = msa.GetSeqCount();
    for (unsigned uRow = 0; uRow < uRows; ++uRow) {
        const unsigned uId = msa.GetSeqId(uRow);
        if (uId >= uLeafCount)
            Quit("SetThreeWayWeightsMuscle: id out of range");
        msa.SetSeqWeight(uRow, Weights[uId]);
    }

#if LOCAL_VERBOSE
    {
        Log("SetThreeWayWeightsMuscle\n");
        for (unsigned n = 0; n < uRows; ++n) {
            const unsigned uId = msa.GetSeqId(n);
            Log("%20.20s %6.3f\n", msa.GetSeqName(n), Weights[uId]);
        }
    }
#endif

    msa.NormalizeWeights((WEIGHT) 1.0);

    delete[] Weights;
}
// Write a per-column score report to the configured score file: one text
// line per alignment column holding the column score followed by the
// column's characters, top row first. Quits if the file cannot be opened.
void WriteScoreFile(const MSA &msa)
{
    MuscleContext *ctx = getMuscleContext();

    FILE *fOut = fopen(ctx->params.g_pstrScoreFileName, "w");
    if (0 == fOut)
        Quit("Cannot open score file '%s' errno=%d", ctx->params.g_pstrScoreFileName, errno);

    const unsigned uCols = msa.GetColCount();
    const unsigned uRows = msa.GetSeqCount();

    for (unsigned uCol = 0; uCol < uCols; ++uCol) {
        const double dScore = GetColScore(msa, uCol);
        fprintf(fOut, "%10.3f ", dScore);

        unsigned uRow = 0;
        while (uRow < uRows) {
            const char c = msa.GetChar(uRow, uCol);
            fprintf(fOut, "%c", c);
            ++uRow;
        }
        fprintf(fOut, "\n");
    }

    fclose(fOut);
}
// "Catenate" two MSAs (by bad analogy with UNIX cat command). // msa1 and msa2 must have same sequence names, but possibly // in a different order. // msaCat is the combined alignment produce by appending // sequences in msa2 to sequences in msa1. void MSACat(const MSA &msa1, const MSA &msa2, MSA &msaCat) { const unsigned uSeqCount = msa1.GetSeqCount(); const unsigned uColCount1 = msa1.GetColCount(); const unsigned uColCount2 = msa2.GetColCount(); const unsigned uColCountCat = uColCount1 + uColCount2; msaCat.SetSize(uSeqCount, uColCountCat); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { for (unsigned uColIndex = 0; uColIndex < uColCount1; ++uColIndex) { const char c = msa1.GetChar(uSeqIndex, uColIndex); msaCat.SetChar(uSeqIndex, uColIndex, c); } const char *ptrSeqName = msa1.GetSeqName(uSeqIndex); unsigned uSeqIndex2; msaCat.SetSeqName(uSeqIndex, ptrSeqName); bool bFound = msa2.GetSeqIndex(ptrSeqName, &uSeqIndex2); if (bFound) { for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex) { const char c = msa2.GetChar(uSeqIndex2, uColIndex); msaCat.SetChar(uSeqIndex, uColCount1 + uColIndex, c); } } else { for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex) msaCat.SetChar(uSeqIndex, uColCount1 + uColIndex, '-'); } } }
void RefineWorker::_run() { unsigned i = 0; #if TRACE algoLog.trace(QString("Worker %1 start. Wait...").arg(QString::number(workerID))); #endif workpool->mainSem.acquire(); #if TRACE algoLog.trace(QString("Worker %1: Stop wait. Start (mainSem %2, childSem %3)").arg(QString::number(workerID)). arg(QString::number(workpool->mainSem.available())).arg(QString::number(workpool->mainSem.available()))); #endif while(!workpool->isRefineDone()) { MSA msaIn; i = workpool->refineGetJob(&msaIn, workerID); MuscleContext *ctx = workpool->ctx; // unsigned &g_uTreeSplitNode1 = ctx->muscle.g_uTreeSplitNode1; // unsigned &g_uTreeSplitNode2 = ctx->muscle.g_uTreeSplitNode2; // unsigned &g_uRefineHeightSubtree = ctx->refinehoriz.g_uRefineHeightSubtree; // unsigned &g_uRefineHeightSubtreeTotal = ctx->refinehoriz.g_uRefineHeightSubtreeTotal; Tree &tree = workpool->GuideTree; const unsigned uSeqCount = msaIn.GetSeqCount(); // const unsigned uInternalNodeCount = uSeqCount - 1; unsigned *Leaves1 = new unsigned[uSeqCount]; unsigned *Leaves2 = new unsigned[uSeqCount]; const unsigned uRootNodeIndex = tree.GetRootNodeIndex(); while (i != NULL_NEIGHBOR) { const unsigned uInternalNodeIndex = workpool->InternalNodeIndexes[i]; unsigned uNeighborNodeIndex; if (tree.IsRoot(uInternalNodeIndex) && !workpool->bRight) { i = workpool->refineGetNextJob(&msaIn, false, -1, i, workerID); continue; } else if (workpool->bRight) uNeighborNodeIndex = tree.GetRight(uInternalNodeIndex); else uNeighborNodeIndex = tree.GetLeft(uInternalNodeIndex); // g_uTreeSplitNode1 = uInternalNodeIndex; // g_uTreeSplitNode2 = uNeighborNodeIndex; unsigned uCount1; unsigned uCount2; GetLeaves(tree, uNeighborNodeIndex, Leaves1, &uCount1); GetLeavesExcluding(tree, uRootNodeIndex, uNeighborNodeIndex, Leaves2, &uCount2); SCORE scoreBefore; SCORE scoreAfter; bool bAccepted = TryRealign(msaIn, tree, Leaves1, uCount1, Leaves2, uCount2, &scoreBefore, &scoreAfter, workpool->bLockLeft, workpool->bLockRight); SCORE scoreMax = scoreAfter > 
scoreBefore? scoreAfter : scoreBefore; //bool bRepeated = workpool->History->SetScore(workpool->uIter, uInternalNodeIndex, workpool->bRight, scoreMax); i = workpool->refineGetNextJob(&msaIn, bAccepted, scoreMax, i, workerID); } delete[] Leaves1; delete[] Leaves2; #if TRACE algoLog.trace(QString("Worker %1: no job available. Wait... (mainSem %2, childSem %3)").arg(QString::number(workerID)). arg(QString::number(workpool->mainSem.available())).arg(QString::number(workpool->mainSem.available()))); #endif workpool->childSem.release(); workpool->mainSem.acquire(); #if TRACE algoLog.trace(QString("Worker %1: Stop wait. Start (mainSem %2, childSem %3)").arg(QString::number(workerID)). arg(QString::number(workpool->mainSem.available())).arg(QString::number(workpool->mainSem.available()))); #endif } #if TRACE algoLog.trace(QString("Worker %1: Refine done. Exit").arg(QString::number(workerID))); #endif }
void Refine() { SetOutputFileName(g_pstrOutFileName.get()); SetInputFileName(g_pstrInFileName.get()); SetStartTime(); SetMaxIters(g_uMaxIters.get()); SetSeqWeightMethod(g_SeqWeight1.get()); TextFile fileIn(g_pstrInFileName.get()); MSA msa; msa.FromFile(fileIn); const unsigned uSeqCount = msa.GetSeqCount(); if (0 == uSeqCount) Quit("No sequences in input file"); ALPHA Alpha = ALPHA_Undefined; switch (g_SeqType.get()) { case SEQTYPE_Auto: Alpha = msa.GuessAlpha(); break; case SEQTYPE_Protein: Alpha = ALPHA_Amino; break; case SEQTYPE_DNA: Alpha = ALPHA_DNA; break; case SEQTYPE_RNA: Alpha = ALPHA_RNA; break; default: Quit("Invalid SeqType"); } SetAlpha(Alpha); msa.FixAlpha(); SetPPScore(); if (ALPHA_DNA == Alpha || ALPHA_RNA == Alpha) SetPPScore(PPSCORE_SPN); MSA::SetIdCount(uSeqCount); // Initialize sequence ids. // From this point on, ids must somehow propogate from here. for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) msa.SetSeqId(uSeqIndex, uSeqIndex); SetMuscleInputMSA(msa); Tree GuideTree; TreeFromMSA(msa, GuideTree, g_Cluster2.get(), g_Distance2.get(), g_Root2.get()); SetMuscleTree(GuideTree); if (g_bAnchors.get()) RefineVert(msa, GuideTree, g_uMaxIters.get()); else RefineHoriz(msa, GuideTree, g_uMaxIters.get(), false, false); ValidateMuscleIds(msa); ValidateMuscleIds(GuideTree); // TextFile fileOut(g_pstrOutFileName.get(), true); // msa.ToFile(fileOut); MuscleOutput(msa); }
void RefineW(const MSA &msaIn, MSA &msaOut) { const unsigned uSeqCount = msaIn.GetSeqCount(); const unsigned uColCount = msaIn.GetColCount(); // Reserve same nr seqs, 20% more cols const unsigned uReserveColCount = (uColCount*120)/100; msaOut.SetSize(uSeqCount, uReserveColCount); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { msaOut.SetSeqName(uSeqIndex, msaIn.GetSeqName(uSeqIndex)); msaOut.SetSeqId(uSeqIndex, msaIn.GetSeqId(uSeqIndex)); } const unsigned uWindowCount = (uColCount + g_uRefineWindow.get() - 1)/g_uRefineWindow.get(); if (0 == g_uWindowTo.get()) g_uWindowTo.get() = uWindowCount - 1; #if MEMDEBUG _CrtSetBreakAlloc(1560); #endif if (g_uWindowOffset.get() > 0) { MSA msaTmp; MSAFromColRange(msaIn, 0, g_uWindowOffset.get(), msaOut); } if (!g_bQuiet.get()) fprintf(stderr, "\n"); for (unsigned uWindowIndex = g_uWindowFrom.get(); uWindowIndex <= g_uWindowTo.get(); ++uWindowIndex) { if (!g_bQuiet.get()) fprintf(stderr, "Window %d of %d \r", uWindowIndex, uWindowCount); const unsigned uColFrom = g_uWindowOffset.get() + uWindowIndex*g_uRefineWindow.get(); unsigned uColTo = uColFrom + g_uRefineWindow.get() - 1; if (uColTo >= uColCount) uColTo = uColCount - 1; assert(uColTo >= uColFrom); SeqVect v; SeqVectFromMSACols(msaIn, uColFrom, uColTo, v); #if MEMDEBUG _CrtMemState s1; _CrtMemCheckpoint(&s1); #endif // Begin AED 5/20/06 // remove any empty seqs in this window std::vector< size_t > empty_seqs; SeqVect vr; for( size_t seqI = 0; seqI < v.size(); ++seqI ) { if( v[seqI]->size() == 0 ) empty_seqs.push_back(seqI); else vr.push_back(v[seqI]); } std::vector< unsigned > seqid_map( vr.size() ); for( size_t seqI = 0; seqI < vr.size(); ++seqI ) { seqid_map[seqI] = vr[seqI]->GetId(); vr[seqI]->SetId(seqI); } MSA msaTmp; if( vr.size() > 1 ) MUSCLE(vr, msaTmp); // remap the seqids to their original state for( size_t seqI = 0; seqI < vr.size(); ++seqI ) vr[seqI]->SetId(seqid_map[seqI]); // merge empty seqs back in { const unsigned uSeqCount = 
msaOut.GetSeqCount(); const unsigned uColCount1 = msaOut.GetColCount(); const unsigned uColCount2 = vr.size() > 1 ? msaTmp.GetColCount() : vr[0]->size(); const unsigned uColCountCat = uColCount1 + uColCount2; for( unsigned seqI = 0; seqI < vr.size(); ++seqI ) { unsigned uSeqIndex = msaOut.GetSeqIndex(seqid_map[seqI]); if( vr.size() > 1 ) { unsigned uSeqIndex2 = msaTmp.GetSeqIndex(seqI); for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex) { const char c = msaTmp.GetChar(uSeqIndex2, uColIndex); msaOut.SetChar(uSeqIndex, uColCount1 + uColIndex, c); } }else{ for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex) { const char c = vr[0]->GetChar(uColIndex); msaOut.SetChar(uSeqIndex, uColCount1 + uColIndex, c); } } } for( unsigned seqI = 0; seqI < empty_seqs.size(); ++seqI ) { unsigned uSeqId2 = v[empty_seqs[seqI]]->GetId(); unsigned uSeqIndex = msaOut.GetSeqIndex(uSeqId2); for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex) { msaOut.SetChar(uSeqIndex, uColCount1 + uColIndex, '-'); } } vr.clear(); } // AppendMSA(msaOut, msaTmp); // end AED 5/20/06 if (uWindowIndex == g_uSaveWindow.get()) { MSA msaInTmp; unsigned uOutCols = msaOut.GetColCount(); unsigned un = uColTo - uColFrom + 1; MSAFromColRange(msaIn, uColFrom, un, msaInTmp); char fn[256]; sprintf(fn, "win%d_inaln.tmp", uWindowIndex); TextFile fIn(fn, true); msaInTmp.ToFile(fIn); sprintf(fn, "win%d_inseqs.tmp", uWindowIndex); TextFile fv(fn, true); v.ToFile(fv); sprintf(fn, "win%d_outaln.tmp", uWindowIndex); TextFile fOut(fn, true); msaTmp.ToFile(fOut); } #if MEMDEBUG void FreeDPMemSPN(); FreeDPMemSPN(); _CrtMemState s2; _CrtMemCheckpoint(&s2); _CrtMemState s; _CrtMemDifference(&s, &s1, &s2); _CrtMemDumpStatistics(&s); _CrtMemDumpAllObjectsSince(&s1); exit(1); #endif //#if DEBUG // AssertMSAEqIgnoreCaseAndGaps(msaInTmp, msaTmp); //#endif } if (!g_bQuiet.get()) fprintf(stderr, "\n"); // AssertMSAEqIgnoreCaseAndGaps(msaIn, msaOut);//@@uncomment! }
// The usual sum-of-pairs objective score: sum the score // of the alignment of each pair of sequences. SCORE ObjScoreSP(const MSA &msa, SCORE MatchScore[]) { #if TRACE Log("==================ObjScoreSP==============\n"); Log("msa=\n"); msa.LogMe(); #endif g_SPScoreLetters = 0; g_SPScoreGaps = 0; if (0 != MatchScore) { const unsigned uColCount = msa.GetColCount(); for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) MatchScore[uColIndex] = 0; } const unsigned uSeqCount = msa.GetSeqCount(); SCORE scoreTotal = 0; unsigned uPairCount = 0; #if TRACE Log("Seq1 Seq2 wt1 wt2 Letters Gaps Unwt.Score Wt.Score Total\n"); Log("---- ---- ------ ------ ---------- ---------- ---------- ---------- ----------\n"); #endif for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1) { const WEIGHT w1 = msa.GetSeqWeight(uSeqIndex1); for (unsigned uSeqIndex2 = uSeqIndex1 + 1; uSeqIndex2 < uSeqCount; ++uSeqIndex2) { const WEIGHT w2 = msa.GetSeqWeight(uSeqIndex2); const WEIGHT w = w1*w2; SCORE scoreLetters = ScoreSeqPairLetters(msa, uSeqIndex1, msa, uSeqIndex2); SCORE scoreGaps = ScoreSeqPairGaps(msa, uSeqIndex1, msa, uSeqIndex2); SCORE scorePair = scoreLetters + scoreGaps; ++uPairCount; scoreTotal += w*scorePair; g_SPScoreLetters += w*scoreLetters; g_SPScoreGaps += w*scoreGaps; #if TRACE Log("%4d %4d %6.3f %6.3f %10.2f %10.2f %10.2f %10.2f %10.2f >%s >%s\n", uSeqIndex1, uSeqIndex2, w1, w2, scoreLetters, scoreGaps, scorePair, scorePair*w1*w2, scoreTotal, msa.GetSeqName(uSeqIndex1), msa.GetSeqName(uSeqIndex2)); #endif } } #if TEST_SPFAST { SCORE f = ObjScoreSPFast(msa); Log("Fast = %.6g\n", f); Log("Brute = %.6g\n", scoreTotal); if (BTEq(f, scoreTotal)) Log("Agree\n"); else Log("** DISAGREE **\n"); } #endif // return scoreTotal / uPairCount; return scoreTotal; }
static SCORE ScoreColLetters(const MSA &msa, unsigned uColIndex) { MuscleContext *ctx = getMuscleContext(); SCOREMATRIX &Mx = *ctx->params.g_ptrScoreMatrix; unsigned &g_AlphaSize = ctx->alpha.g_AlphaSize; const unsigned uSeqCount = msa.GetSeqCount(); #if BRUTE_LETTERS SCORE BruteScore = 0; for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1) { unsigned uLetter1 = msa.GetLetterEx(uSeqIndex1, uColIndex); if (uLetter1 >= g_AlphaSize) continue; WEIGHT w1 = msa.GetSeqWeight(uSeqIndex1); for (unsigned uSeqIndex2 = uSeqIndex1 + 1; uSeqIndex2 < uSeqCount; ++uSeqIndex2) { unsigned uLetter2 = msa.GetLetterEx(uSeqIndex2, uColIndex); if (uLetter2 >= g_AlphaSize) continue; WEIGHT w2 = msa.GetSeqWeight(uSeqIndex2); BruteScore += w1*w2*Mx[uLetter1][uLetter2]; } } #endif double N = 0; for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1) { WEIGHT w = msa.GetSeqWeight(uSeqIndex1); N += w; } if (N <= 0) return 0; FCOUNT Freqs[20]; memset(Freqs, 0, sizeof(Freqs)); SCORE Score = 0; for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1) { unsigned uLetter = msa.GetLetterEx(uSeqIndex1, uColIndex); if (uLetter >= g_AlphaSize) continue; WEIGHT w = msa.GetSeqWeight(uSeqIndex1); Freqs[uLetter] += w; Score -= w*w*Mx[uLetter][uLetter]; } for (unsigned uLetter1 = 0; uLetter1 < g_AlphaSize; ++uLetter1) { const FCOUNT f1 = Freqs[uLetter1]; Score += f1*f1*Mx[uLetter1][uLetter1]; for (unsigned uLetter2 = uLetter1 + 1; uLetter2 < g_AlphaSize; ++uLetter2) { const FCOUNT f2 = Freqs[uLetter2]; Score += 2*f1*f2*Mx[uLetter1][uLetter2]; } } Score /= 2; #if BRUTE_LETTERS assert(BTEq(BruteScore, Score)); #endif return Score; }
// Refine an alignment subfamily by subfamily.
//
// The tree is cut into subfamilies by GetSubfams (height limit 0.6, at most
// 16 subfamilies). For each subfamily the corresponding rows are extracted,
// all-gap columns removed, a tree re-estimated and the sub-alignment refined
// (vertically when anchors are enabled, horizontally otherwise).
// Subfamilies with two or fewer sequences are left alone. If any subfamily
// changed, the subfamily alignments are progressively re-aligned into msa.
//
//   msa     alignment to refine; rewritten only when something changed
//   tree    guide tree for msa
//   uIters  iteration limit passed to the refiners
// Returns true if any subfamily alignment changed. Alignments with fewer
// than three sequences return false immediately.
bool RefineSubfams(MSA &msa, const Tree &tree, unsigned uIters)
{
    MuscleContext *ctx = getMuscleContext();
    CLUSTER &g_Cluster2 = ctx->params.g_Cluster2;
    DISTANCE &g_Distance2 = ctx->params.g_Distance2;
    ROOT &g_Root2 = ctx->params.g_Root2;
    bool &g_bAnchors = ctx->params.g_bAnchors;

    const unsigned uSeqCount = msa.GetSeqCount();
    if (uSeqCount < 3)
        return false;

    const double dMaxHeight = 0.6;
    const unsigned uMaxSubfamCount = 16;

    unsigned *Subfams;
    unsigned uSubfamCount;
    GetSubfams(tree, dMaxHeight, uMaxSubfamCount, &Subfams, &uSubfamCount);
    assert(uSubfamCount <= uSeqCount);

    if (ctx->params.g_bVerbose)
        LogSubfams(tree, Subfams, uSubfamCount);

    MSA *SubfamMSAs = new MSA[uSubfamCount];
    unsigned *Leaves = new unsigned[uSeqCount];
    unsigned *Ids = new unsigned[uSeqCount];

    bool bAnyChanges = false;
    for (unsigned uSubfamIndex = 0; uSubfamIndex < uSubfamCount; ++uSubfamIndex) {
        unsigned uSubfam = Subfams[uSubfamIndex];
        unsigned uLeafCount;
        GetLeaves(tree, uSubfam, Leaves, &uLeafCount);
        assert(uLeafCount <= uSeqCount);

        LeafIndexesToIds(tree, Leaves, uLeafCount, Ids);

        MSA &msaSubfam = SubfamMSAs[uSubfamIndex];
        MSASubsetByIds(msa, Ids, uLeafCount, msaSubfam);
        DeleteGappedCols(msaSubfam);

#if TRACE
        Log("Subfam %u MSA=\n", uSubfamIndex);
        msaSubfam.LogMe();
#endif

        if (msaSubfam.GetSeqCount() <= 2)
            continue;

        // TODO /////////////////////////////////////////
        // Try using existing tree, may actually hurt to
        // re-estimate, may also be a waste of CPU & mem.
        /////////////////////////////////////////////////
        Tree SubfamTree;
        TreeFromMSA(msaSubfam, SubfamTree, g_Cluster2, g_Distance2, g_Root2);

        bool bAnyChangesThisSubfam;
        if (g_bAnchors)
            bAnyChangesThisSubfam = RefineVert(msaSubfam, SubfamTree, uIters);
        else
            bAnyChangesThisSubfam = RefineHoriz(msaSubfam, SubfamTree, uIters, false, false);

#if TRACE
        Log("Subfam %u Changed %d\n", uSubfamIndex, bAnyChangesThisSubfam);
#endif
        if (bAnyChangesThisSubfam)
            bAnyChanges = true;
    }

    if (bAnyChanges)
        ProgressiveAlignSubfams(tree, Subfams, uSubfamCount, SubfamMSAs, msa);

    // Release every array allocated above, including the id scratch array.
    delete[] Ids;
    delete[] Leaves;
    delete[] Subfams;
    delete[] SubfamMSAs;

    return bAnyChanges;
}
bool TryRealign(MSA &msaIn, const Tree &tree, const unsigned Leaves1[], unsigned uCount1, const unsigned Leaves2[], unsigned uCount2, SCORE *ptrscoreBefore, SCORE *ptrscoreAfter, bool bLockLeft, bool bLockRight) { #if TRACE Log("TryRealign, msaIn=\n"); #endif MuscleContext *ctx = getMuscleContext(); const unsigned uSeqCount = msaIn.GetSeqCount(); unsigned *Ids1 = new unsigned[uSeqCount]; unsigned *Ids2 = new unsigned[uSeqCount]; LeafIndexesToIds(tree, Leaves1, uCount1, Ids1); LeafIndexesToIds(tree, Leaves2, uCount2, Ids2); MSA msa1; MSA msa2; MSASubsetByIds(msaIn, Ids1, uCount1, msa1); MSASubsetByIds(msaIn, Ids2, uCount2, msa2); #if DEBUG ValidateMuscleIds(msa1); ValidateMuscleIds(msa2); #endif // Computing the objective score may be expensive for // large numbers of sequences. As a speed optimization, // we check whether the alignment changes. If it does // not change, there is no need to compute the objective // score. We test for the alignment changing by comparing // the Viterbi paths before and after re-aligning. PWPath pathBefore; pathBefore.FromMSAPair(msa1, msa2); DeleteGappedCols(msa1); DeleteGappedCols(msa2); if (0 == msa1.GetColCount() || 0 == msa2.GetColCount()) { delete[] Ids1; delete[] Ids2; return false; } MSA msaRealigned; PWPath pathAfter; AlignTwoMSAs(msa1, msa2, msaRealigned, pathAfter, bLockLeft, bLockRight); bool bAnyChanges = !pathAfter.Equal(pathBefore); unsigned uDiffCount1; unsigned uDiffCount2; unsigned* Edges1 = ctx->refinehoriz.Edges1; unsigned* Edges2 = ctx->refinehoriz.Edges2; DiffPaths(pathBefore, pathAfter, Edges1, &uDiffCount1, Edges2, &uDiffCount2); #if TRACE Log("TryRealign, msa1=\n"); Log("\nmsa2=\n"); Log("\nRealigned (changes %s)=\n", bAnyChanges ? 
"TRUE" : "FALSE"); #endif if (!bAnyChanges) { *ptrscoreBefore = 0; *ptrscoreAfter = 0; delete[] Ids1; delete[] Ids2; return false; } SetMSAWeightsMuscle(msaIn); SetMSAWeightsMuscle(msaRealigned); #if DIFFOBJSCORE const SCORE scoreDiff = DiffObjScore(msaIn, pathBefore, Edges1, uDiffCount1, msaRealigned, pathAfter, Edges2, uDiffCount2); bool bAccept = (scoreDiff > 0); *ptrscoreBefore = 0; *ptrscoreAfter = scoreDiff; //const SCORE scoreBefore = ObjScoreIds(msaIn, Ids1, uCount1, Ids2, uCount2); //const SCORE scoreAfter = ObjScoreIds(msaRealigned, Ids1, uCount1, Ids2, uCount2); //Log("Diff = %.3g %.3g\n", scoreDiff, scoreAfter - scoreBefore); #else const SCORE scoreBefore = ObjScoreIds(msaIn, Ids1, uCount1, Ids2, uCount2); const SCORE scoreAfter = ObjScoreIds(msaRealigned, Ids1, uCount1, Ids2, uCount2); bool bAccept = (scoreAfter > scoreBefore); #if TRACE Log("Score %g -> %g Accept %s\n", scoreBefore, scoreAfter, bAccept ? "TRUE" : "FALSE"); #endif *ptrscoreBefore = scoreBefore; *ptrscoreAfter = scoreAfter; #endif if (bAccept) msaIn.Copy(msaRealigned); delete[] Ids1; delete[] Ids2; return bAccept; }
// Objective score of msa under the scoring scheme selected by g_ObjScore.
//
// OBJSCORE_SPM is resolved first: XP for up to 100 sequences, SPF above
// that. The DP and XP schemes score the two row subsets SeqIndexes1 /
// SeqIndexes2 against each other; the SP, SPF and PS schemes score the whole
// alignment and ignore the subsets. Sequence weights are (re)set before
// scoring, which for the whole-alignment schemes modifies msa despite the
// const reference. Any other scheme quits.
SCORE ObjScore(const MSA &msa, const unsigned SeqIndexes1[],
  unsigned uSeqCount1, const unsigned SeqIndexes2[], unsigned uSeqCount2)
{
#if TIMING
    TICKS t1 = GetClockTicks();
#endif
    const unsigned uSeqCount = msa.GetSeqCount();

    OBJSCORE OS = g_ObjScore;
    if (OBJSCORE_SPM == g_ObjScore)
        OS = (uSeqCount <= 100) ? OBJSCORE_XP : OBJSCORE_SPF;

    MSA msa1;
    MSA msa2;
    SCORE Score = 0;

    switch (OS) {
    case OBJSCORE_DP:
    case OBJSCORE_XP:
        // Two-profile schemes: build and weight both subsets, then score.
        MSAFromSeqSubset(msa, SeqIndexes1, uSeqCount1, msa1);
        MSAFromSeqSubset(msa, SeqIndexes2, uSeqCount2, msa2);
        SetMSAWeightsMuscle(msa1);
        SetMSAWeightsMuscle(msa2);
        if (OBJSCORE_DP == OS)
            Score = ObjScoreDP(msa1, msa2);
        else
            Score = ObjScoreXP(msa1, msa2);
        break;

    case OBJSCORE_SP:
    case OBJSCORE_SPF:
    case OBJSCORE_PS:
        // Whole-alignment schemes. Setting weights needs a mutable MSA, so
        // constness is cast away here (a known design wart).
        SetMSAWeightsMuscle(const_cast<MSA &>(msa));
        if (OBJSCORE_SP == OS)
            Score = ObjScoreSP(msa);
        else if (OBJSCORE_PS == OS)
            Score = ObjScorePS(msa);
        else
            Score = ObjScoreSPDimer(msa);
        break;

    default:
        Quit("Invalid g_ObjScore=%d", g_ObjScore);
    }

#if TIMING
    TICKS t2 = GetClockTicks();
    g_ticksObjScore += (t2 - t1);
#endif
    return Score;
}