void DoMuscle() { SetOutputFileName(g_pstrOutFileName.get()); SetInputFileName(g_pstrInFileName.get()); SetMaxIters(g_uMaxIters.get()); SetSeqWeightMethod(g_SeqWeight1.get()); TextFile fileIn(g_pstrInFileName.get()); SeqVect v; v.FromFASTAFile(fileIn); const unsigned uSeqCount = v.Length(); if (0 == uSeqCount) Quit("No sequences in input file"); ALPHA Alpha = ALPHA_Undefined; switch (g_SeqType.get()) { case SEQTYPE_Auto: Alpha = v.GuessAlpha(); break; case SEQTYPE_Protein: Alpha = ALPHA_Amino; break; case SEQTYPE_DNA: Alpha = ALPHA_DNA; break; case SEQTYPE_RNA: Alpha = ALPHA_RNA; break; default: Quit("Invalid seq type"); } SetAlpha(Alpha); v.FixAlpha(); // // AED 21/12/06: Moved matrix loading code inside the PP param function so it gets called for all alignment types // SetPPScore(); unsigned uMaxL = 0; unsigned uTotL = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { unsigned L = v.GetSeq(uSeqIndex).Length(); uTotL += L; if (L > uMaxL) uMaxL = L; } SetIter(1); g_bDiags.get() = g_bDiags1.get(); SetSeqStats(uSeqCount, uMaxL, uTotL/uSeqCount); SetMuscleSeqVect(v); MSA::SetIdCount(uSeqCount); // Initialize sequence ids. // From this point on, ids must somehow propogate from here. for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) v.SetSeqId(uSeqIndex, uSeqIndex); if (0 == uSeqCount) Quit("Input file '%s' has no sequences", g_pstrInFileName.get()); if (1 == uSeqCount) { TextFile fileOut(g_pstrOutFileName.get(), true); v.ToFile(fileOut); return; } if (uSeqCount > 1) MHackStart(v); // First iteration Tree GuideTree; if (0 != g_pstrUseTreeFileName.get()) { // Discourage users... 
if (!g_bUseTreeNoWarn.get()) fprintf(stderr, g_strUseTreeWarning); // Read tree from file TextFile TreeFile(g_pstrUseTreeFileName.get()); GuideTree.FromFile(TreeFile); // Make sure tree is rooted if (!GuideTree.IsRooted()) Quit("User tree must be rooted"); if (GuideTree.GetLeafCount() != uSeqCount) Quit("User tree does not match input sequences"); const unsigned uNodeCount = GuideTree.GetNodeCount(); for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex) { if (!GuideTree.IsLeaf(uNodeIndex)) continue; const char *LeafName = GuideTree.GetLeafName(uNodeIndex); unsigned uSeqIndex; bool SeqFound = v.FindName(LeafName, &uSeqIndex); if (!SeqFound) Quit("Label %s in tree does not match sequences", LeafName); unsigned uId = v.GetSeqIdFromName(LeafName); GuideTree.SetLeafId(uNodeIndex, uId); } } else TreeFromSeqVect(v, GuideTree, g_Cluster1.get(), g_Distance1.get(), g_Root1.get(), g_pstrDistMxFileName1.get()); const char *Tree1 = ValueOpt("Tree1"); if (0 != Tree1) { TextFile f(Tree1, true); GuideTree.ToFile(f); if (g_bClusterOnly.get()) return; } SetMuscleTree(GuideTree); ValidateMuscleIds(GuideTree); MSA msa; ProgNode *ProgNodes = 0; if (g_bLow.get()) ProgNodes = ProgressiveAlignE(v, GuideTree, msa); else ProgressiveAlign(v, GuideTree, msa); SetCurrentAlignment(msa); if (0 != g_pstrComputeWeightsFileName.get()) { extern void OutWeights(const char *FileName, const MSA &msa); SetMSAWeightsMuscle(msa); OutWeights(g_pstrComputeWeightsFileName.get(), msa); return; } ValidateMuscleIds(msa); if (1 == g_uMaxIters.get() || 2 == uSeqCount) { //TextFile fileOut(g_pstrOutFileName.get(), true); //MHackEnd(msa); //msa.ToFile(fileOut); MuscleOutput(msa); return; } if (0 == g_pstrUseTreeFileName.get()) { g_bDiags.get() = g_bDiags2.get(); SetIter(2); if (g_bLow.get()) { if (0 != g_uMaxTreeRefineIters.get()) RefineTreeE(msa, v, GuideTree, ProgNodes); } else RefineTree(msa, GuideTree); const char *Tree2 = ValueOpt("Tree2"); if (0 != Tree2) { TextFile f(Tree2, true); 
GuideTree.ToFile(f); } } SetSeqWeightMethod(g_SeqWeight2.get()); SetMuscleTree(GuideTree); if (g_bAnchors.get()) RefineVert(msa, GuideTree, g_uMaxIters.get() - 2); else RefineHoriz(msa, GuideTree, g_uMaxIters.get() - 2, false, false); #if 0 // Refining by subfamilies is disabled as it didn't give better // results. I tried doing this before and after RefineHoriz. // Should get back to this as it seems like this should work. RefineSubfams(msa, GuideTree, g_uMaxIters.get() - 2); #endif ValidateMuscleIds(msa); ValidateMuscleIds(GuideTree); //TextFile fileOut(g_pstrOutFileName.get(), true); //MHackEnd(msa); //msa.ToFile(fileOut); MuscleOutput(msa); }
void RefineW(const MSA &msaIn, MSA &msaOut) { const unsigned uSeqCount = msaIn.GetSeqCount(); const unsigned uColCount = msaIn.GetColCount(); // Reserve same nr seqs, 20% more cols const unsigned uReserveColCount = (uColCount*120)/100; msaOut.SetSize(uSeqCount, uReserveColCount); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { msaOut.SetSeqName(uSeqIndex, msaIn.GetSeqName(uSeqIndex)); msaOut.SetSeqId(uSeqIndex, msaIn.GetSeqId(uSeqIndex)); } const unsigned uWindowCount = (uColCount + g_uRefineWindow.get() - 1)/g_uRefineWindow.get(); if (0 == g_uWindowTo.get()) g_uWindowTo.get() = uWindowCount - 1; #if MEMDEBUG _CrtSetBreakAlloc(1560); #endif if (g_uWindowOffset.get() > 0) { MSA msaTmp; MSAFromColRange(msaIn, 0, g_uWindowOffset.get(), msaOut); } if (!g_bQuiet.get()) fprintf(stderr, "\n"); for (unsigned uWindowIndex = g_uWindowFrom.get(); uWindowIndex <= g_uWindowTo.get(); ++uWindowIndex) { if (!g_bQuiet.get()) fprintf(stderr, "Window %d of %d \r", uWindowIndex, uWindowCount); const unsigned uColFrom = g_uWindowOffset.get() + uWindowIndex*g_uRefineWindow.get(); unsigned uColTo = uColFrom + g_uRefineWindow.get() - 1; if (uColTo >= uColCount) uColTo = uColCount - 1; assert(uColTo >= uColFrom); SeqVect v; SeqVectFromMSACols(msaIn, uColFrom, uColTo, v); #if MEMDEBUG _CrtMemState s1; _CrtMemCheckpoint(&s1); #endif // Begin AED 5/20/06 // remove any empty seqs in this window std::vector< size_t > empty_seqs; SeqVect vr; for( size_t seqI = 0; seqI < v.size(); ++seqI ) { if( v[seqI]->size() == 0 ) empty_seqs.push_back(seqI); else vr.push_back(v[seqI]); } std::vector< unsigned > seqid_map( vr.size() ); for( size_t seqI = 0; seqI < vr.size(); ++seqI ) { seqid_map[seqI] = vr[seqI]->GetId(); vr[seqI]->SetId(seqI); } MSA msaTmp; if( vr.size() > 1 ) MUSCLE(vr, msaTmp); // remap the seqids to their original state for( size_t seqI = 0; seqI < vr.size(); ++seqI ) vr[seqI]->SetId(seqid_map[seqI]); // merge empty seqs back in { const unsigned uSeqCount = 
msaOut.GetSeqCount(); const unsigned uColCount1 = msaOut.GetColCount(); const unsigned uColCount2 = vr.size() > 1 ? msaTmp.GetColCount() : vr[0]->size(); const unsigned uColCountCat = uColCount1 + uColCount2; for( unsigned seqI = 0; seqI < vr.size(); ++seqI ) { unsigned uSeqIndex = msaOut.GetSeqIndex(seqid_map[seqI]); if( vr.size() > 1 ) { unsigned uSeqIndex2 = msaTmp.GetSeqIndex(seqI); for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex) { const char c = msaTmp.GetChar(uSeqIndex2, uColIndex); msaOut.SetChar(uSeqIndex, uColCount1 + uColIndex, c); } }else{ for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex) { const char c = vr[0]->GetChar(uColIndex); msaOut.SetChar(uSeqIndex, uColCount1 + uColIndex, c); } } } for( unsigned seqI = 0; seqI < empty_seqs.size(); ++seqI ) { unsigned uSeqId2 = v[empty_seqs[seqI]]->GetId(); unsigned uSeqIndex = msaOut.GetSeqIndex(uSeqId2); for (unsigned uColIndex = 0; uColIndex < uColCount2; ++uColIndex) { msaOut.SetChar(uSeqIndex, uColCount1 + uColIndex, '-'); } } vr.clear(); } // AppendMSA(msaOut, msaTmp); // end AED 5/20/06 if (uWindowIndex == g_uSaveWindow.get()) { MSA msaInTmp; unsigned uOutCols = msaOut.GetColCount(); unsigned un = uColTo - uColFrom + 1; MSAFromColRange(msaIn, uColFrom, un, msaInTmp); char fn[256]; sprintf(fn, "win%d_inaln.tmp", uWindowIndex); TextFile fIn(fn, true); msaInTmp.ToFile(fIn); sprintf(fn, "win%d_inseqs.tmp", uWindowIndex); TextFile fv(fn, true); v.ToFile(fv); sprintf(fn, "win%d_outaln.tmp", uWindowIndex); TextFile fOut(fn, true); msaTmp.ToFile(fOut); } #if MEMDEBUG void FreeDPMemSPN(); FreeDPMemSPN(); _CrtMemState s2; _CrtMemCheckpoint(&s2); _CrtMemState s; _CrtMemDifference(&s, &s1, &s2); _CrtMemDumpStatistics(&s); _CrtMemDumpAllObjectsSince(&s1); exit(1); #endif //#if DEBUG // AssertMSAEqIgnoreCaseAndGaps(msaInTmp, msaTmp); //#endif } if (!g_bQuiet.get()) fprintf(stderr, "\n"); // AssertMSAEqIgnoreCaseAndGaps(msaIn, msaOut);//@@uncomment! }
// Top-level alignment driver that records into a caller-supplied
// CompositeVect.
//
// Reads the FASTA file named by g_pstrInFileName, selects the alphabet,
// optionally loads a user score matrix, builds (or loads) a guide tree, runs
// progressive alignment and, depending on g_uMaxIters, tree refinement
// followed by horizontal/vertical refinement. The final alignment is emitted
// through MuscleOutput().
//
// Parameters:
//   CVLocation  handed to MSA::SetCompositeVector() on the alignment object
//               before progressive alignment; not otherwise used here.
//
// Early exits:
//   - a single input sequence is written straight to g_pstrOutFileName;
//   - the Tree1 option together with g_bClusterOnly stops after writing the tree;
//   - g_pstrComputeWeightsFileName writes the weights and returns;
//   - g_uMaxIters == 1 or exactly two sequences skips the refinement stages.
void DoMuscle(CompositeVect*CVLocation)
{
    SetOutputFileName(g_pstrOutFileName);
    SetInputFileName(g_pstrInFileName);

    SetMaxIters(g_uMaxIters);
    SetSeqWeightMethod(g_SeqWeight1);

    TextFile fileIn(g_pstrInFileName);
    SeqVect v;
    v.FromFASTAFile(fileIn);
    const unsigned uSeqCount = v.Length();

    if (0 == uSeqCount)
        Quit("No sequences in input file");

    ALPHA Alpha = ALPHA_Undefined;
    switch (g_SeqType)
    {
    case SEQTYPE_Auto:
        Alpha = v.GuessAlpha();
        break;

    case SEQTYPE_Protein:
        Alpha = ALPHA_Amino;
        break;

    case SEQTYPE_DNA:
        Alpha = ALPHA_DNA;
        break;

    case SEQTYPE_RNA:
        Alpha = ALPHA_RNA;
        break;

    default:
        Quit("Invalid seq type");
    }
    SetAlpha(Alpha);
    v.FixAlpha();

    // Optional user-supplied score matrix. When MUSCLE_MXPATH is set the
    // matrix file name is resolved relative to that directory.
    PTR_SCOREMATRIX UserMatrix = 0;
    if (0 != g_pstrMatrixFileName)
    {
        const char *FileName = g_pstrMatrixFileName;
        const char *Path = getenv("MUSCLE_MXPATH");
        char *NewFileName = 0;
        if (Path != 0)
        {
            // Path + '/' + FileName + terminating NUL.
            size_t n = strlen(Path) + 1 + strlen(FileName) + 1;
            NewFileName = new char[n];
            sprintf(NewFileName, "%s/%s", Path, FileName);
            FileName = NewFileName;
        }
        {
            // Scoped so the TextFile is gone before the name buffer is freed.
            TextFile File(FileName);
            UserMatrix = ReadMx(File);
        }
        // Release the joined path (previously leaked). Assumes nothing keeps
        // the raw file-name pointer once the TextFile is destroyed -- TODO
        // confirm against TextFile/ReadMx.
        delete[] NewFileName;
        g_Alpha = ALPHA_Amino;
        g_PPScore = PPSCORE_SP;
    }

    SetPPScore();

    // The user matrix, if any, replaces the active score matrix after
    // SetPPScore() has run.
    if (0 != UserMatrix)
        g_ptrScoreMatrix = UserMatrix;

    // Collect the longest and the total sequence length for SetSeqStats().
    unsigned uMaxL = 0;
    unsigned uTotL = 0;
    for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
    {
        unsigned L = v.GetSeq(uSeqIndex).Length();
        uTotL += L;
        if (L > uMaxL)
            uMaxL = L;
    }

    SetIter(1);
    g_bDiags = g_bDiags1;
    SetSeqStats(uSeqCount, uMaxL, uTotL/uSeqCount);

    SetMuscleSeqVect(v);

    MSA::SetIdCount(uSeqCount);

    // Initialize sequence ids.
    // From this point on, ids must somehow propagate from here.
    for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
        v.SetSeqId(uSeqIndex, uSeqIndex);

    // NOTE(review): this repeats the zero-sequence check made right after
    // reading the input; presumably Quit() does not return, which would make
    // this branch unreachable -- confirm before removing.
    if (0 == uSeqCount)
        Quit("Input file '%s' has no sequences", g_pstrInFileName);
    if (1 == uSeqCount)
    {
        // Nothing to align: copy the single sequence to the output file.
        TextFile fileOut(g_pstrOutFileName, true);
        v.ToFile(fileOut);
        return;
    }

    if (uSeqCount > 1)
        MHackStart(v);

    // First iteration
    Tree GuideTree;
    if (0 != g_pstrUseTreeFileName)
    {
        // Discourage users...
        if (!g_bUseTreeNoWarn)
            fprintf(stderr, "%s", g_strUseTreeWarning);

        // Read tree from file
        TextFile TreeFile(g_pstrUseTreeFileName);
        GuideTree.FromFile(TreeFile);

        // Make sure tree is rooted
        if (!GuideTree.IsRooted())
            Quit("User tree must be rooted");

        if (GuideTree.GetLeafCount() != uSeqCount)
            Quit("User tree does not match input sequences");

        // Attach the sequence id to every leaf by matching leaf labels
        // against sequence names.
        const unsigned uNodeCount = GuideTree.GetNodeCount();
        for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex)
        {
            if (!GuideTree.IsLeaf(uNodeIndex))
                continue;
            const char *LeafName = GuideTree.GetLeafName(uNodeIndex);
            unsigned uSeqIndex;
            bool SeqFound = v.FindName(LeafName, &uSeqIndex);
            if (!SeqFound)
                Quit("Label %s in tree does not match sequences", LeafName);
            unsigned uId = v.GetSeqIdFromName(LeafName);
            GuideTree.SetLeafId(uNodeIndex, uId);
        }
    }
    else
        TreeFromSeqVect(v, GuideTree, g_Cluster1, g_Distance1, g_Root1,
          g_pstrDistMxFileName1);

    const char *Tree1 = ValueOpt("Tree1");
    if (0 != Tree1)
    {
        TextFile f(Tree1, true);
        GuideTree.ToFile(f);
        if (g_bClusterOnly)
            return;
    }

    SetMuscleTree(GuideTree);
    ValidateMuscleIds(GuideTree);

    MSA msa;
    msa.SetCompositeVector(CVLocation);
    ProgNode *ProgNodes = 0;
    if (g_bLow)
        ProgNodes = ProgressiveAlignE(v, GuideTree, msa);
    else
        ProgressiveAlign(v, GuideTree, msa);
    SetCurrentAlignment(msa);

    if (0 != g_pstrComputeWeightsFileName)
    {
        extern void OutWeights(const char *FileName, const MSA &msa);
        SetMSAWeightsMuscle(msa);
        OutWeights(g_pstrComputeWeightsFileName, msa);
        return;
    }

    ValidateMuscleIds(msa);

    if (1 == g_uMaxIters || 2 == uSeqCount)
    {
        //TextFile fileOut(g_pstrOutFileName, true);
        //MHackEnd(msa);
        //msa.ToFile(fileOut);
        MuscleOutput(msa);
        return;
    }

    // Second iteration: tree refinement, skipped when the user supplied a tree.
    if (0 == g_pstrUseTreeFileName)
    {
        g_bDiags = g_bDiags2;
        SetIter(2);

        if (g_bLow)
        {
            if (0 != g_uMaxTreeRefineIters)
                RefineTreeE(msa, v, GuideTree, ProgNodes);
        }
        else
            RefineTree(msa, GuideTree);

        const char *Tree2 = ValueOpt("Tree2");
        if (0 != Tree2)
        {
            TextFile f(Tree2, true);
            GuideTree.ToFile(f);
        }
    }

    SetSeqWeightMethod(g_SeqWeight2);
    SetMuscleTree(GuideTree);

    // The remaining iteration budget (g_uMaxIters - 2) goes to refinement.
    if (g_bAnchors)
        RefineVert(msa, GuideTree, g_uMaxIters - 2);
    else
        RefineHoriz(msa, GuideTree, g_uMaxIters - 2, false, false);

#if 0
    // Refining by subfamilies is disabled as it didn't give better
    // results. I tried doing this before and after RefineHoriz.
    // Should get back to this as it seems like this should work.
    RefineSubfams(msa, GuideTree, g_uMaxIters - 2);
#endif

    ValidateMuscleIds(msa);
    ValidateMuscleIds(GuideTree);

    //TextFile fileOut(g_pstrOutFileName, true);
    //MHackEnd(msa);
    //msa.ToFile(fileOut);
    MuscleOutput(msa);
}
void RefineW(const MSA &msaIn, MSA &msaOut) { const unsigned uSeqCount = msaIn.GetSeqCount(); const unsigned uColCount = msaIn.GetColCount(); // Reserve same nr seqs, 20% more cols const unsigned uReserveColCount = (uColCount*120)/100; msaOut.SetSize(uSeqCount, uReserveColCount); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { msaOut.SetSeqName(uSeqIndex, msaIn.GetSeqName(uSeqIndex)); msaOut.SetSeqId(uSeqIndex, msaIn.GetSeqId(uSeqIndex)); } const unsigned uWindowCount = (uColCount + g_uRefineWindow - 1)/g_uRefineWindow; if (0 == g_uWindowTo) g_uWindowTo = uWindowCount - 1; #if MEMDEBUG _CrtSetBreakAlloc(1560); #endif if (g_uWindowOffset > 0) { MSA msaTmp; MSAFromColRange(msaIn, 0, g_uWindowOffset, msaOut); } fprintf(stderr, "\n"); for (unsigned uWindowIndex = g_uWindowFrom; uWindowIndex <= g_uWindowTo; ++uWindowIndex) { fprintf(stderr, "Window %d of %d \r", uWindowIndex, uWindowCount); const unsigned uColFrom = g_uWindowOffset + uWindowIndex*g_uRefineWindow; unsigned uColTo = uColFrom + g_uRefineWindow - 1; if (uColTo >= uColCount) uColTo = uColCount - 1; assert(uColTo >= uColFrom); SeqVect v; SeqVectFromMSACols(msaIn, uColFrom, uColTo, v); #if MEMDEBUG _CrtMemState s1; _CrtMemCheckpoint(&s1); #endif MSA msaTmp; MUSCLE(v, msaTmp); AppendMSA(msaOut, msaTmp); if (uWindowIndex == g_uSaveWindow) { MSA msaInTmp; unsigned uOutCols = msaOut.GetColCount(); unsigned un = uColTo - uColFrom + 1; MSAFromColRange(msaIn, uColFrom, un, msaInTmp); char fn[256]; sprintf(fn, "win%d_inaln.tmp", uWindowIndex); TextFile fIn(fn, true); msaInTmp.ToFile(fIn); sprintf(fn, "win%d_inseqs.tmp", uWindowIndex); TextFile fv(fn, true); v.ToFile(fv); sprintf(fn, "win%d_outaln.tmp", uWindowIndex); TextFile fOut(fn, true); msaTmp.ToFile(fOut); } #if MEMDEBUG void FreeDPMemSPN(); FreeDPMemSPN(); _CrtMemState s2; _CrtMemCheckpoint(&s2); _CrtMemState s; _CrtMemDifference(&s, &s1, &s2); _CrtMemDumpStatistics(&s); _CrtMemDumpAllObjectsSince(&s1); exit(1); #endif //#if DEBUG // 
AssertMSAEqIgnoreCaseAndGaps(msaInTmp, msaTmp); //#endif } fprintf(stderr, "\n"); // AssertMSAEqIgnoreCaseAndGaps(msaIn, msaOut);//@@uncomment! }