Exemplo n.º 1
0
void DoMuscle()
	{
	SetOutputFileName(g_pstrOutFileName.get());
	SetInputFileName(g_pstrInFileName.get());

	SetMaxIters(g_uMaxIters.get());
	SetSeqWeightMethod(g_SeqWeight1.get());

	TextFile fileIn(g_pstrInFileName.get());
	SeqVect v;
	v.FromFASTAFile(fileIn);
	const unsigned uSeqCount = v.Length();

	if (0 == uSeqCount)
		Quit("No sequences in input file");

	ALPHA Alpha = ALPHA_Undefined;
	switch (g_SeqType.get())
		{
	case SEQTYPE_Auto:
		Alpha = v.GuessAlpha();
		break;

	case SEQTYPE_Protein:
		Alpha = ALPHA_Amino;
		break;

	case SEQTYPE_DNA:
		Alpha = ALPHA_DNA;
		break;

	case SEQTYPE_RNA:
		Alpha = ALPHA_RNA;
		break;

	default:
		Quit("Invalid seq type");
		}
	SetAlpha(Alpha);
	v.FixAlpha();

//
// AED 21/12/06: Moved matrix loading code inside the PP param function so it gets called for all alignment types
//
	SetPPScore();


	unsigned uMaxL = 0;
	unsigned uTotL = 0;
	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
		{
		unsigned L = v.GetSeq(uSeqIndex).Length();
		uTotL += L;
		if (L > uMaxL)
			uMaxL = L;
		}

	SetIter(1);
	g_bDiags.get() = g_bDiags1.get();
	SetSeqStats(uSeqCount, uMaxL, uTotL/uSeqCount);

	SetMuscleSeqVect(v);

	MSA::SetIdCount(uSeqCount);

// Initialize sequence ids.
// From this point on, ids must somehow propogate from here.
	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
		v.SetSeqId(uSeqIndex, uSeqIndex);

	if (0 == uSeqCount)
		Quit("Input file '%s' has no sequences", g_pstrInFileName.get());
	if (1 == uSeqCount)
		{
		TextFile fileOut(g_pstrOutFileName.get(), true);
		v.ToFile(fileOut);
		return;
		}

	if (uSeqCount > 1)
		MHackStart(v);

// First iteration
	Tree GuideTree;
	if (0 != g_pstrUseTreeFileName.get())
		{
	// Discourage users...
		if (!g_bUseTreeNoWarn.get())
			fprintf(stderr, g_strUseTreeWarning);

	// Read tree from file
		TextFile TreeFile(g_pstrUseTreeFileName.get());
		GuideTree.FromFile(TreeFile);

	// Make sure tree is rooted
		if (!GuideTree.IsRooted())
			Quit("User tree must be rooted");

		if (GuideTree.GetLeafCount() != uSeqCount)
			Quit("User tree does not match input sequences");

		const unsigned uNodeCount = GuideTree.GetNodeCount();
		for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex)
			{
			if (!GuideTree.IsLeaf(uNodeIndex))
				continue;
			const char *LeafName = GuideTree.GetLeafName(uNodeIndex);
			unsigned uSeqIndex;
			bool SeqFound = v.FindName(LeafName, &uSeqIndex);
			if (!SeqFound)
				Quit("Label %s in tree does not match sequences", LeafName);
			unsigned uId = v.GetSeqIdFromName(LeafName);
			GuideTree.SetLeafId(uNodeIndex, uId);
			}
		}
	else
		TreeFromSeqVect(v, GuideTree, g_Cluster1.get(), g_Distance1.get(), g_Root1.get(),
		  g_pstrDistMxFileName1.get());

	const char *Tree1 = ValueOpt("Tree1");
	if (0 != Tree1)
		{
		TextFile f(Tree1, true);
		GuideTree.ToFile(f);
		if (g_bClusterOnly.get())
			return;
		}

	SetMuscleTree(GuideTree);
	ValidateMuscleIds(GuideTree);

	MSA msa;
	ProgNode *ProgNodes = 0;
	if (g_bLow.get())
		ProgNodes = ProgressiveAlignE(v, GuideTree, msa);
	else
		ProgressiveAlign(v, GuideTree, msa);
	SetCurrentAlignment(msa);

	if (0 != g_pstrComputeWeightsFileName.get())
		{
		extern void OutWeights(const char *FileName, const MSA &msa);
		SetMSAWeightsMuscle(msa);
		OutWeights(g_pstrComputeWeightsFileName.get(), msa);
		return;
		}

	ValidateMuscleIds(msa);

	if (1 == g_uMaxIters.get() || 2 == uSeqCount)
		{
		//TextFile fileOut(g_pstrOutFileName.get(), true);
		//MHackEnd(msa);
		//msa.ToFile(fileOut);
		MuscleOutput(msa);
		return;
		}

	if (0 == g_pstrUseTreeFileName.get())
		{
		g_bDiags.get() = g_bDiags2.get();
		SetIter(2);

		if (g_bLow.get())
			{
			if (0 != g_uMaxTreeRefineIters.get())
				RefineTreeE(msa, v, GuideTree, ProgNodes);
			}
		else
			RefineTree(msa, GuideTree);

		const char *Tree2 = ValueOpt("Tree2");
		if (0 != Tree2)
			{
			TextFile f(Tree2, true);
			GuideTree.ToFile(f);
			}
		}

	SetSeqWeightMethod(g_SeqWeight2.get());
	SetMuscleTree(GuideTree);

	if (g_bAnchors.get())
		RefineVert(msa, GuideTree, g_uMaxIters.get() - 2);
	else
		RefineHoriz(msa, GuideTree, g_uMaxIters.get() - 2, false, false);

#if	0
// Refining by subfamilies is disabled as it didn't give better
// results. I tried doing this before and after RefineHoriz.
// Should get back to this as it seems like this should work.
	RefineSubfams(msa, GuideTree, g_uMaxIters.get() - 2);
#endif

	ValidateMuscleIds(msa);
	ValidateMuscleIds(GuideTree);

	//TextFile fileOut(g_pstrOutFileName.get(), true);
	//MHackEnd(msa);
	//msa.ToFile(fileOut);
	MuscleOutput(msa);
	}
Exemplo n.º 2
0
void DoMuscle(CompositeVect*CVLocation)
	{
	SetOutputFileName(g_pstrOutFileName);
	SetInputFileName(g_pstrInFileName);

	SetMaxIters(g_uMaxIters);
	SetSeqWeightMethod(g_SeqWeight1);

	TextFile fileIn(g_pstrInFileName);
	SeqVect v;
	v.FromFASTAFile(fileIn);
	const unsigned uSeqCount = v.Length();

	if (0 == uSeqCount)
		Quit("No sequences in input file");

	ALPHA Alpha = ALPHA_Undefined;
	switch (g_SeqType)
		{
	case SEQTYPE_Auto:
		Alpha = v.GuessAlpha();
		break;

	case SEQTYPE_Protein:
		Alpha = ALPHA_Amino;
		break;

	case SEQTYPE_DNA:
		Alpha = ALPHA_DNA;
		break;

	case SEQTYPE_RNA:
		Alpha = ALPHA_RNA;
		break;

	default:
		Quit("Invalid seq type");
		}
	SetAlpha(Alpha);
	v.FixAlpha();

	PTR_SCOREMATRIX UserMatrix = 0;
	if (0 != g_pstrMatrixFileName)
		{
		const char *FileName = g_pstrMatrixFileName;
		const char *Path = getenv("MUSCLE_MXPATH");
		if (Path != 0)
			{
			size_t n = strlen(Path) + 1 + strlen(FileName) + 1;
			char *NewFileName = new char[n];
			sprintf(NewFileName, "%s/%s", Path, FileName);
			FileName = NewFileName;
			}
		TextFile File(FileName);
		UserMatrix = ReadMx(File);
		g_Alpha = ALPHA_Amino;
		g_PPScore = PPSCORE_SP;
		}

	SetPPScore();

	if (0 != UserMatrix)
		g_ptrScoreMatrix = UserMatrix;

	unsigned uMaxL = 0;
	unsigned uTotL = 0;
	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
		{
		unsigned L = v.GetSeq(uSeqIndex).Length();
		uTotL += L;
		if (L > uMaxL)
			uMaxL = L;
		}

	SetIter(1);
	g_bDiags = g_bDiags1;
	SetSeqStats(uSeqCount, uMaxL, uTotL/uSeqCount);

	SetMuscleSeqVect(v);

	MSA::SetIdCount(uSeqCount);

// Initialize sequence ids.
// From this point on, ids must somehow propogate from here.
	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
		v.SetSeqId(uSeqIndex, uSeqIndex);

	if (0 == uSeqCount)
		Quit("Input file '%s' has no sequences", g_pstrInFileName);
	if (1 == uSeqCount)
		{
		TextFile fileOut(g_pstrOutFileName, true);
		v.ToFile(fileOut);
		return;
		}

	if (uSeqCount > 1)
		MHackStart(v);

// First iteration
	Tree GuideTree;
	if (0 != g_pstrUseTreeFileName)
	{
	// Discourage users...
		if (!g_bUseTreeNoWarn)
			fprintf(stderr, "%s", g_strUseTreeWarning);

	// Read tree from file
		TextFile TreeFile(g_pstrUseTreeFileName);
		GuideTree.FromFile(TreeFile);

	// Make sure tree is rooted
		if (!GuideTree.IsRooted())
			Quit("User tree must be rooted");

		if (GuideTree.GetLeafCount() != uSeqCount)
			Quit("User tree does not match input sequences");

		const unsigned uNodeCount = GuideTree.GetNodeCount();
		for (unsigned uNodeIndex = 0; uNodeIndex < uNodeCount; ++uNodeIndex)
			{
			if (!GuideTree.IsLeaf(uNodeIndex))
				continue;
			const char *LeafName = GuideTree.GetLeafName(uNodeIndex);
			unsigned uSeqIndex;
			bool SeqFound = v.FindName(LeafName, &uSeqIndex);
			if (!SeqFound)
				Quit("Label %s in tree does not match sequences", LeafName);
			unsigned uId = v.GetSeqIdFromName(LeafName);
			GuideTree.SetLeafId(uNodeIndex, uId);
			}
		}
	else
		TreeFromSeqVect(v, GuideTree, g_Cluster1, g_Distance1, g_Root1,
		  g_pstrDistMxFileName1);

	const char *Tree1 = ValueOpt("Tree1");
	if (0 != Tree1)
		{
		TextFile f(Tree1, true);
		GuideTree.ToFile(f);
		if (g_bClusterOnly)
			return;
		}

	SetMuscleTree(GuideTree);
	ValidateMuscleIds(GuideTree);

	MSA msa;
	msa.SetCompositeVector(CVLocation);
	ProgNode *ProgNodes = 0;
	if (g_bLow)
		ProgNodes = ProgressiveAlignE(v, GuideTree, msa);
	else
		ProgressiveAlign(v, GuideTree, msa);
	SetCurrentAlignment(msa);

	if (0 != g_pstrComputeWeightsFileName)
		{
		extern void OutWeights(const char *FileName, const MSA &msa);
		SetMSAWeightsMuscle(msa);
		OutWeights(g_pstrComputeWeightsFileName, msa);
		return;
		}

	ValidateMuscleIds(msa);

	if (1 == g_uMaxIters || 2 == uSeqCount)
		{
		//TextFile fileOut(g_pstrOutFileName, true);
		//MHackEnd(msa);
		//msa.ToFile(fileOut);
		MuscleOutput(msa);
		return;
		}

	if (0 == g_pstrUseTreeFileName)
		{
		g_bDiags = g_bDiags2;
		SetIter(2);

		if (g_bLow)
			{
			if (0 != g_uMaxTreeRefineIters)
				RefineTreeE(msa, v, GuideTree, ProgNodes);
			}
		else
			RefineTree(msa, GuideTree);

		const char *Tree2 = ValueOpt("Tree2");
		if (0 != Tree2)
			{
			TextFile f(Tree2, true);
			GuideTree.ToFile(f);
			}
		}

	SetSeqWeightMethod(g_SeqWeight2);
	SetMuscleTree(GuideTree);

	if (g_bAnchors)
		RefineVert(msa, GuideTree, g_uMaxIters - 2);
	else
		RefineHoriz(msa, GuideTree, g_uMaxIters - 2, false, false);

#if	0
// Refining by subfamilies is disabled as it didn't give better
// results. I tried doing this before and after RefineHoriz.
// Should get back to this as it seems like this should work.
	RefineSubfams(msa, GuideTree, g_uMaxIters - 2);
#endif

	ValidateMuscleIds(msa);
	ValidateMuscleIds(GuideTree);

	//TextFile fileOut(g_pstrOutFileName, true);
	//MHackEnd(msa);
	//msa.ToFile(fileOut);
	MuscleOutput(msa);
	}
Exemplo n.º 3
0
void Refine()
	{
	SetOutputFileName(g_pstrOutFileName.get());
	SetInputFileName(g_pstrInFileName.get());
	SetStartTime();

	SetMaxIters(g_uMaxIters.get());
	SetSeqWeightMethod(g_SeqWeight1.get());

	TextFile fileIn(g_pstrInFileName.get());
	MSA msa;
	msa.FromFile(fileIn);

	const unsigned uSeqCount = msa.GetSeqCount();
	if (0 == uSeqCount)
		Quit("No sequences in input file");

	ALPHA Alpha = ALPHA_Undefined;
	switch (g_SeqType.get())
		{
	case SEQTYPE_Auto:
		Alpha = msa.GuessAlpha();
		break;

	case SEQTYPE_Protein:
		Alpha = ALPHA_Amino;
		break;

	case SEQTYPE_DNA:
		Alpha = ALPHA_DNA;
		break;

	case SEQTYPE_RNA:
		Alpha = ALPHA_RNA;
		break;

	default:
		Quit("Invalid SeqType");
		}
	SetAlpha(Alpha);
	msa.FixAlpha();

	SetPPScore();
	if (ALPHA_DNA == Alpha || ALPHA_RNA == Alpha)
		SetPPScore(PPSCORE_SPN);

	MSA::SetIdCount(uSeqCount);

// Initialize sequence ids.
// From this point on, ids must somehow propogate from here.
	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
		msa.SetSeqId(uSeqIndex, uSeqIndex);
	SetMuscleInputMSA(msa);

	Tree GuideTree;
	TreeFromMSA(msa, GuideTree, g_Cluster2.get(), g_Distance2.get(), g_Root2.get());
	SetMuscleTree(GuideTree);

	if (g_bAnchors.get())
		RefineVert(msa, GuideTree, g_uMaxIters.get());
	else
		RefineHoriz(msa, GuideTree, g_uMaxIters.get(), false, false);

	ValidateMuscleIds(msa);
	ValidateMuscleIds(GuideTree);

//	TextFile fileOut(g_pstrOutFileName.get(), true);
//	msa.ToFile(fileOut);
	MuscleOutput(msa);
	}
Exemplo n.º 4
0
bool RefineSubfams(MSA &msa, const Tree &tree, unsigned uIters)
	{
    MuscleContext *ctx = getMuscleContext();
    CLUSTER &g_Cluster2 = ctx->params.g_Cluster2;
    DISTANCE &g_Distance2 =  ctx->params.g_Distance2;
    ROOT &g_Root2 = ctx->params.g_Root2;
    bool &g_bAnchors = ctx->params.g_bAnchors;

	const unsigned uSeqCount = msa.GetSeqCount();
	if (uSeqCount < 3)
		return false;

	const double dMaxHeight = 0.6;
	const unsigned uMaxSubfamCount = 16;
    //const unsigned uNodeCount = tree.GetNodeCount();

	unsigned *Subfams;
	unsigned uSubfamCount;
	GetSubfams(tree, dMaxHeight, uMaxSubfamCount, &Subfams, &uSubfamCount);
	assert(uSubfamCount <= uSeqCount);

	if (ctx->params.g_bVerbose)
		LogSubfams(tree, Subfams, uSubfamCount);

	MSA *SubfamMSAs = new MSA[uSubfamCount];
	unsigned *Leaves = new unsigned[uSeqCount];
	unsigned *Ids = new unsigned[uSeqCount];

	bool bAnyChanges = false;
	for (unsigned uSubfamIndex = 0; uSubfamIndex < uSubfamCount; ++uSubfamIndex)
		{
		unsigned uSubfam = Subfams[uSubfamIndex];
		unsigned uLeafCount;
		GetLeaves(tree, uSubfam, Leaves, &uLeafCount);
		assert(uLeafCount <= uSeqCount);

		LeafIndexesToIds(tree, Leaves, uLeafCount, Ids);

		MSA &msaSubfam = SubfamMSAs[uSubfamIndex];
		MSASubsetByIds(msa, Ids, uLeafCount, msaSubfam);
		DeleteGappedCols(msaSubfam);

#if	TRACE
		Log("Subfam %u MSA=\n", uSubfamIndex);
		msaSubfam.LogMe();
#endif

		if (msaSubfam.GetSeqCount() <= 2)
			continue;

	// TODO /////////////////////////////////////////
	// Try using existing tree, may actually hurt to
	// re-estimate, may also be a waste of CPU & mem.
	/////////////////////////////////////////////////
		Tree SubfamTree;
		TreeFromMSA(msaSubfam, SubfamTree, g_Cluster2, g_Distance2, g_Root2);

		bool bAnyChangesThisSubfam;
		if (g_bAnchors)
			bAnyChangesThisSubfam = RefineVert(msaSubfam, SubfamTree, uIters);
		else
			bAnyChangesThisSubfam = RefineHoriz(msaSubfam, SubfamTree, uIters, false, false);
#if	TRACE
		Log("Subfam %u Changed %d\n", uSubfamIndex, bAnyChangesThisSubfam);
#endif
		if (bAnyChangesThisSubfam)
			bAnyChanges = true;
		}

	if (bAnyChanges)
		ProgressiveAlignSubfams(tree, Subfams, uSubfamCount, SubfamMSAs, msa);

	delete[] Leaves;
	delete[] Subfams;
	delete[] SubfamMSAs;

	return bAnyChanges;
	}
Exemplo n.º 5
0
// Return true if any changes made
bool RefineVert(MSA &msaIn, const Tree &tree, unsigned uIters)
	{
	bool bAnyChanges = false;

	const unsigned uColCountIn = msaIn.GetColCount();
	const unsigned uSeqCountIn = msaIn.GetSeqCount();

	if (uColCountIn < 3 || uSeqCountIn < 3)
		return false;

	unsigned *AnchorCols = new unsigned[uColCountIn];
	unsigned uAnchorColCount;
	SetMSAWeightsMuscle(msaIn);
	FindAnchorCols(msaIn, AnchorCols, &uAnchorColCount);

	const unsigned uRangeCount = uAnchorColCount + 1;
	Range *Ranges = new Range[uRangeCount];

#if	TRACE
	Log("%u ranges\n", uRangeCount);
#endif

	ColsToRanges(AnchorCols, uAnchorColCount, uColCountIn, Ranges);
	ListVertSavings(uColCountIn, uAnchorColCount, Ranges, uRangeCount);

#if	TRACE
	{
	Log("Anchor cols: ");
	for (unsigned i = 0; i < uAnchorColCount; ++i)
		Log(" %u", AnchorCols[i]);
	Log("\n");

	Log("Ranges:\n");
	for (unsigned i = 0; i < uRangeCount; ++i)
		Log("%4u - %4u\n", Ranges[i].m_uBestColLeft, Ranges[i].m_uBestColRight);
	}
#endif

	delete[] AnchorCols;

	MSA msaOut;
	msaOut.SetSize(uSeqCountIn, 0);

	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCountIn; ++uSeqIndex)
		{
		const char *ptrName = msaIn.GetSeqName(uSeqIndex);
		unsigned uId = msaIn.GetSeqId(uSeqIndex);
		msaOut.SetSeqName(uSeqIndex, ptrName);
		msaOut.SetSeqId(uSeqIndex, uId);
		}

	for (unsigned uRangeIndex = 0; uRangeIndex < uRangeCount; ++uRangeIndex)
		{
		MSA msaRange;

		const Range &r = Ranges[uRangeIndex];

		const unsigned uFromColIndex = r.m_uBestColLeft;
		const unsigned uRangeColCount = r.m_uBestColRight - uFromColIndex;

		if (0 == uRangeColCount)
			continue;
		else if (1 == uRangeColCount)
			{
			MSAFromColRange(msaIn, uFromColIndex, 1, msaRange);
			MSAAppend(msaOut, msaRange);
			continue;
			}
		MSAFromColRange(msaIn, uFromColIndex, uRangeColCount, msaRange);

#if	TRACE
		Log("\n-------------\n");
		Log("Range %u - %u count=%u\n", r.m_uBestColLeft, r.m_uBestColRight, uRangeColCount);
		Log("Before:\n");
		msaRange.LogMe();
#endif

		bool bLockLeft = (0 != uRangeIndex);
		bool bLockRight = (uRangeCount - 1 != uRangeIndex);
		bool bAnyChangesThisBlock = RefineHoriz(msaRange, tree, uIters, bLockLeft, bLockRight);
		bAnyChanges = (bAnyChanges || bAnyChangesThisBlock);

#if	TRACE
		Log("After:\n");
		msaRange.LogMe();
#endif

		MSAAppend(msaOut, msaRange);

#if	TRACE
		Log("msaOut after Cat:\n");
		msaOut.LogMe();
#endif
		}

#if	DEBUG
// Sanity check
	AssertMSAEqIgnoreCaseAndGaps(msaIn, msaOut);
#endif

	delete[] Ranges;
	if (bAnyChanges)
		msaIn.Copy(msaOut);
	return bAnyChanges;
	}