Пример #1
0
// Records Score for the given iteration / node / side and reports whether the
// same score was already recorded for that node and side in an earlier
// iteration.
//   uIter       iteration index; Quit() is called unless uIter < m_uIters
//   uNodeIndex  node index; Quit() is called unless uNodeIndex < m_uNodeCount
//   bRight      selects which of the node's two slots is addressed
//   Score       score to record
// Returns true (without storing anything) when Score equals the score stored
// for any of the iterations 0 .. uIter-2; otherwise stores it and returns false.
// The immediately preceding iteration (uIter-1) is not compared.
bool ScoreHistory::SetScore(unsigned uIter, unsigned uNodeIndex, bool bRight, SCORE Score)
	{
#if	TRACE
	Log("ScoreHistory::SetScore(Iter=%u Node=%u Right=%d Score=%g)\n",
	  uIter, uNodeIndex, bRight, Score);
#endif
	if (m_uIters <= uIter)
		Quit("ScoreHistory::SetScore-1");
	if (m_uNodeCount <= uNodeIndex)
		Quit("ScoreHistory::SetScore-2");

	// Two slots per node: 2*node for bRight == false, 2*node + 1 otherwise.
	const unsigned uSlot = 2*uNodeIndex + (bRight ? 1u : 0u);

	for (unsigned uEarlier = 0; uEarlier + 1 < uIter; ++uEarlier)
		{
		// Every earlier iteration that is inspected must already hold a score.
		if (!m_bScoreSet[uEarlier][uSlot])
			{
			LogMe();
			Quit("ScoreHistory::SetScore-3");
			}
		if (m_Score[uEarlier][uSlot] != Score)
			continue;

		// Same score seen before for this slot: report it to the caller.
		ProgressStepsDone();
#if	TRACE
		Log("Oscillating\n");
#endif
		return true;
		}

	m_bScoreSet[uIter][uSlot] = true;
	m_Score[uIter][uSlot] = Score;
	return false;
	}
Пример #2
0
// Rebuilds a tree from the current alignment, compares it with the existing
// tree and, if the root of the new tree is marked NODE_CHANGED, realigns via
// RealignDiffsE and replaces both `tree` and `msa` with the new results.
//   msa        current alignment; replaced in place when a realignment is done
//   v          input sequences, forwarded to RealignDiffsE
//   tree       current tree; replaced in place when a realignment is done
//   ProgNodes  progressive-alignment nodes, forwarded to RealignDiffsE
// Calls Quit() if the tree's leaf count differs from the alignment's sequence
// count. Does nothing for fewer than three sequences. Throws
// MuscleException("Canceled") if the context reports cancellation.
void RefineTreeE(MSA &msa, const SeqVect &v, Tree &tree, ProgNode *ProgNodes)
	{
	MuscleContext *ctx = getMuscleContext();
	const unsigned uSeqCount = msa.GetSeqCount();
	if (uSeqCount != tree.GetLeafCount())
		Quit("Refine tree, tree has different number of nodes");

	if (uSeqCount < 3)
		return;

#if	DEBUG
	ValidateMuscleIds(msa);
	ValidateMuscleIds(tree);
#endif

	// Mapping filled in by DiffTreesE, one entry per node of the old tree's count.
	unsigned *NewToOld = new unsigned[tree.GetNodeCount()];

	Tree NewTree;
	TreeFromMSA(msa, NewTree, ctx->params.g_Cluster2, ctx->params.g_Distance2,
	  ctx->params.g_Root2, ctx->params.g_pstrDistMxFileName2);

#if	DEBUG
	ValidateMuscleIds(NewTree);
#endif

	DiffTreesE(NewTree, tree, NewToOld);

	const unsigned uNewRoot = NewTree.GetRootNodeIndex();
	if (NewToOld[uNewRoot] == NODE_CHANGED)
		{
		MSA msaRealigned;
		RealignDiffsE(msa, v, NewTree, tree, NewToOld, msaRealigned, ProgNodes);

		// Only commit the new tree / alignment if the run was not canceled.
		if (!ctx->isCanceled())
			{
			tree.Copy(NewTree);
			msa.Copy(msaRealigned);
#if	DEBUG
			ValidateMuscleIds(msaRealigned);
#endif
			}
		}

	// Release the mapping before the cancellation throw below.
	delete[] NewToOld;

	if (ctx->isCanceled())
		throw MuscleException("Canceled");

	SetCurrentAlignment(msa);
	ProgressStepsDone();
	}
Пример #3
0
// Fills DF with a distance for every unordered pair of distinct sequences in v.
// Each pair is aligned with AlignTwoMSAs (each sequence wrapped in a
// single-row MSA) and the distance is GetScoreDist() between rows 0 and 1 of
// the resulting two-row alignment.
//   v   input sequences
//   DF  receives the distances; resized to v.Length() entries
// The sequence weighting method is switched to SEQWEIGHT_Henikoff for the
// duration of the call and restored before returning.
void DistPWScoreDist(const SeqVect &v, DistFunc &DF)
	{
	SEQWEIGHT SeqWeightSave = GetSeqWeightMethod();
	SetSeqWeightMethod(SEQWEIGHT_Henikoff);

	const unsigned uSeqCount = v.Length();
	DF.SetCount(uSeqCount);

	// The loops below visit only pairs with uSeqIndex2 < uSeqIndex1, i.e.
	// n*(n-1)/2 pairs (the diagonal is skipped), so that is the progress total.
	const unsigned uPairCount =
	  uSeqCount > 1 ? (uSeqCount*(uSeqCount - 1))/2 : 0;
	unsigned uCount = 0;
	SetProgressDesc("PW ScoreDist");
	for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1)
		{
		const Seq &s1 = v.GetSeq(uSeqIndex1);
		MSA msa1;
		msa1.FromSeq(s1);
		for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqIndex1; ++uSeqIndex2)
			{
			// Report progress every 20 pairs.
			if (0 == uCount%20)
				Progress(uCount, uPairCount);
			++uCount;
			const Seq &s2 = v.GetSeq(uSeqIndex2);
			MSA msa2;
			msa2.FromSeq(s2);

			PWPath Path;
			MSA msaOut;
			AlignTwoMSAs(msa1, msa2, msaOut, Path, false, false);

			float d = (float) GetScoreDist(msaOut, 0, 1);
			DF.SetDist(uSeqIndex1, uSeqIndex2, d);
			}
		}
	ProgressStepsDone();

	SetSeqWeightMethod(SeqWeightSave);
	}
Пример #4
0
// Progressive alignment along GuideTree in depth-first order.
// Leaves get a one-sequence MSA plus a profile built from it; internal nodes
// are produced by aligning the two child profiles with AlignTwoProfs and
// storing the resulting path, profile, length and edit strings in the parent.
// The final alignment is then assembled into `a` by MakeRootMSA (or
// MakeRootMSABrenner when g_bBrenner is set).
//   v          input sequences, indexed by the leaf ids stored in GuideTree
//   GuideTree  guide tree; asserted to be rooted
//   a          receives the root alignment
// Returns the new[]-allocated array of 2*uSeqCount - 1 ProgNodes, indexed by
// tree node index. It is not freed here, so the caller owns it (delete[]).
// NOTE(review): for uSeqCount == 0 the unsigned expression 2*uSeqCount - 1
// wraps around; callers presumably never pass an empty vector - confirm.
ProgNode *ProgressiveAlignE(const SeqVect &v, const Tree &GuideTree, MSA &a)
	{
	assert(GuideTree.IsRooted());

#if	TRACE
	Log("GuideTree:\n");
	GuideTree.LogMe();
#endif

	const unsigned uSeqCount = v.Length();
	const unsigned uNodeCount = 2*uSeqCount - 1;
	// NOTE(review): uIterCount is not referenced anywhere below.
	const unsigned uIterCount = uSeqCount - 1;

	// Per-sequence weights, indexed by sequence id; freed before returning.
	WEIGHT *Weights = new WEIGHT[uSeqCount];
	CalcClustalWWeights(GuideTree, Weights);

	ProgNode *ProgNodes = new ProgNode[uNodeCount];

	// uJoin counts internal nodes processed so far (progress reporting only).
	unsigned uJoin = 0;
	unsigned uTreeNodeIndex = GuideTree.FirstDepthFirstNode();
	SetProgressDesc("Align node");
	do
		{
		if (GuideTree.IsLeaf(uTreeNodeIndex))
			{
			// Leaf: wrap the sequence in a one-row MSA and build its profile.
			if (uTreeNodeIndex >= uNodeCount)
				Quit("TreeNodeIndex=%u NodeCount=%u\n", uTreeNodeIndex, uNodeCount);
			ProgNode &Node = ProgNodes[uTreeNodeIndex];
			unsigned uId = GuideTree.GetLeafId(uTreeNodeIndex);
			if (uId >= uSeqCount)
				Quit("Seq index out of range");
			const Seq &s = *(v[uId]);
			Node.m_MSA.FromSeq(s);
			Node.m_MSA.SetSeqId(0, uId);
			Node.m_uLength = Node.m_MSA.GetColCount();
			Node.m_Weight = Weights[uId];
		// TODO: Term gaps settable
			Node.m_Prof = ProfileFromMSA(Node.m_MSA);
			Node.m_EstringL = 0;
			Node.m_EstringR = 0;
#if	TRACE
			Log("Leaf id=%u\n", uId);
			Log("MSA=\n");
			Node.m_MSA.LogMe();
			Log("Profile (from MSA)=\n");
			ListProfile(Node.m_Prof, Node.m_uLength, &Node.m_MSA);
#endif
			}
		else
			{
			// Internal node: align the profiles of the two children.
			Progress(uJoin, uSeqCount - 1);
			++uJoin;

			const unsigned uMergeNodeIndex = uTreeNodeIndex;
			ProgNode &Parent = ProgNodes[uMergeNodeIndex];

			const unsigned uLeft = GuideTree.GetLeft(uTreeNodeIndex);
			const unsigned uRight = GuideTree.GetRight(uTreeNodeIndex);

			if (g_bVerbose)
				{
				Log("Align: (");
				LogLeafNames(GuideTree, uLeft);
				Log(") (");
				LogLeafNames(GuideTree, uRight);
				Log(")\n");
				}

			ProgNode &Node1 = ProgNodes[uLeft];
			ProgNode &Node2 = ProgNodes[uRight];

#if	TRACE
			Log("AlignTwoMSAs:\n");
#endif
			AlignTwoProfs(
			  Node1.m_Prof, Node1.m_uLength, Node1.m_Weight,
			  Node2.m_Prof, Node2.m_uLength, Node2.m_Weight,
			  Parent.m_Path,
			  &Parent.m_Prof, &Parent.m_uLength);
#if	TRACE_LENGTH_DELTA
			{
			// Logs child lengths, path length and the growth over the longer child.
			unsigned L = Node1.m_uLength;
			unsigned R = Node2.m_uLength;
			unsigned P = Parent.m_Path.GetEdgeCount();
			unsigned Max = L > R ? L : R;
			unsigned d = P - Max;
			Log("LD%u;%u;%u;%u\n", L, R, P, d);
			}
#endif
			// Convert the alignment path into the parent's left/right edit strings.
			PathToEstrings(Parent.m_Path, &Parent.m_EstringL, &Parent.m_EstringR);

			Parent.m_Weight = Node1.m_Weight + Node2.m_Weight;

#if	VALIDATE
			{
			// Cross-check: redo the merge on full MSAs and compare the profiles.
#if	TRACE
			Log("AlignTwoMSAs:\n");
#endif
			PWPath TmpPath;
			AlignTwoMSAs(Node1.m_MSA, Node2.m_MSA, Parent.m_MSA, TmpPath);
			ProfPos *P1 = ProfileFromMSA(Node1.m_MSA, true);
			ProfPos *P2 = ProfileFromMSA(Node2.m_MSA, true);
			unsigned uLength = Parent.m_MSA.GetColCount();
			ProfPos *TmpProf = ProfileFromMSA(Parent.m_MSA, true);

#if	TRACE
			Log("Node1 MSA=\n");
			Node1.m_MSA.LogMe();

			Log("Node1 prof=\n");
			ListProfile(Node1.m_Prof, Node1.m_MSA.GetColCount(), &Node1.m_MSA);
			Log("Node1 prof (from MSA)=\n");
			ListProfile(P1, Node1.m_MSA.GetColCount(), &Node1.m_MSA);

			AssertProfsEq(Node1.m_Prof, Node1.m_uLength, P1, Node1.m_MSA.GetColCount());

			Log("Node2 prof=\n");
			ListProfile(Node2.m_Prof, Node2.m_MSA.GetColCount(), &Node2.m_MSA);

			Log("Node2 MSA=\n");
			Node2.m_MSA.LogMe();

			Log("Node2 prof (from MSA)=\n");
			ListProfile(P2, Node2.m_MSA.GetColCount(), &Node2.m_MSA);

			AssertProfsEq(Node2.m_Prof, Node2.m_uLength, P2, Node2.m_MSA.GetColCount());

			TmpPath.AssertEqual(Parent.m_Path);

			Log("Parent MSA=\n");
			Parent.m_MSA.LogMe();

			Log("Parent prof=\n");
			ListProfile(Parent.m_Prof, Parent.m_uLength, &Parent.m_MSA);

			Log("Parent prof (from MSA)=\n");
			ListProfile(TmpProf, Parent.m_MSA.GetColCount(), &Parent.m_MSA);

#endif	// TRACE
			AssertProfsEq(Parent.m_Prof, Parent.m_uLength,
			  TmpProf, Parent.m_MSA.GetColCount());
			delete[] P1;
			delete[] P2;
			delete[] TmpProf;
			}
#endif	// VALIDATE

			// The children's MSAs are no longer needed once the parent is built.
			Node1.m_MSA.Clear();
			Node2.m_MSA.Clear();

		// Don't delete profiles, may need them for tree refinement.
			//delete[] Node1.m_Prof;
			//delete[] Node2.m_Prof;
			//Node1.m_Prof = 0;
			//Node2.m_Prof = 0;
			}
		uTreeNodeIndex = GuideTree.NextDepthFirstNode(uTreeNodeIndex);
		}
	while (NULL_NEIGHBOR != uTreeNodeIndex);
	ProgressStepsDone();

	// Assemble the final alignment from the per-node results.
	// The C-style cast drops const from v for MakeRootMSABrenner.
	if (g_bBrenner)
		MakeRootMSABrenner((SeqVect &) v, GuideTree, ProgNodes, a);
	else
		MakeRootMSA(v, GuideTree, ProgNodes, a);

#if	VALIDATE
	{
	unsigned uRootNodeIndex = GuideTree.GetRootNodeIndex();
	const ProgNode &RootProgNode = ProgNodes[uRootNodeIndex];
	AssertMSAEq(a, RootProgNode.m_MSA);
	}
#endif

	delete[] Weights;
	return ProgNodes;
	}
Пример #5
0
// Fills DF with a k-mer based distance for every pair of sequences in v.
// Each sequence is converted to letters with CharToLetterEx; tuples of six
// consecutive positions are counted (a sequence of length L has L - 5 tuples).
//   Pass 1: for every pair (including each sequence with itself) accumulate
//           the number of shared tuples, taking min(count in seq1, count in
//           seq2) once per distinct tuple of seq2.
//   Pass 2: distance = min over both sequences of
//           3.0*(self count - shared count)/self count.
//   v   input sequences
//   DF  receives the distances; resized to v.Length() entries
// Sequences shorter than 5 residues are skipped in pass 1, so their shared
// counts stay 0. Uses the externally defined Count1 / Count2 tuple buffers.
void DistKmer6_6(const SeqVect &v, DistFunc &DF)
	{
	const unsigned uSeqCount = v.Length();

	DF.SetCount(uSeqCount);
	if (0 == uSeqCount)
		return;

// Initialize distance matrix to zero
	for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
		{
		DF.SetDist(uSeq1, uSeq1, 0);
		for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
			DF.SetDist(uSeq1, uSeq2, 0);
		}

// Convert to letters
	unsigned **Letters = new unsigned *[uSeqCount];
	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
		{
		Seq &s = *(v[uSeqIndex]);
		const unsigned uSeqLength = s.Length();
		unsigned *L = new unsigned[uSeqLength];
		Letters[uSeqIndex] = L;
		for (unsigned n = 0; n < uSeqLength; ++n)
			{
			char c = s[n];
			L[n] = CharToLetterEx(c);
			assert(L[n] < uResidueGroupCount);
			}
		}

// Square table of shared tuple counts, filled symmetrically in pass 1
	unsigned **uCommonTupleCount = new unsigned *[uSeqCount];
	for (unsigned n = 0; n < uSeqCount; ++n)
		{
		uCommonTupleCount[n] = new unsigned[uSeqCount];
		memset(uCommonTupleCount[n], 0, uSeqCount*sizeof(unsigned));
		}

// Pass 1 visits the diagonal as well (uSeq2 <= uSeq1): n*(n+1)/2 pairs
	const unsigned uPairCount1 = (uSeqCount*(uSeqCount + 1))/2;
	unsigned uCount = 0;
	for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
		{
		Seq &seq1 = *(v[uSeq1]);
		const unsigned uSeqLength1 = seq1.Length();
		if (uSeqLength1 < 5)
			continue;

		const unsigned uTupleCount1 = uSeqLength1 - 5;
		const unsigned *L1 = Letters[uSeq1];
		CountTuples(L1, uTupleCount1, Count1);
#if	TRACE
		{
		Log("Seq1=%d\n", uSeq1);
		Log("Groups:\n");
		for (unsigned n = 0; n < uSeqLength1; ++n)
			Log("%u", ResidueGroup[L1[n]]);
		Log("\n");

		Log("Tuples:\n");
		ListCount(Count1);
		}
#endif

		SetProgressDesc("K-mer dist pass 1");
		for (unsigned uSeq2 = 0; uSeq2 <= uSeq1; ++uSeq2)
			{
			if (0 == uCount%500)
				Progress(uCount, uPairCount1);
			++uCount;
			Seq &seq2 = *(v[uSeq2]);
			const unsigned uSeqLength2 = seq2.Length();
			if (uSeqLength2 < 5)
				{
				if (uSeq1 == uSeq2)
					DF.SetDist(uSeq1, uSeq2, 0);
				else
					DF.SetDist(uSeq1, uSeq2, 1);
				continue;
				}

		// First pass through seq 2 to count tuples
			const unsigned uTupleCount2 = uSeqLength2 - 5;
			const unsigned *L2 = Letters[uSeq2];
			CountTuples(L2, uTupleCount2, Count2);
#if	TRACE
			Log("Seq2=%d Counts=\n", uSeq2);
			ListCount(Count2);
#endif

		// Second pass to accumulate sum of shared tuples
		// MAFFT defines this as the sum over unique tuples
		// in seq2 of the minimum of the number of tuples found
		// in the two sequences.
			unsigned uSum = 0;
			for (unsigned n = 0; n < uTupleCount2; ++n)
				{
				const unsigned uTuple = GetTuple(L2, n);
				uSum += MIN(Count1[uTuple], Count2[uTuple]);

			// This is a hack to make sure each unique tuple counted only once.
				Count2[uTuple] = 0;
				}
#if	TRACE
			{
			Seq &s1 = *(v[uSeq1]);
			Seq &s2 = *(v[uSeq2]);
			const char *pName1 = s1.GetName();
			const char *pName2 = s2.GetName();
			Log("Common count %s(%d) - %s(%d) =%u\n",
			  pName1, uSeq1, pName2, uSeq2, uSum);
			}
#endif
			uCommonTupleCount[uSeq1][uSeq2] = uSum;
			uCommonTupleCount[uSeq2][uSeq1] = uSum;
			}
		}
	ProgressStepsDone();

// Pass 2 skips the diagonal (uSeq2 < uSeq1): n*(n-1)/2 pairs.
// uSeqCount >= 1 here, so uSeqCount - 1 cannot wrap.
	const unsigned uPairCount2 = (uSeqCount*(uSeqCount - 1))/2;
	uCount = 0;
	SetProgressDesc("K-mer dist pass 2");
	for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
		{
	// A self count of 0 is replaced by 1 to avoid dividing by zero below.
		double dCommonTupleCount11 = uCommonTupleCount[uSeq1][uSeq1];
		if (0 == dCommonTupleCount11)
			dCommonTupleCount11 = 1;

		DF.SetDist(uSeq1, uSeq1, 0);
		for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
			{
			if (0 == uCount%500)
				Progress(uCount, uPairCount2);
			++uCount;

			double dCommonTupleCount22 = uCommonTupleCount[uSeq2][uSeq2];
			if (0 == dCommonTupleCount22)
				dCommonTupleCount22 = 1;

			const double dDist1 = 3.0*(dCommonTupleCount11 - uCommonTupleCount[uSeq1][uSeq2])
			  /dCommonTupleCount11;
			const double dDist2 = 3.0*(dCommonTupleCount22 - uCommonTupleCount[uSeq1][uSeq2])
			  /dCommonTupleCount22;

		// dMinDist is the value used for tree-building in MAFFT
			const double dMinDist = MIN(dDist1, dDist2);
			DF.SetDist(uSeq1, uSeq2, (float) dMinDist);

			//const double dEstimatedPctId = TupleDistToEstimatedPctId(dMinDist);
			//g_dfPwId.SetDist(uSeq1, uSeq2, dEstimatedPctId);
		// **** TODO **** why does this make score slightly worse??
			//const double dKimuraDist = KimuraDist(dEstimatedPctId);
			//DF.SetDist(uSeq1, uSeq2, dKimuraDist);
			}
		}
	ProgressStepsDone();

// Release both per-sequence tables row by row, then the row-pointer arrays.
// (Freeing only the outer Letters array would leak every letter row.)
	for (unsigned n = 0; n < uSeqCount; ++n)
		{
		delete[] uCommonTupleCount[n];
		delete[] Letters[n];
		}
	delete[] uCommonTupleCount;
	delete[] Letters;
	}
Пример #6
0
void MakeRootMSA(const SeqVect &v, const Tree &GuideTree, ProgNode Nodes[],
  MSA &a)
	{
#if	TRACE
	Log("MakeRootMSA Tree=");
	GuideTree.LogMe();
#endif
	const unsigned uSeqCount = v.GetSeqCount();
	unsigned uColCount = uInsane;
	unsigned uSeqIndex = 0;
	const unsigned uTreeNodeCount = GuideTree.GetNodeCount();
	const unsigned uRootNodeIndex = GuideTree.GetRootNodeIndex();
	const PWPath &RootPath = Nodes[uRootNodeIndex].m_Path;
	const unsigned uRootColCount = RootPath.GetEdgeCount();
	const unsigned uEstringSize = uRootColCount + 1;
	short *Estring1 = new short[uEstringSize];
	short *Estring2 = new short[uEstringSize];
	SetProgressDesc("Root alignment");

	unsigned uTreeNodeIndex = GetFirstNodeIndex(GuideTree);
	do
		{
		Progress(uSeqIndex, uSeqCount);

		unsigned uId = GuideTree.GetLeafId(uTreeNodeIndex);
		const Seq &s = *(v[uId]);

		Seq sRootE;
		short *es = MakeRootSeqE(s, GuideTree, uTreeNodeIndex, Nodes, sRootE,
		  Estring1, Estring2);
		Nodes[uTreeNodeIndex].m_EstringL = EstringNewCopy(es);

#if	VALIDATE
		Seq sRoot;
		MakeRootSeq(s, GuideTree, uTreeNodeIndex, Nodes, sRoot);
		if (!sRoot.Eq(sRootE))
			{
			Log("sRoot=");
			sRoot.LogMe();
			Log("sRootE=");
			sRootE.LogMe();
			Quit("Root seqs differ");
			}
#if	TRACE
		Log("MakeRootSeq=\n");
		sRoot.LogMe();
#endif
#endif

		if (uInsane == uColCount)
			{
			uColCount = sRootE.Length();
			a.SetSize(uSeqCount, uColCount);
			}
		else
			{
			assert(uColCount == sRootE.Length());
			}
		a.SetSeqName(uSeqIndex, s.GetName());
		a.SetSeqId(uSeqIndex, uId);
		for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
			a.SetChar(uSeqIndex, uColIndex, sRootE[uColIndex]);
		++uSeqIndex;

		uTreeNodeIndex = GetNextNodeIndex(GuideTree, uTreeNodeIndex);
		}
	while (NULL_NEIGHBOR != uTreeNodeIndex);

	delete[] Estring1;
	delete[] Estring2;

	ProgressStepsDone();
	assert(uSeqIndex == uSeqCount);
	}
Пример #7
0
    // Runs up to uIters refinement iterations over the internal nodes of the
    // work pool's guide tree. Each iteration makes two passes through
    // RefineHeightPartsP (first with bRight = true, then bRight = false), and
    // the node order is reversed on odd iterations.
    //   msaIn       alignment to refine; stored in the work pool
    //   uIters      maximum number of iterations
    //   bLockLeft   unused
    //   bLockRight  unused
    // Stops early when a pass reports oscillation, when an iteration changes
    // nothing, or when the context is canceled. Calls Quit() if the tree is
    // not rooted. Returns false immediately for fewer than three sequences.
    // Return true if any changes made
    // NOTE(review): workpool->History and workpool->ptrbOscillating are left
    // pointing at locals of this function, and workpool->refineNodeStatuses at
    // freed memory, once it returns; presumably nothing reads them afterwards.
    bool RefineTask::RefineHorizP(MSA* msaIn, unsigned uIters, bool bLockLeft, bool bLockRight)
    {
        Q_UNUSED(bLockLeft); Q_UNUSED(bLockRight);

        MuscleContext *ctx = workpool->ctx;
        unsigned &g_uRefineHeightSubtree = ctx->refinehoriz.g_uRefineHeightSubtree;
        unsigned &g_uRefineHeightSubtreeTotal = ctx->refinehoriz.g_uRefineHeightSubtreeTotal;
        Tree &tree = workpool->GuideTree;
        workpool->msaIn = msaIn;
        workpool->uIters = uIters;

        if (!tree.IsRooted())
            Quit("RefineHeight: requires rooted tree");

        const unsigned uSeqCount = msaIn->GetSeqCount();
        if (uSeqCount < 3)
            return false;

        // Internal nodes in height order, plus the same list reversed.
        const unsigned uInternalNodeCount = uSeqCount - 1;
        unsigned *InternalNodeIndexes = new unsigned[uInternalNodeCount];
        unsigned *InternalNodeIndexesR = new unsigned[uInternalNodeCount];

        GetInternalNodesInHeightOrder(tree, InternalNodeIndexes);

        ScoreHistory History(uIters, 2*uSeqCount - 1);
        workpool->History = &History;
        workpool->uInternalNodeCount = uInternalNodeCount;
        bool bAnyChangesAnyIter = false;
        workpool->refineNodeStatuses = new RefineTreeNodeStatus[uInternalNodeCount];
        for (unsigned n = 0; n < uInternalNodeCount; ++n) {
            InternalNodeIndexesR[uInternalNodeCount - 1 - n] = InternalNodeIndexes[n];
            workpool->refineNodeStatuses[n] = RefineTreeNodeStatus_Available;
        }

        for (unsigned uIter = 0; uIter < uIters && !ctx->isCanceled(); ++uIter)
        {
            workpool->uIter = uIter;

            bool bAnyChangesThisIter = false;
            IncIter();
            SetProgressDesc("Refine biparts");
            g_uRefineHeightSubtree = 0;
            g_uRefineHeightSubtreeTotal = uInternalNodeCount*2 - 1;

            // Odd iterations walk the nodes in reverse height order.
            const bool bReverse = (uIter%2 != 0);
            workpool->bReversed = bReverse;
            workpool->InternalNodeIndexes =
                bReverse ? InternalNodeIndexesR : InternalNodeIndexes;

            // Must start out false: if cancellation makes the pass loop below
            // run zero times, the flag is still read after the loop.
            bool bOscillating = false;
            workpool->ptrbOscillating = &bOscillating;
            for (unsigned i = 0; i < 2 && !ctx->isCanceled(); ++i)
            {
                bool bAnyChanges = false;
                // Pass 0 works with bRight = true, pass 1 with bRight = false.
                workpool->bRight = (0 == i);
                workpool->reset();
                RefineHeightPartsP(&bAnyChanges);
                if (bOscillating)
                    break;
                if (bAnyChanges)
                {
                    bAnyChangesThisIter = true;
                    bAnyChangesAnyIter = true;
                }
            }

            // Called exactly once per iteration, whether or not the pass loop
            // was cut short by oscillation.
            ProgressStepsDone();
            if (bOscillating)
                break;

            if (!bAnyChangesThisIter)
                break;
        }

        delete[] InternalNodeIndexes;
        delete[] InternalNodeIndexesR;
        delete[] workpool->refineNodeStatuses;

        return bAnyChangesAnyIter;
    }
Пример #8
0
// WARNING: Sequences MUST be stripped of gaps and upper case!
//
// Fills DF with a triplet-based score for every pair of sequences in v.
// A "word" is three consecutive residues whose letters (CharToLetterEx) are
// all < 20, encoded as l1 + 20*l2 + 400*l3. For every word, each pair of
// sequences containing it contributes min(count in seq1, count in seq2) to
// the pair's running total. The total is finally divided by
// (shorter length - 2). Pairs where a sequence is shorter than 3, or where
// no word is shared, are set to 1.0.
//   v   input sequences
//   DF  receives the values; resized to v.Length() entries
// Uses the externally declared TripleCounts table, which is allocated here
// and released (and reset to 0) before returning. Calls Quit() when an
// allocation fails.
// NOTE(review): per-sequence word counts are stored as unsigned short, so a
// word occurring more than 65535 times in one sequence would wrap.
void DistKmer20_3(const SeqVect &v, DistFunc &DF)
{
    const unsigned uSeqCount = v.Length();

    DF.SetCount(uSeqCount);
    if (0 == uSeqCount)
        return;

    // Start from an all-zero matrix; the word loop below accumulates into it.
    for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
    {
        DF.SetDist(uSeq1, uSeq1, 0);
        for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
            DF.SetDist(uSeq1, uSeq2, 0);
    }

    const unsigned uTripleArrayBytes = TRIPLE_COUNT*sizeof(TripleCount);
    TripleCounts = (TripleCount *) malloc(uTripleArrayBytes);
    if (0 == TripleCounts)
        Quit("Not enough memory (TripleCounts)");
    memset(TripleCounts, 0, uTripleArrayBytes);

    // One per-sequence counter array for every possible word.
    const unsigned uCountsBytes = uSeqCount*sizeof(unsigned short);
    for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord)
    {
        TripleCount &tc = *(TripleCounts + uWord);
        tc.m_Counts = (unsigned short *) malloc(uCountsBytes);
        if (0 == tc.m_Counts)
            Quit("Not enough memory (TripleCount.m_Counts)");
        memset(tc.m_Counts, 0, uCountsBytes);
    }

    // Count the words of every sequence.
    for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
    {
        Seq &s = *(v[uSeqIndex]);
        const unsigned uSeqLength = s.Length();
        // "uPos + 2 < uSeqLength" rather than "uPos < uSeqLength - 2": the
        // unsigned subtraction wraps for sequences shorter than 2 residues.
        for (unsigned uPos = 0; uPos + 2 < uSeqLength; ++uPos)
        {
            const unsigned uLetter1 = CharToLetterEx(s[uPos]);
            if (uLetter1 >= 20)
                continue;
            const unsigned uLetter2 = CharToLetterEx(s[uPos+1]);
            if (uLetter2 >= 20)
                continue;
            const unsigned uLetter3 = CharToLetterEx(s[uPos+2]);
            if (uLetter3 >= 20)
                continue;

            const unsigned uWord = uLetter1 + uLetter2*20 + uLetter3*20*20;
            assert(uWord < TRIPLE_COUNT);

            TripleCount &tc = *(TripleCounts + uWord);
            // First occurrence in this sequence: one more sequence has the word.
            const unsigned uOldCount = tc.m_Counts[uSeqIndex];
            if (0 == uOldCount)
                ++(tc.m_uSeqCount);

            ++(tc.m_Counts[uSeqIndex]);
        }
    }

#if TRACE
    {
        Log("TripleCounts\n");
        unsigned uGrandTotal = 0;
        for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord)
        {
            const TripleCount &tc = *(TripleCounts + uWord);
            if (0 == tc.m_uSeqCount)
                continue;

            const unsigned uLetter3 = uWord/(20*20);
            const unsigned uLetter2 = (uWord - uLetter3*20*20)/20;
            const unsigned uLetter1 = uWord%20;
            Log("Word %6u %c%c%c   %6u",
                uWord,
                LetterToCharAmino(uLetter1),
                LetterToCharAmino(uLetter2),
                LetterToCharAmino(uLetter3),
                tc.m_uSeqCount);

            unsigned uSeqCountWithThisWord = 0;
            for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
            {
                const unsigned uCount = tc.m_Counts[uSeqIndex];
                if (uCount > 0)
                {
                    ++uSeqCountWithThisWord;
                    Log(" %u=%u", uSeqIndex, uCount);
                    uGrandTotal += uCount;
                }
            }
            if (uSeqCountWithThisWord != tc.m_uSeqCount)
                Log(" *** SQ ERROR *** %u %u", tc.m_uSeqCount, uSeqCountWithThisWord);
            Log("\n");
        }

        unsigned uTotalBySeqLength = 0;
        for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
        {
            Seq &s = *(v[uSeqIndex]);
            const unsigned uSeqLength = s.Length();
            // Sequences shorter than 3 have no triplets (and would wrap).
            if (uSeqLength > 2)
                uTotalBySeqLength += uSeqLength - 2;
        }
        if (uGrandTotal != uTotalBySeqLength)
            Log("*** TOTALS DISAGREE *** %u %u\n", uGrandTotal, uTotalBySeqLength);
    }
#endif

    // Scratch list of the sequences containing the current word. Indices are
    // kept as unsigned so they are not truncated above 65535 sequences.
    const unsigned uSeqListBytes = uSeqCount*sizeof(unsigned);
    unsigned *SeqList = (unsigned *) malloc(uSeqListBytes);
    if (0 == SeqList)
        Quit("Not enough memory (SeqList)");

    for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord)
    {
        const TripleCount &tc = *(TripleCounts + uWord);
        if (0 == tc.m_uSeqCount)
            continue;

        unsigned uSeqCountFound = 0;
        memset(SeqList, 0, uSeqListBytes);

        for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
        {
            if (tc.m_Counts[uSeqIndex] > 0)
            {
                SeqList[uSeqCountFound] = uSeqIndex;
                ++uSeqCountFound;
                if (uSeqCountFound == tc.m_uSeqCount)
                    break;
            }
        }
        assert(uSeqCountFound == tc.m_uSeqCount);

        // Add min(count1, count2) to every pair of sequences sharing this word.
        for (unsigned uSeq1 = 0; uSeq1 < uSeqCountFound; ++uSeq1)
        {
            const unsigned uSeqIndex1 = SeqList[uSeq1];
            const unsigned uCount1 = tc.m_Counts[uSeqIndex1];
            for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
            {
                const unsigned uSeqIndex2 = SeqList[uSeq2];
                const unsigned uCount2 = tc.m_Counts[uSeqIndex2];
                const unsigned uMinCount = uCount1 < uCount2 ? uCount1 : uCount2;
                const double d = DF.GetDist(uSeqIndex1, uSeqIndex2);
                DF.SetDist(uSeqIndex1, uSeqIndex2, (float) (d + uMinCount));
            }
        }
    }

    // Everything above came from malloc, so release it with free: the scratch
    // list, every per-word counter array, then the table itself.
    free(SeqList);
    for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord)
        free(TripleCounts[uWord].m_Counts);
    free(TripleCounts);
    TripleCounts = 0;

    // Normalize the accumulated counts by the number of triplets in the
    // shorter sequence of each pair.
    unsigned uDone = 0;
    const unsigned uTotal = (uSeqCount*(uSeqCount - 1))/2;
    for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
    {
        DF.SetDist(uSeq1, uSeq1, 0.0);

        const Seq &s1 = *(v[uSeq1]);
        const unsigned uLength1 = s1.Length();

        for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
        {
            // Count every pair, including those that take an early "continue".
            if (uDone%1000 == 0)
                Progress(uDone, uTotal);
            ++uDone;

            const Seq &s2 = *(v[uSeq2]);
            const unsigned uLength2 = s2.Length();
            unsigned uMinLength = uLength1 < uLength2 ? uLength1 : uLength2;
            if (uMinLength < 3)
            {
                DF.SetDist(uSeq1, uSeq2, 1.0);
                continue;
            }

            const double dTripleCount = DF.GetDist(uSeq1, uSeq2);
            if (dTripleCount == 0)
            {
                DF.SetDist(uSeq1, uSeq2, 1.0);
                continue;
            }
            double dNormalizedTripletScore = dTripleCount/(uMinLength - 2);
            //double dEstimatedPairwiseIdentity = exp(0.3912*log(dNormalizedTripletScore));
            //if (dEstimatedPairwiseIdentity > 1)
            //	dEstimatedPairwiseIdentity = 1;
//			DF.SetDist(uSeq1, uSeq2, (float) (1.0 - dEstimatedPairwiseIdentity));
            DF.SetDist(uSeq1, uSeq2, (float) dNormalizedTripletScore);

#if	TRACE
            {
                // Logs the value actually stored; the estimated-identity
                // variable is commented out above and no longer exists.
                Log("%s - %s  Triplet count = %g  Lengths %u, %u Normalized score = %g\n",
                    s1.GetName(), s2.GetName(), dTripleCount, uLength1, uLength2,
                    dNormalizedTripletScore);
            }
#endif
        }
    }
    ProgressStepsDone();
}
Пример #9
0
void ProgressiveAlign(const SeqVect &v, const Tree &GuideTree, MSA &a)
	{
	assert(GuideTree.IsRooted());

#if	TRACE
	Log("GuideTree:\n");
	GuideTree.LogMe();
#endif

	const unsigned uSeqCount = v.Length();
	const unsigned uNodeCount = 2*uSeqCount - 1;

	ProgNode *ProgNodes = new ProgNode[uNodeCount];

	unsigned uJoin = 0;
	unsigned uTreeNodeIndex = GuideTree.FirstDepthFirstNode();
	SetProgressDesc("Align node");
	do
		{
		if (GuideTree.IsLeaf(uTreeNodeIndex))
			{
			if (uTreeNodeIndex >= uNodeCount)
				Quit("TreeNodeIndex=%u NodeCount=%u\n", uTreeNodeIndex, uNodeCount);
			ProgNode &Node = ProgNodes[uTreeNodeIndex];
			unsigned uId = GuideTree.GetLeafId(uTreeNodeIndex);
			if (uId >= uSeqCount)
				Quit("Seq index out of range");
			const Seq &s = *(v[uId]);
			Node.m_MSA.FromSeq(s);
			Node.m_MSA.SetSeqId(0, uId);
			Node.m_uLength = Node.m_MSA.GetColCount();
			}
		else
			{
			Progress(uJoin, uSeqCount - 1);
			++uJoin;

			const unsigned uMergeNodeIndex = uTreeNodeIndex;
			ProgNode &Parent = ProgNodes[uMergeNodeIndex];

			const unsigned uLeft = GuideTree.GetLeft(uTreeNodeIndex);
			const unsigned uRight = GuideTree.GetRight(uTreeNodeIndex);

			ProgNode &Node1 = ProgNodes[uLeft];
			ProgNode &Node2 = ProgNodes[uRight];

			PWPath Path;
			AlignTwoMSAs(Node1.m_MSA, Node2.m_MSA, Parent.m_MSA, Path);
			Parent.m_uLength = Parent.m_MSA.GetColCount();

			Node1.m_MSA.Clear();
			Node2.m_MSA.Clear();
			}
		uTreeNodeIndex = GuideTree.NextDepthFirstNode(uTreeNodeIndex);
		}
	while (NULL_NEIGHBOR != uTreeNodeIndex);
	ProgressStepsDone();

	unsigned uRootNodeIndex = GuideTree.GetRootNodeIndex();
	const ProgNode &RootProgNode = ProgNodes[uRootNodeIndex];
	a.Copy(RootProgNode.m_MSA);

	delete[] ProgNodes;
	ProgNodes = 0;
	}
Пример #10
0
// Top-level driver: reads sequences from the configured input FASTA file,
// builds a guide tree, splits it into subfamilies (SubFam), aligns each
// subfamily with AlignSubFam, merges the subfamily alignments pairwise up the
// tree with AlignTwoMSAs, and writes the root alignment to the configured
// output file. All settings are read from the g_* option objects.
// Calls Quit() when the input contains no sequences or the sequence type
// option is invalid.
void ProgAlignSubFams()
	{
	// NOTE(review): msaOut is only ever filled on the early-return paths below
	// and is never written to the output file.
	MSA msaOut;

	SetOutputFileName(g_pstrOutFileName.get());
	SetInputFileName(g_pstrInFileName.get());

	SetMaxIters(g_uMaxIters.get());
	SetSeqWeightMethod(g_SeqWeight1.get());

	TextFile fileIn(g_pstrInFileName.get());
	SeqVect v;
	v.FromFASTAFile(fileIn);
	const unsigned uSeqCount = v.Length();

	if (0 == uSeqCount)
		Quit("No sequences in input file");

	// Choose the alphabet from the sequence-type option (guessing for Auto).
	ALPHA Alpha = ALPHA_Undefined;
	switch (g_SeqType.get())
		{
	case SEQTYPE_Auto:
		Alpha = v.GuessAlpha();
		break;

	case SEQTYPE_Protein:
		Alpha = ALPHA_Amino;
		break;

	case SEQTYPE_DNA:
		Alpha = ALPHA_DNA;
		break;

	case SEQTYPE_RNA:
		Alpha = ALPHA_RNA;
		break;

	default:
		Quit("Invalid seq type");
		}
	SetAlpha(Alpha);
	v.FixAlpha();

	// Optional user-supplied score matrix. If the MUSCLE_MXPATH environment
	// variable is set, the file name is looked up below that directory.
	PTR_SCOREMATRIX UserMatrix = 0;
	if (0 != g_pstrMatrixFileName.get())
		{
		const char *FileName = g_pstrMatrixFileName.get();
		const char *Path = getenv("MUSCLE_MXPATH");
		if (Path != 0)
			{
			// Room for Path + '/' + FileName + terminating NUL.
			// NOTE(review): NewFileName is never delete[]'d in this function.
			size_t n = strlen(Path) + 1 + strlen(FileName) + 1;
			char *NewFileName = new char[n];
			sprintf(NewFileName, "%s/%s", Path, FileName);
			FileName = NewFileName;
			}
		TextFile File(FileName);
		UserMatrix = ReadMx(File);
		g_Alpha = ALPHA_Amino;
		g_PPScore = PPSCORE_SP;
		}

	SetPPScore();

	// Installed after SetPPScore() so the user matrix is the one left in place.
	if (0 != UserMatrix)
		g_ptrScoreMatrix = UserMatrix;

	// Nucleotide input switches the profile score and the first distance measure.
	if (ALPHA_DNA == Alpha || ALPHA_RNA == Alpha)
		{
		SetPPScore(PPSCORE_SPN);
		g_Distance1.get() = DISTANCE_Kmer4_6;
		}

	// Longest and total sequence length, for SetSeqStats below.
	unsigned uMaxL = 0;
	unsigned uTotL = 0;
	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
		{
		unsigned L = v.GetSeq(uSeqIndex).Length();
		uTotL += L;
		if (L > uMaxL)
			uMaxL = L;
		}

	SetIter(1);
	g_bDiags.get() = g_bDiags1.get();
	SetSeqStats(uSeqCount, uMaxL, uTotL/uSeqCount);

	SetMuscleSeqVect(v);

	MSA::SetIdCount(uSeqCount);

// Initialize sequence ids.
// From this point on, ids must somehow propagate from here.
	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
		v.SetSeqId(uSeqIndex, uSeqIndex);

	if (uSeqCount > 1)
		MHackStart(v);

	// NOTE(review): this repeats the zero-count test made right after reading
	// the input; it can only be reached if Quit() returns - confirm.
	if (0 == uSeqCount)
		{
		msaOut.Clear();
		return;
		}

	// A single amino-acid sequence: returns here, before the output file is
	// opened further down.
	if (1 == uSeqCount && ALPHA_Amino == Alpha)
		{
		const Seq &s = v.GetSeq(0);
		msaOut.FromSeq(s);
		return;
		}

	// First guide tree, built directly from the unaligned sequences.
	Tree GuideTree;
	TreeFromSeqVect(v, GuideTree, g_Cluster1.get(), g_Distance1.get(), g_Root1.get());
	SetMuscleTree(GuideTree);

	// Initial progressive alignment; in this function it is only used to
	// rebuild the guide tree below.
	MSA msa;
	if (g_bLow.get())
		{
		ProgNode *ProgNodes = 0;
		ProgNodes = ProgressiveAlignE(v, GuideTree, msa);
		delete[] ProgNodes;
		}
	else
		ProgressiveAlign(v, GuideTree, msa);
	SetCurrentAlignment(msa);
	// Second guide tree, estimated from the initial alignment.
	TreeFromMSA(msa, GuideTree, g_Cluster2.get(), g_Distance2.get(), g_Root2.get());
	SetMuscleTree(GuideTree);

	// SubFams receives the node index of each subfamily root.
	unsigned *SubFams = new unsigned[uSeqCount];
	unsigned uSubFamCount;
	SubFam(GuideTree, g_uMaxSubFamCount.get(), SubFams, &uSubFamCount);

	SetProgressDesc("Align node");
	const unsigned uNodeCount = 2*uSeqCount - 1;

	ProgNode *ProgNodes = new ProgNode[uNodeCount];
	// NodeIsSubFam[i]: node i is a subfamily root.
	// NodeInSubFam[i]: flag set by SetInFam for the subfamily rooted at a
	// subfamily node (presumably marks the nodes belonging to it - confirm).
	bool *NodeIsSubFam = new bool[uNodeCount];
	bool *NodeInSubFam = new bool[uNodeCount];

	for (unsigned i = 0; i < uNodeCount; ++i)
		{
		NodeIsSubFam[i] = false;
		NodeInSubFam[i] = false;
		}

	for (unsigned i = 0; i < uSubFamCount; ++i)
		{
		unsigned uNodeIndex = SubFams[i];
		assert(uNodeIndex < uNodeCount);
		NodeIsSubFam[uNodeIndex] = true;
		SetInFam(GuideTree, uNodeIndex, NodeInSubFam);
		}

	// Depth-first walk: subfamily roots are aligned as a unit, nodes not
	// flagged in NodeInSubFam merge their two children, all others are skipped.
	unsigned uJoin = 0;
	unsigned uTreeNodeIndex = GuideTree.FirstDepthFirstNode();
	do
		{
		if (NodeIsSubFam[uTreeNodeIndex])
			{
#if	TRACE
			Log("Node %d: align subfam\n", uTreeNodeIndex);
#endif
			ProgNode &Node = ProgNodes[uTreeNodeIndex];
			AlignSubFam(v, GuideTree, uTreeNodeIndex, Node.m_MSA);
			Node.m_uLength = Node.m_MSA.GetColCount();
			}
		else if (!NodeInSubFam[uTreeNodeIndex])
			{
#if	TRACE
			Log("Node %d: align two subfams\n", uTreeNodeIndex);
#endif
			Progress(uJoin, uSubFamCount - 1);
			++uJoin;

			const unsigned uMergeNodeIndex = uTreeNodeIndex;
			ProgNode &Parent = ProgNodes[uMergeNodeIndex];

			const unsigned uLeft = GuideTree.GetLeft(uTreeNodeIndex);
			const unsigned uRight = GuideTree.GetRight(uTreeNodeIndex);

			ProgNode &Node1 = ProgNodes[uLeft];
			ProgNode &Node2 = ProgNodes[uRight];

			PWPath Path;
			AlignTwoMSAs(Node1.m_MSA, Node2.m_MSA, Parent.m_MSA, Path);
			Parent.m_uLength = Parent.m_MSA.GetColCount();

			// The children's alignments are no longer needed.
			Node1.m_MSA.Clear();
			Node2.m_MSA.Clear();
			}
		else
			{
#if	TRACE
			Log("Node %d: in subfam\n", uTreeNodeIndex);
#endif
			;
			}
		uTreeNodeIndex = GuideTree.NextDepthFirstNode(uTreeNodeIndex);
		}
	while (NULL_NEIGHBOR != uTreeNodeIndex);
	ProgressStepsDone();

	// The root node holds the final alignment; write it to the output file.
	unsigned uRootNodeIndex = GuideTree.GetRootNodeIndex();
	ProgNode &RootProgNode = ProgNodes[uRootNodeIndex];

	TextFile fOut(g_pstrOutFileName.get(), true);
	MHackEnd(RootProgNode.m_MSA);
	RootProgNode.m_MSA.ToFile(fOut);

	delete[] NodeInSubFam;
	delete[] NodeIsSubFam;
	delete[] ProgNodes;
	delete[] SubFams;

	ProgNodes = 0;
	NodeInSubFam = 0;
	NodeIsSubFam = 0;
	SubFams = 0;
	}