Beispiel #1
0
void DistUnaligned(const SeqVect &v, DISTANCE DistMethod, DistFunc &DF)
	{
	const unsigned uSeqCount = v.Length();

	switch (DistMethod)
		{
	case DISTANCE_Kmer6_6:
		DistKmer6_6(v, DF);
		break;

	case DISTANCE_Kmer20_3:
		DistKmer20_3(v, DF);
		break;

	case DISTANCE_Kmer20_4:
		FastDistKmer(v, DF);
		break;

	case DISTANCE_Kbit20_3:
		DistKbit20_3(v, DF);
		break;

	case DISTANCE_Kmer4_6:
		DistKmer4_6(v, DF);
		break;

	case DISTANCE_PWKimura:
		DistPWKimura(v, DF);
		break;

	case DISTANCE_PWScoreDist:
		DistPWScoreDist(v, DF);
		break;

	default:
		Quit("DistUnaligned, unsupported distance method %d", DistMethod);
		}

//	const char **SeqNames = (const char **) malloc(uSeqCount*sizeof(char *));
	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
		{
		const Seq &s = *(v[uSeqIndex]);

		const char *ptrName = s.GetName();
		unsigned uId = s.GetId();

		DF.SetName(uSeqIndex, ptrName);
		DF.SetId(uSeqIndex, uId);
		}
	}
Beispiel #2
0
// Clusters the sequences of this alignment by pairwise identity and
// derives node weights from the resulting tree.
//   BlosumCluster  receives the cluster tree built from the distances
// The return value is the group count, i.e. the effective number of
// distinctly different sequences, as computed by SetBLOSUMNodeWeight().
unsigned MSA::CalcBLOSUMWeights(ClusterTree &BlosumCluster) const
	{
// Distance between two sequences is 1 - (pairwise identity); only the
// pairs with uRow < uCol are written here.
	DistFunc Dist;
	const unsigned uN = GetSeqCount();
	Dist.SetCount(uN);
	for (unsigned uRow = 0; uRow < uN; ++uRow)
		{
		for (unsigned uCol = uRow + 1; uCol < uN; ++uCol)
			{
			const double dIdentity = GetPctIdentityPair(uRow, uCol);
			assert(dIdentity >= 0.0 && dIdentity <= 1.0);

			const float fDist = (float) (1.0 - dIdentity);
			Dist.SetDist(uRow, uCol, fDist);
			}
		}

// Build the cluster tree from the distance function.
	BlosumCluster.Create(Dist);

// The value returned is HMMer's "effective sequence count".
	return SetBLOSUMNodeWeight(BlosumCluster.GetRoot(), 1.0 - BLOSUM_DIST);
	}
Beispiel #3
0
// Fills DF with pairwise distances obtained by aligning every pair of
// sequences in v and passing the two-row alignment to GetScoreDist().
//   v   sequences to compare
//   DF  resized to v.Length(); receives one distance for each pair with
//       uSeqIndex2 < uSeqIndex1 (diagonal entries are not written here)
// The sequence weighting method is switched to SEQWEIGHT_Henikoff for the
// duration of the call and restored to its previous value before returning.
void DistPWScoreDist(const SeqVect &v, DistFunc &DF)
	{
	SEQWEIGHT SeqWeightSave = GetSeqWeightMethod();
	SetSeqWeightMethod(SEQWEIGHT_Henikoff);

	const unsigned uSeqCount = v.Length();
	DF.SetCount(uSeqCount);

// Number of pairs actually visited below (uSeqIndex2 < uSeqIndex1), so
// that the reported progress can reach its total. Unsigned arithmetic
// makes this 0 for an empty input.
	const unsigned uPairCount = (uSeqCount*(uSeqCount - 1))/2;
	unsigned uCount = 0;
	SetProgressDesc("PW ScoreDist");
	for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1)
		{
	// Wrap the first sequence once; it is reused for every partner.
		const Seq &s1 = v.GetSeq(uSeqIndex1);
		MSA msa1;
		msa1.FromSeq(s1);
		for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqIndex1; ++uSeqIndex2)
			{
		// Report progress every 20 pairs.
			if (0 == uCount%20)
				Progress(uCount, uPairCount);
			++uCount;
			const Seq &s2 = v.GetSeq(uSeqIndex2);
			MSA msa2;
			msa2.FromSeq(s2);

		// Align the two single-sequence MSAs; msaOut holds the result.
			PWPath Path;
			MSA msaOut;
			AlignTwoMSAs(msa1, msa2, msaOut, Path, false, false);

		// NOTE(review): 0 and 1 are presumably the row indexes of the
		// two sequences inside msaOut -- confirm against GetScoreDist.
			float d = (float) GetScoreDist(msaOut, 0, 1);
			DF.SetDist(uSeqIndex1, uSeqIndex2, d);
			}
		}
	ProgressStepsDone();

	SetSeqWeightMethod(SeqWeightSave);
	}
Beispiel #4
0
// Fills DF with k-mer based pairwise distances for the sequences in v.
//   v   sequences to compare
//   DF  resized to v.Length(); every entry on or below the diagonal is
//       written (diagonal = 0)
//
// Pass 1 counts, for every pair, the tuples the two sequences share: the
// sum over the unique tuples of the second sequence of the minimum of the
// two per-sequence tuple counts. A sequence of length L contributes L - 5
// tuples; sequences shorter than 5 contribute none.
// Pass 2 turns the counts into distances:
//   d(i,j) = min(3*(c(i,i) - c(i,j))/c(i,i), 3*(c(j,j) - c(i,j))/c(j,j))
// where a zero self-count is replaced by 1 to avoid dividing by zero.
//
// Uses the tuple tables Count1 and Count2 and the helpers CountTuples()
// and GetTuple(), all defined outside this function.
void DistKmer6_6(const SeqVect &v, DistFunc &DF)
	{
	const unsigned uSeqCount = v.Length();

	DF.SetCount(uSeqCount);
	if (0 == uSeqCount)
		return;

// Initialize distance matrix to zero
	for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
		{
		DF.SetDist(uSeq1, uSeq1, 0);
		for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
			DF.SetDist(uSeq1, uSeq2, 0);
		}

// Convert to letters
	unsigned **Letters = new unsigned *[uSeqCount];
	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
		{
		Seq &s = *(v[uSeqIndex]);
		const unsigned uSeqLength = s.Length();
		unsigned *L = new unsigned[uSeqLength];
		Letters[uSeqIndex] = L;
		for (unsigned n = 0; n < uSeqLength; ++n)
			{
			char c = s[n];
			L[n] = CharToLetterEx(c);
			assert(L[n] < uResidueGroupCount);
			}
		}

// Square table of shared-tuple counts, zero-initialized.
	unsigned **uCommonTupleCount = new unsigned *[uSeqCount];
	for (unsigned n = 0; n < uSeqCount; ++n)
		{
		uCommonTupleCount[n] = new unsigned[uSeqCount];
		memset(uCommonTupleCount[n], 0, uSeqCount*sizeof(unsigned));
		}

// Pass 1 visits every pair including the diagonal (uSeq2 <= uSeq1).
	const unsigned uPairCount = (uSeqCount*(uSeqCount + 1))/2;
	unsigned uCount = 0;
	for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
		{
		Seq &seq1 = *(v[uSeq1]);
		const unsigned uSeqLength1 = seq1.Length();
		if (uSeqLength1 < 5)
			continue;

		const unsigned uTupleCount = uSeqLength1 - 5;
		const unsigned *L = Letters[uSeq1];
		CountTuples(L, uTupleCount, Count1);
#if	TRACE
		{
		Log("Seq1=%d\n", uSeq1);
		Log("Groups:\n");
		for (unsigned n = 0; n < uSeqLength1; ++n)
			Log("%u", ResidueGroup[L[n]]);
		Log("\n");

		Log("Tuples:\n");
		ListCount(Count1);
		}
#endif

		SetProgressDesc("K-mer dist pass 1");
		for (unsigned uSeq2 = 0; uSeq2 <= uSeq1; ++uSeq2)
			{
			if (0 == uCount%500)
				Progress(uCount, uPairCount);
			++uCount;
			Seq &seq2 = *(v[uSeq2]);
			const unsigned uSeqLength2 = seq2.Length();
			if (uSeqLength2 < 5)
				{
				if (uSeq1 == uSeq2)
					DF.SetDist(uSeq1, uSeq2, 0);
				else
					DF.SetDist(uSeq1, uSeq2, 1);
				continue;
				}

		// First pass through seq 2 to count tuples
			const unsigned uTupleCount = uSeqLength2 - 5;
			const unsigned *L = Letters[uSeq2];
			CountTuples(L, uTupleCount, Count2);
#if	TRACE
			Log("Seq2=%d Counts=\n", uSeq2);
			ListCount(Count2);
#endif

		// Second pass to accumulate sum of shared tuples
		// MAFFT defines this as the sum over unique tuples
		// in seq2 of the minimum of the number of tuples found
		// in the two sequences.
			unsigned uSum = 0;
			for (unsigned n = 0; n < uTupleCount; ++n)
				{
				const unsigned uTuple = GetTuple(L, n);
				uSum += MIN(Count1[uTuple], Count2[uTuple]);

			// This is a hack to make sure each unique tuple counted only once.
				Count2[uTuple] = 0;
				}
#if	TRACE
			{
			Seq &s1 = *(v[uSeq1]);
			Seq &s2 = *(v[uSeq2]);
			const char *pName1 = s1.GetName();
			const char *pName2 = s2.GetName();
			Log("Common count %s(%d) - %s(%d) =%u\n",
			  pName1, uSeq1, pName2, uSeq2, uSum);
			}
#endif
			uCommonTupleCount[uSeq1][uSeq2] = uSum;
			uCommonTupleCount[uSeq2][uSeq1] = uSum;
			}
		}
	ProgressStepsDone();

// Pass 2 visits only the pairs below the diagonal (uSeq2 < uSeq1), so it
// reports progress against its own total.
	const unsigned uPairCount2 = (uSeqCount*(uSeqCount - 1))/2;
	uCount = 0;
	SetProgressDesc("K-mer dist pass 2");
	for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
		{
	// A zero self-count is replaced by 1 so the divisions below are safe.
		double dCommonTupleCount11 = uCommonTupleCount[uSeq1][uSeq1];
		if (0 == dCommonTupleCount11)
			dCommonTupleCount11 = 1;

		DF.SetDist(uSeq1, uSeq1, 0);
		for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
			{
			if (0 == uCount%500)
				Progress(uCount, uPairCount2);
			++uCount;

			double dCommonTupleCount22 = uCommonTupleCount[uSeq2][uSeq2];
			if (0 == dCommonTupleCount22)
				dCommonTupleCount22 = 1;

			const double dDist1 = 3.0*(dCommonTupleCount11 - uCommonTupleCount[uSeq1][uSeq2])
			  /dCommonTupleCount11;
			const double dDist2 = 3.0*(dCommonTupleCount22 - uCommonTupleCount[uSeq1][uSeq2])
			  /dCommonTupleCount22;

		// dMinDist is the value used for tree-building in MAFFT
			const double dMinDist = MIN(dDist1, dDist2);
			DF.SetDist(uSeq1, uSeq2, (float) dMinDist);

			//const double dEstimatedPctId = TupleDistToEstimatedPctId(dMinDist);
			//g_dfPwId.SetDist(uSeq1, uSeq2, dEstimatedPctId);
		// **** TODO **** why does this make score slightly worse??
			//const double dKimuraDist = KimuraDist(dEstimatedPctId);
			//DF.SetDist(uSeq1, uSeq2, dKimuraDist);
			}
		}
	ProgressStepsDone();

// Release the per-sequence rows before the arrays that point at them.
	for (unsigned n = 0; n < uSeqCount; ++n)
		{
		delete[] uCommonTupleCount[n];
		delete[] Letters[n];
		}
	delete[] uCommonTupleCount;
	delete[] Letters;
	}
Beispiel #5
0
// WARNING: Sequences MUST be stripped of gaps and upper case!
//
// Fills DF with a triplet-based score for every pair of sequences in v.
//   v   sequences to compare
//   DF  resized to v.Length(); every entry on or below the diagonal is
//       written (diagonal = 0)
//
// Step 1 counts, per sequence, how often each word of three letters (all
// three letter codes below 20) occurs. Step 2 adds, for every word and
// every pair of sequences containing it, the smaller of the two counts to
// the pair's entry in DF. Step 3 replaces each entry by
//   (shared triple count) / (length of the shorter sequence - 2),
// or by 1.0 when the shorter sequence has fewer than 3 letters or the
// pair shares no triple.
//
// Uses the file-level TripleCounts table as scratch storage; it is
// allocated here, released before returning and left null.
void DistKmer20_3(const SeqVect &v, DistFunc &DF)
{
    const unsigned uSeqCount = v.Length();

    DF.SetCount(uSeqCount);
    if (0 == uSeqCount)
        return;
    for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
    {
        DF.SetDist(uSeq1, uSeq1, 0);
        for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
            DF.SetDist(uSeq1, uSeq2, 0);
    }

    const unsigned uTripleArrayBytes = TRIPLE_COUNT*sizeof(TripleCount);
    TripleCounts = (TripleCount *) malloc(uTripleArrayBytes);
    if (0 == TripleCounts)
        Quit("Not enough memory (TripleCounts)");
    memset(TripleCounts, 0, uTripleArrayBytes);

    // One per-sequence counter array for every possible word.
    for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord)
    {
        TripleCount &tc = *(TripleCounts + uWord);
        const unsigned uBytes = uSeqCount*sizeof(unsigned short);
        tc.m_Counts = (unsigned short *) malloc(uBytes);
        if (0 == tc.m_Counts)
            Quit("Not enough memory (TripleCounts)");
        memset(tc.m_Counts, 0, uBytes);
    }

    for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
    {
        Seq &s = *(v[uSeqIndex]);
        const unsigned uSeqLength = s.Length();
        // "uPos + 2 < uSeqLength" rather than "uPos < uSeqLength - 2": the
        // latter wraps around for sequences shorter than two letters.
        for (unsigned uPos = 0; uPos + 2 < uSeqLength; ++uPos)
        {
            const unsigned uLetter1 = CharToLetterEx(s[uPos]);
            if (uLetter1 >= 20)
                continue;
            const unsigned uLetter2 = CharToLetterEx(s[uPos+1]);
            if (uLetter2 >= 20)
                continue;
            const unsigned uLetter3 = CharToLetterEx(s[uPos+2]);
            if (uLetter3 >= 20)
                continue;

            const unsigned uWord = uLetter1 + uLetter2*20 + uLetter3*20*20;
            assert(uWord < TRIPLE_COUNT);

            // m_uSeqCount tracks how many sequences contain this word.
            TripleCount &tc = *(TripleCounts + uWord);
            const unsigned uOldCount = tc.m_Counts[uSeqIndex];
            if (0 == uOldCount)
                ++(tc.m_uSeqCount);

            ++(tc.m_Counts[uSeqIndex]);
        }
    }

#if TRACE
    {
        Log("TripleCounts\n");
        unsigned uGrandTotal = 0;
        for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord)
        {
            const TripleCount &tc = *(TripleCounts + uWord);
            if (0 == tc.m_uSeqCount)
                continue;

            const unsigned uLetter3 = uWord/(20*20);
            const unsigned uLetter2 = (uWord - uLetter3*20*20)/20;
            const unsigned uLetter1 = uWord%20;
            Log("Word %6u %c%c%c   %6u",
                uWord,
                LetterToCharAmino(uLetter1),
                LetterToCharAmino(uLetter2),
                LetterToCharAmino(uLetter3),
                tc.m_uSeqCount);

            unsigned uSeqCountWithThisWord = 0;
            for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
            {
                const unsigned uCount = tc.m_Counts[uSeqIndex];
                if (uCount > 0)
                {
                    ++uSeqCountWithThisWord;
                    Log(" %u=%u", uSeqIndex, uCount);
                    uGrandTotal += uCount;
                }
            }
            if (uSeqCountWithThisWord != tc.m_uSeqCount)
                Log(" *** SQ ERROR *** %u %u", tc.m_uSeqCount, uSeqCountWithThisWord);
            Log("\n");
        }

        unsigned uTotalBySeqLength = 0;
        for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
        {
            Seq &s = *(v[uSeqIndex]);
            const unsigned uSeqLength = s.Length();
            // Sequences shorter than two letters hold no triples; skipping
            // them avoids unsigned wrap-around in the subtraction.
            if (uSeqLength >= 2)
                uTotalBySeqLength += uSeqLength - 2;
        }
        if (uGrandTotal != uTotalBySeqLength)
            Log("*** TOTALS DISAGREE *** %u %u\n", uGrandTotal, uTotalBySeqLength);
    }
#endif

    // Scratch list of the sequences that contain the current word. Held as
    // unsigned so that sequence indexes above 65535 are not truncated.
    const unsigned uSeqListBytes = uSeqCount*sizeof(unsigned);
    unsigned *SeqList = (unsigned *) malloc(uSeqListBytes);
    if (0 == SeqList)
        Quit("Not enough memory (SeqList)");

    for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord)
    {
        const TripleCount &tc = *(TripleCounts + uWord);
        if (0 == tc.m_uSeqCount)
            continue;

        unsigned uSeqCountFound = 0;
        memset(SeqList, 0, uSeqListBytes);

        for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
        {
            if (tc.m_Counts[uSeqIndex] > 0)
            {
                SeqList[uSeqCountFound] = uSeqIndex;
                ++uSeqCountFound;
                if (uSeqCountFound == tc.m_uSeqCount)
                    break;
            }
        }
        assert(uSeqCountFound == tc.m_uSeqCount);

        // Accumulate min(count1, count2) into DF for every pair of
        // sequences that share this word.
        for (unsigned uSeq1 = 0; uSeq1 < uSeqCountFound; ++uSeq1)
        {
            const unsigned uSeqIndex1 = SeqList[uSeq1];
            const unsigned uCount1 = tc.m_Counts[uSeqIndex1];
            for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
            {
                const unsigned uSeqIndex2 = SeqList[uSeq2];
                const unsigned uCount2 = tc.m_Counts[uSeqIndex2];
                const unsigned uMinCount = uCount1 < uCount2 ? uCount1 : uCount2;
                const double d = DF.GetDist(uSeqIndex1, uSeqIndex2);
                DF.SetDist(uSeqIndex1, uSeqIndex2, (float) (d + uMinCount));
            }
        }
    }

    // Everything above came from malloc, so it is released with free;
    // the per-word counter arrays go before the table that owns them.
    free(SeqList);
    for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord)
        free(TripleCounts[uWord].m_Counts);
    free(TripleCounts);
    TripleCounts = 0;

    unsigned uDone = 0;
    const unsigned uTotal = (uSeqCount*(uSeqCount - 1))/2;
    for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
    {
        DF.SetDist(uSeq1, uSeq1, 0.0);

        const Seq &s1 = *(v[uSeq1]);
        const unsigned uLength1 = s1.Length();

        for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
        {
            // Report progress every 1000 pairs, counting every pair
            // visited, including those that take an early exit below.
            if (uDone%1000 == 0)
                Progress(uDone, uTotal);
            ++uDone;

            const Seq &s2 = *(v[uSeq2]);
            const unsigned uLength2 = s2.Length();
            unsigned uMinLength = uLength1 < uLength2 ? uLength1 : uLength2;
            if (uMinLength < 3)
            {
                DF.SetDist(uSeq1, uSeq2, 1.0);
                continue;
            }

            const double dTripleCount = DF.GetDist(uSeq1, uSeq2);
            if (dTripleCount == 0)
            {
                DF.SetDist(uSeq1, uSeq2, 1.0);
                continue;
            }
            double dNormalizedTripletScore = dTripleCount/(uMinLength - 2);
            //double dEstimatedPairwiseIdentity = exp(0.3912*log(dNormalizedTripletScore));
            //if (dEstimatedPairwiseIdentity > 1)
            //	dEstimatedPairwiseIdentity = 1;
//			DF.SetDist(uSeq1, uSeq2, (float) (1.0 - dEstimatedPairwiseIdentity));
            DF.SetDist(uSeq1, uSeq2, (float) dNormalizedTripletScore);

#if	TRACE
            {
                // Logs the value actually stored; the estimated pairwise
                // identity exists only in the commented-out code above.
                Log("%s - %s  Triplet count = %g  Lengths %u, %u Normalized score = %g\n",
                    s1.GetName(), s2.GetName(), dTripleCount, uLength1, uLength2,
                    dNormalizedTripletScore);
            }
#endif
        }
    }
    ProgressStepsDone();
}