void DistUnaligned(const SeqVect &v, DISTANCE DistMethod, DistFunc &DF) { const unsigned uSeqCount = v.Length(); switch (DistMethod) { case DISTANCE_Kmer6_6: DistKmer6_6(v, DF); break; case DISTANCE_Kmer20_3: DistKmer20_3(v, DF); break; case DISTANCE_Kmer20_4: FastDistKmer(v, DF); break; case DISTANCE_Kbit20_3: DistKbit20_3(v, DF); break; case DISTANCE_Kmer4_6: DistKmer4_6(v, DF); break; case DISTANCE_PWKimura: DistPWKimura(v, DF); break; case DISTANCE_PWScoreDist: DistPWScoreDist(v, DF); break; default: Quit("DistUnaligned, unsupported distance method %d", DistMethod); } // const char **SeqNames = (const char **) malloc(uSeqCount*sizeof(char *)); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { const Seq &s = *(v[uSeqIndex]); const char *ptrName = s.GetName(); unsigned uId = s.GetId(); DF.SetName(uSeqIndex, ptrName); DF.SetId(uSeqIndex, uId); } }
// Return value is the group count, i.e. the effective number // of distinctly different sequences. unsigned MSA::CalcBLOSUMWeights(ClusterTree &BlosumCluster) const { // Build distance matrix DistFunc DF; unsigned uSeqCount = GetSeqCount(); DF.SetCount(uSeqCount); for (unsigned i = 0; i < uSeqCount; ++i) for (unsigned j = i+1; j < uSeqCount; ++j) { double dDist = GetPctIdentityPair(i, j); assert(dDist >= 0.0 && dDist <= 1.0); DF.SetDist(i, j, (float) (1.0 - dDist)); } // Cluster based on the distance function BlosumCluster.Create(DF); // Return value is HMMer's "effective sequence count". return SetBLOSUMNodeWeight(BlosumCluster.GetRoot(), 1.0 - BLOSUM_DIST); }
void DistPWScoreDist(const SeqVect &v, DistFunc &DF) { SEQWEIGHT SeqWeightSave = GetSeqWeightMethod(); SetSeqWeightMethod(SEQWEIGHT_Henikoff); const unsigned uSeqCount = v.Length(); DF.SetCount(uSeqCount); const unsigned uPairCount = (uSeqCount*(uSeqCount + 1))/2; unsigned uCount = 0; SetProgressDesc("PW ScoreDist"); for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1) { const Seq &s1 = v.GetSeq(uSeqIndex1); MSA msa1; msa1.FromSeq(s1); for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqIndex1; ++uSeqIndex2) { if (0 == uCount%20) Progress(uCount, uPairCount); ++uCount; const Seq &s2 = v.GetSeq(uSeqIndex2); MSA msa2; msa2.FromSeq(s2); PWPath Path; MSA msaOut; AlignTwoMSAs(msa1, msa2, msaOut, Path, false, false); float d = (float) GetScoreDist(msaOut, 0, 1); DF.SetDist(uSeqIndex1, uSeqIndex2, d); } } ProgressStepsDone(); SetSeqWeightMethod(SeqWeightSave); }
void DistKmer6_6(const SeqVect &v, DistFunc &DF) { const unsigned uSeqCount = v.Length(); DF.SetCount(uSeqCount); if (0 == uSeqCount) return; // Initialize distance matrix to zero for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) { DF.SetDist(uSeq1, uSeq1, 0); for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2) DF.SetDist(uSeq1, uSeq2, 0); } // Convert to letters unsigned **Letters = new unsigned *[uSeqCount]; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { Seq &s = *(v[uSeqIndex]); const unsigned uSeqLength = s.Length(); unsigned *L = new unsigned[uSeqLength]; Letters[uSeqIndex] = L; for (unsigned n = 0; n < uSeqLength; ++n) { char c = s[n]; L[n] = CharToLetterEx(c); assert(L[n] < uResidueGroupCount); } } unsigned **uCommonTupleCount = new unsigned *[uSeqCount]; for (unsigned n = 0; n < uSeqCount; ++n) { uCommonTupleCount[n] = new unsigned[uSeqCount]; memset(uCommonTupleCount[n], 0, uSeqCount*sizeof(unsigned)); } const unsigned uPairCount = (uSeqCount*(uSeqCount + 1))/2; unsigned uCount = 0; for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) { Seq &seq1 = *(v[uSeq1]); const unsigned uSeqLength1 = seq1.Length(); if (uSeqLength1 < 5) continue; const unsigned uTupleCount = uSeqLength1 - 5; const unsigned *L = Letters[uSeq1]; CountTuples(L, uTupleCount, Count1); #if TRACE { Log("Seq1=%d\n", uSeq1); Log("Groups:\n"); for (unsigned n = 0; n < uSeqLength1; ++n) Log("%u", ResidueGroup[L[n]]); Log("\n"); Log("Tuples:\n"); ListCount(Count1); } #endif SetProgressDesc("K-mer dist pass 1"); for (unsigned uSeq2 = 0; uSeq2 <= uSeq1; ++uSeq2) { if (0 == uCount%500) Progress(uCount, uPairCount); ++uCount; Seq &seq2 = *(v[uSeq2]); const unsigned uSeqLength2 = seq2.Length(); if (uSeqLength2 < 5) { if (uSeq1 == uSeq2) DF.SetDist(uSeq1, uSeq2, 0); else DF.SetDist(uSeq1, uSeq2, 1); continue; } // First pass through seq 2 to count tuples const unsigned uTupleCount = uSeqLength2 - 5; const unsigned *L = Letters[uSeq2]; CountTuples(L, uTupleCount, Count2); #if TRACE Log("Seq2=%d Counts=\n", uSeq2); ListCount(Count2); #endif // Second pass to accumulate sum of shared tuples // MAFFT defines this as the sum over unique tuples // in seq2 of the minimum of the number of tuples found // in the two sequences. unsigned uSum = 0; for (unsigned n = 0; n < uTupleCount; ++n) { const unsigned uTuple = GetTuple(L, n); uSum += MIN(Count1[uTuple], Count2[uTuple]); // This is a hack to make sure each unique tuple counted only once. Count2[uTuple] = 0; } #if TRACE { Seq &s1 = *(v[uSeq1]); Seq &s2 = *(v[uSeq2]); const char *pName1 = s1.GetName(); const char *pName2 = s2.GetName(); Log("Common count %s(%d) - %s(%d) =%u\n", pName1, uSeq1, pName2, uSeq2, uSum); } #endif uCommonTupleCount[uSeq1][uSeq2] = uSum; uCommonTupleCount[uSeq2][uSeq1] = uSum; } } ProgressStepsDone(); uCount = 0; SetProgressDesc("K-mer dist pass 2"); for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) { Seq &s1 = *(v[uSeq1]); const char *pName1 = s1.GetName(); double dCommonTupleCount11 = uCommonTupleCount[uSeq1][uSeq1]; if (0 == dCommonTupleCount11) dCommonTupleCount11 = 1; DF.SetDist(uSeq1, uSeq1, 0); for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2) { if (0 == uCount%500) Progress(uCount, uPairCount); ++uCount; double dCommonTupleCount22 = uCommonTupleCount[uSeq2][uSeq2]; if (0 == dCommonTupleCount22) dCommonTupleCount22 = 1; const double dDist1 = 3.0*(dCommonTupleCount11 - uCommonTupleCount[uSeq1][uSeq2]) /dCommonTupleCount11; const double dDist2 = 3.0*(dCommonTupleCount22 - uCommonTupleCount[uSeq1][uSeq2]) /dCommonTupleCount22; // dMinDist is the value used for tree-building in MAFFT const double dMinDist = MIN(dDist1, dDist2); DF.SetDist(uSeq1, uSeq2, (float) dMinDist); //const double dEstimatedPctId = TupleDistToEstimatedPctId(dMinDist); //g_dfPwId.SetDist(uSeq1, uSeq2, dEstimatedPctId); // **** TODO **** why does this make score slightly worse?? //const double dKimuraDist = KimuraDist(dEstimatedPctId); //DF.SetDist(uSeq1, uSeq2, dKimuraDist); } } ProgressStepsDone(); for (unsigned n = 0; n < uSeqCount; ++n) delete[] uCommonTupleCount[n]; delete[] uCommonTupleCount; delete[] Letters; }
// WARNING: Sequences MUST be stripped of gaps and upper case! void DistKmer20_3(const SeqVect &v, DistFunc &DF) { const unsigned uSeqCount = v.Length(); DF.SetCount(uSeqCount); if (0 == uSeqCount) return; for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) { DF.SetDist(uSeq1, uSeq1, 0); for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2) DF.SetDist(uSeq1, uSeq2, 0); } const unsigned uTripleArrayBytes = TRIPLE_COUNT*sizeof(TripleCount); TripleCounts = (TripleCount *) malloc(uTripleArrayBytes); if (0 == TripleCounts) Quit("Not enough memory (TripleCounts)"); memset(TripleCounts, 0, uTripleArrayBytes); for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord) { TripleCount &tc = *(TripleCounts + uWord); const unsigned uBytes = uSeqCount*sizeof(short); tc.m_Counts = (unsigned short *) malloc(uBytes); memset(tc.m_Counts, 0, uBytes); } for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { Seq &s = *(v[uSeqIndex]); const unsigned uSeqLength = s.Length(); for (unsigned uPos = 0; uPos < uSeqLength - 2; ++uPos) { const unsigned uLetter1 = CharToLetterEx(s[uPos]); if (uLetter1 >= 20) continue; const unsigned uLetter2 = CharToLetterEx(s[uPos+1]); if (uLetter2 >= 20) continue; const unsigned uLetter3 = CharToLetterEx(s[uPos+2]); if (uLetter3 >= 20) continue; const unsigned uWord = uLetter1 + uLetter2*20 + uLetter3*20*20; assert(uWord < TRIPLE_COUNT); TripleCount &tc = *(TripleCounts + uWord); const unsigned uOldCount = tc.m_Counts[uSeqIndex]; if (0 == uOldCount) ++(tc.m_uSeqCount); ++(tc.m_Counts[uSeqIndex]); } } #if TRACE { Log("TripleCounts\n"); unsigned uGrandTotal = 0; for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord) { const TripleCount &tc = *(TripleCounts + uWord); if (0 == tc.m_uSeqCount) continue; const unsigned uLetter3 = uWord/(20*20); const unsigned uLetter2 = (uWord - uLetter3*20*20)/20; const unsigned uLetter1 = uWord%20; Log("Word %6u %c%c%c %6u", uWord, LetterToCharAmino(uLetter1), LetterToCharAmino(uLetter2), LetterToCharAmino(uLetter3), tc.m_uSeqCount); unsigned uSeqCountWithThisWord = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { const unsigned uCount = tc.m_Counts[uSeqIndex]; if (uCount > 0) { ++uSeqCountWithThisWord; Log(" %u=%u", uSeqIndex, uCount); uGrandTotal += uCount; } } if (uSeqCountWithThisWord != tc.m_uSeqCount) Log(" *** SQ ERROR *** %u %u", tc.m_uSeqCount, uSeqCountWithThisWord); Log("\n"); } unsigned uTotalBySeqLength = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { Seq &s = *(v[uSeqIndex]); const unsigned uSeqLength = s.Length(); uTotalBySeqLength += uSeqLength - 2; } if (uGrandTotal != uTotalBySeqLength) Log("*** TOTALS DISAGREE *** %u %u\n", uGrandTotal, uTotalBySeqLength); } #endif const unsigned uSeqListBytes = uSeqCount*sizeof(unsigned); unsigned short *SeqList = (unsigned short *) malloc(uSeqListBytes); for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord) { const TripleCount &tc = *(TripleCounts + uWord); if (0 == tc.m_uSeqCount) continue; unsigned uSeqCountFound = 0; memset(SeqList, 0, uSeqListBytes); for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { if (tc.m_Counts[uSeqIndex] > 0) { SeqList[uSeqCountFound] = uSeqIndex; ++uSeqCountFound; if (uSeqCountFound == tc.m_uSeqCount) break; } } assert(uSeqCountFound == tc.m_uSeqCount); for (unsigned uSeq1 = 0; uSeq1 < uSeqCountFound; ++uSeq1) { const unsigned uSeqIndex1 = SeqList[uSeq1]; const unsigned uCount1 = tc.m_Counts[uSeqIndex1]; for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2) { const unsigned uSeqIndex2 = SeqList[uSeq2]; const unsigned uCount2 = tc.m_Counts[uSeqIndex2]; const unsigned uMinCount = uCount1 < uCount2 ? uCount1 : uCount2; const double d = DF.GetDist(uSeqIndex1, uSeqIndex2); DF.SetDist(uSeqIndex1, uSeqIndex2, (float) (d + uMinCount)); } } } delete[] SeqList; free(TripleCounts); unsigned uDone = 0; const unsigned uTotal = (uSeqCount*(uSeqCount - 1))/2; for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) { DF.SetDist(uSeq1, uSeq1, 0.0); const Seq &s1 = *(v[uSeq1]); const unsigned uLength1 = s1.Length(); for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2) { const Seq &s2 = *(v[uSeq2]); const unsigned uLength2 = s2.Length(); unsigned uMinLength = uLength1 < uLength2 ? uLength1 : uLength2; if (uMinLength < 3) { DF.SetDist(uSeq1, uSeq2, 1.0); continue; } const double dTripleCount = DF.GetDist(uSeq1, uSeq2); if (dTripleCount == 0) { DF.SetDist(uSeq1, uSeq2, 1.0); continue; } double dNormalizedTripletScore = dTripleCount/(uMinLength - 2); //double dEstimatedPairwiseIdentity = exp(0.3912*log(dNormalizedTripletScore)); //if (dEstimatedPairwiseIdentity > 1) // dEstimatedPairwiseIdentity = 1; // DF.SetDist(uSeq1, uSeq2, (float) (1.0 - dEstimatedPairwiseIdentity)); DF.SetDist(uSeq1, uSeq2, (float) dNormalizedTripletScore); #if TRACE { Log("%s - %s Triplet count = %g Lengths %u, %u Estimated pwid = %g\n", s1.GetName(), s2.GetName(), dTripleCount, uLength1, uLength2, dEstimatedPairwiseIdentity); } #endif if (uDone%1000 == 0) Progress(uDone, uTotal); } } ProgressStepsDone(); }