// Perecent group identity of a pair of sequences. // Positions with one or both gapped are ignored. double MSA::GetPctGroupIdentityPair(unsigned uSeqIndex1, unsigned uSeqIndex2) const { const unsigned uColCount = GetColCount(); unsigned uPosCount = 0; unsigned uSameCount = 0; for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) { if (IsGap(uSeqIndex1, uColIndex)) continue; if (IsGap(uSeqIndex2, uColIndex)) continue; if (IsWildcard(uSeqIndex1, uColIndex)) continue; if (IsWildcard(uSeqIndex2, uColIndex)) continue; const unsigned uLetter1 = GetLetter(uSeqIndex1, uColIndex); const unsigned uLetter2 = GetLetter(uSeqIndex2, uColIndex); const unsigned uGroup1 = ResidueGroup[uLetter1]; const unsigned uGroup2 = ResidueGroup[uLetter2]; if (uGroup1 == uGroup2) ++uSameCount; ++uPosCount; } if (0 == uPosCount) return 0; return (double) uSameCount / (double) uPosCount; }
double MSA::GetCons(unsigned uColIndex) const { unsigned Counts[MAX_ALPHA]; for (unsigned uLetter = 0; uLetter < g_AlphaSize.get(); ++uLetter) Counts[uLetter] = 0; unsigned uMaxCount = 0; for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) { if (IsGap(uSeqIndex, uColIndex)) continue; char c = GetChar(uSeqIndex, uColIndex); c = toupper(c); if ('X' == c || 'B' == c || 'Z' == c) continue; unsigned uLetter = GetLetter(uSeqIndex, uColIndex); unsigned uCount = Counts[uLetter] + 1; if (uCount > uMaxCount) uMaxCount = uCount; Counts[uLetter] = uCount; } // Cons is undefined for all-gap column if (0 == uMaxCount) { // assert(false); return 1; } double dCons = (double) uMaxCount / (double) GetSeqCount(); assert(dCons > 0 && dCons <= 1); return dCons; }
void MSA::GetFractionalWeightedCounts(unsigned uColIndex, bool bNormalize, FCOUNT fcCounts[], FCOUNT *ptrfcGapStart, FCOUNT *ptrfcGapEnd, FCOUNT *ptrfcGapExtend, FCOUNT *ptrfOcc, FCOUNT *ptrfcLL, FCOUNT *ptrfcLG, FCOUNT *ptrfcGL, FCOUNT *ptrfcGG) const { const unsigned uSeqCount = GetSeqCount(); const unsigned uColCount = GetColCount(); const char* seqName; memset(fcCounts, 0, g_AlphaSize*sizeof(FCOUNT)); WEIGHT wTotal = 0; FCOUNT fGap = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { const WEIGHT w = GetSeqWeight(uSeqIndex); if (IsGap(uSeqIndex, uColIndex)) { fGap += w; continue; } else if (IsWildcard(uSeqIndex, uColIndex)) { const unsigned uLetter = GetLetterEx(uSeqIndex, uColIndex); switch (g_Alpha) { case ALPHA_Amino: switch (uLetter) { case AX_B: // D or N fcCounts[AX_D] += w/2; fcCounts[AX_N] += w/2; break; case AX_Z: // E or Q fcCounts[AX_E] += w/2; fcCounts[AX_Q] += w/2; break; default: // any { const FCOUNT f = w/20; for (unsigned uLetter = 0; uLetter < 20; ++uLetter) fcCounts[uLetter] += f; break; } } break; case ALPHA_DNA: case ALPHA_RNA: switch (uLetter) { case AX_R: // G or A fcCounts[NX_G] += w/2; fcCounts[NX_A] += w/2; break; case AX_Y: // C or T/U fcCounts[NX_C] += w/2; fcCounts[NX_T] += w/2; break; default: // any const FCOUNT f = w/20; for (unsigned uLetter = 0; uLetter < 4; ++uLetter) fcCounts[uLetter] += f; break; } break; default: Quit("Alphabet %d not supported", g_Alpha); } continue; } unsigned uLetter = GetLetter(uSeqIndex, uColIndex); //BEGIN MODIFICATIONS TO MUSCLE int original=0; for(unsigned i=0; i<uColIndex; i++){ if (i >= this->GetColCount()){break;} ++original; char c = GetChar(uSeqIndex, i); if(c== '-'){ original--; } } seqName = this->GetSeqName(uSeqIndex); int compositeVectPosition; compositeVectPosition = atoi(seqName); CompositeVect CV = *CVLocation; Composite* CVL = CV[compositeVectPosition]; Composite C = *CVL; for(int j=0; j<21; j++){ fcCounts[j] = w*C[original][j]; wTotal = w*C[original][j]; } //ORIGINAL MUSLCE LINE WAS: //fcCounts[uLetter] += w; //wTotal += w; //END MODIFICATIONS TO MUSCLE } *ptrfOcc = (float) (1.0 - fGap); if (bNormalize && wTotal > 0) { if (wTotal > 1.001) Quit("wTotal=%g\n", wTotal); for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter) fcCounts[uLetter] /= wTotal; // AssertNormalized(fcCounts); } FCOUNT fcStartCount = 0; if (uColIndex == 0) { for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) if (IsGap(uSeqIndex, uColIndex)) fcStartCount += GetSeqWeight(uSeqIndex); } else { for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) if (IsGap(uSeqIndex, uColIndex) && !IsGap(uSeqIndex, uColIndex - 1)) fcStartCount += GetSeqWeight(uSeqIndex); } FCOUNT fcEndCount = 0; if (uColCount - 1 == uColIndex) { for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) if (IsGap(uSeqIndex, uColIndex)) fcEndCount += GetSeqWeight(uSeqIndex); } else { for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) if (IsGap(uSeqIndex, uColIndex) && !IsGap(uSeqIndex, uColIndex + 1)) fcEndCount += GetSeqWeight(uSeqIndex); } FCOUNT LL = 0; FCOUNT LG = 0; FCOUNT GL = 0; FCOUNT GG = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { WEIGHT w = GetSeqWeight(uSeqIndex); bool bLetterHere = !IsGap(uSeqIndex, uColIndex); bool bLetterPrev = (uColIndex == 0 || !IsGap(uSeqIndex, uColIndex - 1)); if (bLetterHere) { if (bLetterPrev) LL += w; else GL += w; } else { if (bLetterPrev) LG += w; else GG += w; } } FCOUNT fcExtendCount = 0; if (uColIndex > 0 && uColIndex < GetColCount() - 1) for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) if (IsGap(uSeqIndex, uColIndex) && IsGap(uSeqIndex, uColIndex - 1) && IsGap(uSeqIndex, uColIndex + 1)) fcExtendCount += GetSeqWeight(uSeqIndex); *ptrfcLL = LL; *ptrfcLG = LG; *ptrfcGL = GL; *ptrfcGG = GG; *ptrfcGapStart = fcStartCount; *ptrfcGapEnd = fcEndCount; *ptrfcGapExtend = fcExtendCount; }
void MSA::GetFractionalWeightedCounts(unsigned uColIndex, bool bNormalize, FCOUNT fcCounts[], FCOUNT *ptrfcGapStart, FCOUNT *ptrfcGapEnd, FCOUNT *ptrfcGapExtend, FCOUNT *ptrfOcc, FCOUNT *ptrfcLL, FCOUNT *ptrfcLG, FCOUNT *ptrfcGL, FCOUNT *ptrfcGG) const { const unsigned uSeqCount = GetSeqCount(); const unsigned uColCount = GetColCount(); memset(fcCounts, 0, g_AlphaSize*sizeof(FCOUNT)); WEIGHT wTotal = 0; FCOUNT fGap = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { const WEIGHT w = GetSeqWeight(uSeqIndex); if (IsGap(uSeqIndex, uColIndex)) { fGap += w; continue; } else if (IsWildcard(uSeqIndex, uColIndex)) { const unsigned uLetter = GetLetterEx(uSeqIndex, uColIndex); switch (g_Alpha) { case ALPHA_Amino: switch (uLetter) { case AX_B: // D or N fcCounts[AX_D] += w/2; fcCounts[AX_N] += w/2; break; case AX_Z: // E or Q fcCounts[AX_E] += w/2; fcCounts[AX_Q] += w/2; break; default: // any { const FCOUNT f = w/20; for (unsigned uLetter = 0; uLetter < 20; ++uLetter) fcCounts[uLetter] += f; break; } } break; case ALPHA_DNA: case ALPHA_RNA: switch (uLetter) { case AX_R: // G or A fcCounts[NX_G] += w/2; fcCounts[NX_A] += w/2; break; case AX_Y: // C or T/U fcCounts[NX_C] += w/2; fcCounts[NX_T] += w/2; break; default: // any const FCOUNT f = w/20; for (unsigned uLetter = 0; uLetter < 4; ++uLetter) fcCounts[uLetter] += f; break; } break; default: Quit("Alphabet %d not supported", g_Alpha); } continue; } unsigned uLetter = GetLetter(uSeqIndex, uColIndex); fcCounts[uLetter] += w; wTotal += w; } *ptrfOcc = (float) (1.0 - fGap); if (bNormalize && wTotal > 0) { if (wTotal > 1.001) Quit("wTotal=%g\n", wTotal); for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter) fcCounts[uLetter] /= wTotal; // AssertNormalized(fcCounts); } FCOUNT fcStartCount = 0; if (uColIndex == 0) { for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) if (IsGap(uSeqIndex, uColIndex)) fcStartCount += GetSeqWeight(uSeqIndex); } else { for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) if (IsGap(uSeqIndex, uColIndex) && !IsGap(uSeqIndex, uColIndex - 1)) fcStartCount += GetSeqWeight(uSeqIndex); } FCOUNT fcEndCount = 0; if (uColCount - 1 == uColIndex) { for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) if (IsGap(uSeqIndex, uColIndex)) fcEndCount += GetSeqWeight(uSeqIndex); } else { for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) if (IsGap(uSeqIndex, uColIndex) && !IsGap(uSeqIndex, uColIndex + 1)) fcEndCount += GetSeqWeight(uSeqIndex); } FCOUNT LL = 0; FCOUNT LG = 0; FCOUNT GL = 0; FCOUNT GG = 0; for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) { WEIGHT w = GetSeqWeight(uSeqIndex); bool bLetterHere = !IsGap(uSeqIndex, uColIndex); bool bLetterPrev = (uColIndex == 0 || !IsGap(uSeqIndex, uColIndex - 1)); if (bLetterHere) { if (bLetterPrev) LL += w; else GL += w; } else { if (bLetterPrev) LG += w; else GG += w; } } FCOUNT fcExtendCount = 0; if (uColIndex > 0 && uColIndex < GetColCount() - 1) for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) if (IsGap(uSeqIndex, uColIndex) && IsGap(uSeqIndex, uColIndex - 1) && IsGap(uSeqIndex, uColIndex + 1)) fcExtendCount += GetSeqWeight(uSeqIndex); *ptrfcLL = LL; *ptrfcLG = LG; *ptrfcGL = GL; *ptrfcGG = GG; *ptrfcGapStart = fcStartCount; *ptrfcGapEnd = fcEndCount; *ptrfcGapExtend = fcExtendCount; }